From: Linus Torvalds Date: Sun, 11 Nov 2018 22:53:02 +0000 (-0600) Subject: Merge tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git... X-Git-Tag: v4.20-rc2~4 X-Git-Url: http://git.samba.org/samba.git/?p=sfrench%2Fcifs-2.6.git;a=commitdiff_plain;h=c140f8b072d16595c83d4d16a05693e72d9b1973;hp=eb6984fa4ce2837dcb1f66720a600f31b0bb3739 Merge tag 'ext4_for_linus_stable' of git://git./linux/kernel/git/tytso/ext4 Pull ext4 fixes from Ted Ts'o: "A large number of ext4 bug fixes, mostly buffer and memory leaks on error return cleanup paths" * tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: ext4: missing !bh check in ext4_xattr_inode_write() ext4: fix buffer leak in __ext4_read_dirblock() on error path ext4: fix buffer leak in ext4_expand_extra_isize_ea() on error path ext4: fix buffer leak in ext4_xattr_move_to_block() on error path ext4: release bs.bh before re-using in ext4_xattr_block_find() ext4: fix buffer leak in ext4_xattr_get_block() on error path ext4: fix possible leak of s_journal_flag_rwsem in error path ext4: fix possible leak of sbi->s_group_desc_leak in error path ext4: remove unneeded brelse call in ext4_xattr_inode_update_ref() ext4: avoid possible double brelse() in add_new_gdb() on error path ext4: avoid buffer leak in ext4_orphan_add() after prior errors ext4: avoid buffer leak on shutdown in ext4_mark_iloc_dirty() ext4: fix possible inode leak in the retry loop of ext4_resize_fs() ext4: fix missing cleanup if ext4_alloc_flex_bg_array() fails while resizing ext4: add missing brelse() update_backups()'s error path ext4: add missing brelse() add_new_gdb_meta_bg()'s error path ext4: add missing brelse() in set_flexbg_block_bitmap()'s error path ext4: avoid potential extra brelse in setup_new_flex_group_blocks() --- diff --git a/.clang-format b/.clang-format index 1d5da22e0ba5..e6080f5834a3 100644 --- a/.clang-format +++ b/.clang-format @@ -323,7 +323,6 @@ ForEachMacros: - 'protocol_for_each_card' - 'protocol_for_each_dev' - 'queue_for_each_hw_ctx' - - 'radix_tree_for_each_contig' - 'radix_tree_for_each_slot' - 'radix_tree_for_each_tagged' - 'rbtree_postorder_for_each_entry_safe' diff --git a/.mailmap b/.mailmap index 285e09645b31..28fecafa6506 100644 --- a/.mailmap +++ b/.mailmap @@ -119,6 +119,13 @@ Mark Brown Mark Yao Martin Kepplinger Martin Kepplinger +Matthew Wilcox +Matthew Wilcox +Matthew Wilcox +Matthew Wilcox +Matthew Wilcox +Matthew Wilcox +Matthew Wilcox Matthieu CASTET Mauro Carvalho Chehab Mauro Carvalho Chehab @@ -152,7 +159,13 @@ Peter Oruba Peter Oruba Pratyush Anand Praveen BP +Punit Agrawal Qais Yousef +Oleksij Rempel +Oleksij Rempel +Oleksij Rempel +Oleksij Rempel +Oleksij Rempel Rajesh Shah Ralf Baechle Ralf Wildenhues diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX deleted file mode 100644 index 2754fe83f0d4..000000000000 --- a/Documentation/00-INDEX +++ /dev/null @@ -1,428 +0,0 @@ - -This is a brief list of all the files in ./linux/Documentation and what -they contain. If you add a documentation file, please list it here in -alphabetical order as well, or risk being hunted down like a rabid dog. -Please keep the descriptions small enough to fit on one line. - Thanks -- Paul G. - -Following translations are available on the WWW: - - - Japanese, maintained by the JF Project (jf@listserv.linux.or.jp), at - http://linuxjf.sourceforge.jp/ - -00-INDEX - - this file. -ABI/ - - info on kernel <-> userspace ABI and relative interface stability. -CodingStyle - - nothing here, just a pointer to process/coding-style.rst. -DMA-API.txt - - DMA API, pci_ API & extensions for non-consistent memory machines. -DMA-API-HOWTO.txt - - Dynamic DMA mapping Guide -DMA-ISA-LPC.txt - - How to do DMA with ISA (and LPC) devices. -DMA-attributes.txt - - listing of the various possible attributes a DMA region can have -EDID/ - - directory with info on customizing EDID for broken gfx/displays. -IPMI.txt - - info on Linux Intelligent Platform Management Interface (IPMI) Driver. -IRQ-affinity.txt - - how to select which CPU(s) handle which interrupt events on SMP. -IRQ-domain.txt - - info on interrupt numbering and setting up IRQ domains. -IRQ.txt - - description of what an IRQ is. -Intel-IOMMU.txt - - basic info on the Intel IOMMU virtualization support. -Makefile - - It's not of interest for those who aren't touching the build system. -PCI/ - - info related to PCI drivers. -RCU/ - - directory with info on RCU (read-copy update). -SAK.txt - - info on Secure Attention Keys. -SM501.txt - - Silicon Motion SM501 multimedia companion chip -SubmittingPatches - - nothing here, just a pointer to process/coding-style.rst. -accounting/ - - documentation on accounting and taskstats. -acpi/ - - info on ACPI-specific hooks in the kernel. -admin-guide/ - - info related to Linux users and system admins. -aoe/ - - description of AoE (ATA over Ethernet) along with config examples. -arm/ - - directory with info about Linux on the ARM architecture. -arm64/ - - directory with info about Linux on the 64 bit ARM architecture. -auxdisplay/ - - misc. LCD driver documentation (cfag12864b, ks0108). -backlight/ - - directory with info on controlling backlights in flat panel displays -block/ - - info on the Block I/O (BIO) layer. -blockdev/ - - info on block devices & drivers -bt8xxgpio.txt - - info on how to modify a bt8xx video card for GPIO usage. -btmrvl.txt - - info on Marvell Bluetooth driver usage. -bus-devices/ - - directory with info on TI GPMC (General Purpose Memory Controller) -bus-virt-phys-mapping.txt - - how to access I/O mapped memory from within device drivers. -cdrom/ - - directory with information on the CD-ROM drivers that Linux has. -cgroup-v1/ - - cgroups v1 features, including cpusets and memory controller. -cma/ - - Continuous Memory Area (CMA) debugfs interface. -conf.py - - It's not of interest for those who aren't touching the build system. -connector/ - - docs on the netlink based userspace<->kernel space communication mod. -console/ - - documentation on Linux console drivers. -core-api/ - - documentation on kernel core components. -cpu-freq/ - - info on CPU frequency and voltage scaling. -cpu-hotplug.txt - - document describing CPU hotplug support in the Linux kernel. -cpu-load.txt - - document describing how CPU load statistics are collected. -cpuidle/ - - info on CPU_IDLE, CPU idle state management subsystem. -cputopology.txt - - documentation on how CPU topology info is exported via sysfs. -crc32.txt - - brief tutorial on CRC computation -crypto/ - - directory with info on the Crypto API. -dcdbas.txt - - information on the Dell Systems Management Base Driver. -debugging-modules.txt - - some notes on debugging modules after Linux 2.6.3. -debugging-via-ohci1394.txt - - how to use firewire like a hardware debugger memory reader. -dell_rbu.txt - - document demonstrating the use of the Dell Remote BIOS Update driver. -dev-tools/ - - directory with info on development tools for the kernel. -device-mapper/ - - directory with info on Device Mapper. -dmaengine/ - - the DMA engine and controller API guides. -devicetree/ - - directory with info on device tree files used by OF/PowerPC/ARM -digsig.txt - -info on the Digital Signature Verification API -dma-buf-sharing.txt - - the DMA Buffer Sharing API Guide -docutils.conf - - nothing here. Just a configuration file for docutils. -dontdiff - - file containing a list of files that should never be diff'ed. -driver-api/ - - the Linux driver implementer's API guide. -driver-model/ - - directory with info about Linux driver model. -early-userspace/ - - info about initramfs, klibc, and userspace early during boot. -efi-stub.txt - - How to use the EFI boot stub to bypass GRUB or elilo on EFI systems. -eisa.txt - - info on EISA bus support. -extcon/ - - directory with porting guide for Android kernel switch driver. -isa.txt - - info on EISA bus support. -fault-injection/ - - dir with docs about the fault injection capabilities infrastructure. -fb/ - - directory with info on the frame buffer graphics abstraction layer. -features/ - - status of feature implementation on different architectures. -filesystems/ - - info on the vfs and the various filesystems that Linux supports. -firmware_class/ - - request_firmware() hotplug interface info. -flexible-arrays.txt - - how to make use of flexible sized arrays in linux -fmc/ - - information about the FMC bus abstraction -fpga/ - - FPGA Manager Core. -futex-requeue-pi.txt - - info on requeueing of tasks from a non-PI futex to a PI futex -gcc-plugins.txt - - GCC plugin infrastructure. -gpio/ - - gpio related documentation -gpu/ - - directory with information on GPU driver developer's guide. -hid/ - - directory with information on human interface devices -highuid.txt - - notes on the change from 16 bit to 32 bit user/group IDs. -hwspinlock.txt - - hardware spinlock provides hardware assistance for synchronization -timers/ - - info on the timer related topics -hw_random.txt - - info on Linux support for random number generator in i8xx chipsets. -hwmon/ - - directory with docs on various hardware monitoring drivers. -i2c/ - - directory with info about the I2C bus/protocol (2 wire, kHz speed). -x86/i386/ - - directory with info about Linux on Intel 32 bit architecture. -ia64/ - - directory with info about Linux on Intel 64 bit architecture. -ide/ - - Information regarding the Enhanced IDE drive. -iio/ - - info on industrial IIO configfs support. -index.rst - - main index for the documentation at ReST format. -infiniband/ - - directory with documents concerning Linux InfiniBand support. -input/ - - info on Linux input device support. -intel_txt.txt - - info on intel Trusted Execution Technology (intel TXT). -io-mapping.txt - - description of io_mapping functions in linux/io-mapping.h -io_ordering.txt - - info on ordering I/O writes to memory-mapped addresses. -ioctl/ - - directory with documents describing various IOCTL calls. -iostats.txt - - info on I/O statistics Linux kernel provides. -irqflags-tracing.txt - - how to use the irq-flags tracing feature. -isapnp.txt - - info on Linux ISA Plug & Play support. -isdn/ - - directory with info on the Linux ISDN support, and supported cards. -kbuild/ - - directory with info about the kernel build process. -kdump/ - - directory with mini HowTo on getting the crash dump code to work. -doc-guide/ - - how to write and format reStructuredText kernel documentation -kernel-per-CPU-kthreads.txt - - List of all per-CPU kthreads and how they introduce jitter. -kobject.txt - - info of the kobject infrastructure of the Linux kernel. -kprobes.txt - - documents the kernel probes debugging feature. -kref.txt - - docs on adding reference counters (krefs) to kernel objects. -laptops/ - - directory with laptop related info and laptop driver documentation. -ldm.txt - - a brief description of LDM (Windows Dynamic Disks). -leds/ - - directory with info about LED handling under Linux. -livepatch/ - - info on kernel live patching. -locking/ - - directory with info about kernel locking primitives -lockup-watchdogs.txt - - info on soft and hard lockup detectors (aka nmi_watchdog). -logo.gif - - full colour GIF image of Linux logo (penguin - Tux). -logo.txt - - info on creator of above logo & site to get additional images from. -lsm.txt - - Linux Security Modules: General Security Hooks for Linux -lzo.txt - - kernel LZO decompressor input formats -m68k/ - - directory with info about Linux on Motorola 68k architecture. -mailbox.txt - - How to write drivers for the common mailbox framework (IPC). -md/ - - directory with info about Linux Software RAID -media/ - - info on media drivers: uAPI, kAPI and driver documentation. -memory-barriers.txt - - info on Linux kernel memory barriers. -memory-devices/ - - directory with info on parts like the Texas Instruments EMIF driver -memory-hotplug.txt - - Hotpluggable memory support, how to use and current status. -men-chameleon-bus.txt - - info on MEN chameleon bus. -mic/ - - Intel Many Integrated Core (MIC) architecture device driver. -mips/ - - directory with info about Linux on MIPS architecture. -misc-devices/ - - directory with info about devices using the misc dev subsystem -mmc/ - - directory with info about the MMC subsystem -mtd/ - - directory with info about memory technology devices (flash) -namespaces/ - - directory with various information about namespaces -netlabel/ - - directory with information on the NetLabel subsystem. -networking/ - - directory with info on various aspects of networking with Linux. -nfc/ - - directory relating info about Near Field Communications support. -nios2/ - - Linux on the Nios II architecture. -nommu-mmap.txt - - documentation about no-mmu memory mapping support. -numastat.txt - - info on how to read Numa policy hit/miss statistics in sysfs. -ntb.txt - - info on Non-Transparent Bridge (NTB) drivers. -nvdimm/ - - info on non-volatile devices. -nvmem/ - - info on non volatile memory framework. -output/ - - default directory where html/LaTeX/pdf files will be written. -padata.txt - - An introduction to the "padata" parallel execution API -parisc/ - - directory with info on using Linux on PA-RISC architecture. -parport-lowlevel.txt - - description and usage of the low level parallel port functions. -pcmcia/ - - info on the Linux PCMCIA driver. -percpu-rw-semaphore.txt - - RCU based read-write semaphore optimized for locking for reading -perf/ - - info about the APM X-Gene SoC Performance Monitoring Unit (PMU). -phy/ - - ino on Samsung USB 2.0 PHY adaptation layer. -phy.txt - - Description of the generic PHY framework. -pi-futex.txt - - documentation on lightweight priority inheritance futexes. -pinctrl.txt - - info on pinctrl subsystem and the PINMUX/PINCONF and drivers -platform/ - - List of supported hardware by compal and Dell laptop. -pnp.txt - - Linux Plug and Play documentation. -power/ - - directory with info on Linux PCI power management. -powerpc/ - - directory with info on using Linux with the PowerPC. -prctl/ - - directory with info on the priveledge control subsystem -preempt-locking.txt - - info on locking under a preemptive kernel. -process/ - - how to work with the mainline kernel development process. -pps/ - - directory with information on the pulse-per-second support -pti/ - - directory with info on Intel MID PTI. -ptp/ - - directory with info on support for IEEE 1588 PTP clocks in Linux. -pwm.txt - - info on the pulse width modulation driver subsystem -rapidio/ - - directory with info on RapidIO packet-based fabric interconnect -rbtree.txt - - info on what red-black trees are and what they are for. -remoteproc.txt - - info on how to handle remote processor (e.g. AMP) offloads/usage. -rfkill.txt - - info on the radio frequency kill switch subsystem/support. -robust-futex-ABI.txt - - documentation of the robust futex ABI. -robust-futexes.txt - - a description of what robust futexes are. -rpmsg.txt - - info on the Remote Processor Messaging (rpmsg) Framework -rtc.txt - - notes on how to use the Real Time Clock (aka CMOS clock) driver. -s390/ - - directory with info on using Linux on the IBM S390. -scheduler/ - - directory with info on the scheduler. -scsi/ - - directory with info on Linux scsi support. -security/ - - directory that contains security-related info -serial/ - - directory with info on the low level serial API. -sgi-ioc4.txt - - description of the SGI IOC4 PCI (multi function) device. -sh/ - - directory with info on porting Linux to a new architecture. -smsc_ece1099.txt - -info on the smsc Keyboard Scan Expansion/GPIO Expansion device. -sound/ - - directory with info on sound card support. -spi/ - - overview of Linux kernel Serial Peripheral Interface (SPI) support. -sphinx/ - - no documentation here, just files required by Sphinx toolchain. -sphinx-static/ - - no documentation here, just files required by Sphinx toolchain. -static-keys.txt - - info on how static keys allow debug code in hotpaths via patching -svga.txt - - short guide on selecting video modes at boot via VGA BIOS. -sync_file.txt - - Sync file API guide. -sysctl/ - - directory with info on the /proc/sys/* files. -target/ - - directory with info on generating TCM v4 fabric .ko modules -tee.txt - - info on the TEE subsystem and drivers -this_cpu_ops.txt - - List rationale behind and the way to use this_cpu operations. -thermal/ - - directory with information on managing thermal issues (CPU/temp) -trace/ - - directory with info on tracing technologies within linux -translations/ - - translations of this document from English to another language -unaligned-memory-access.txt - - info on how to avoid arch breaking unaligned memory access in code. -unshare.txt - - description of the Linux unshare system call. -usb/ - - directory with info regarding the Universal Serial Bus. -vfio.txt - - info on Virtual Function I/O used in guest/hypervisor instances. -video-output.txt - - sysfs class driver interface to enable/disable a video output device. -virtual/ - - directory with information on the various linux virtualizations. -vm/ - - directory with info on the Linux vm code. -w1/ - - directory with documents regarding the 1-wire (w1) subsystem. -watchdog/ - - how to auto-reboot Linux if it has "fallen and can't get up". ;-) -wimax/ - - directory with info about Intel Wireless Wimax Connections -core-api/workqueue.rst - - information on the Concurrency Managed Workqueue implementation -x86/x86_64/ - - directory with info on Linux support for AMD x86-64 (Hammer) machines. -xillybus.txt - - Overview and basic ui of xillybus driver -xtensa/ - - directory with documents relating to arch/xtensa port/implementation -xz.txt - - how to make use of the XZ data compression within linux kernel -zorro.txt - - info on writing drivers for Zorro bus devices found on Amigas. diff --git a/Documentation/ABI/stable/sysfs-driver-usb-usbtmc b/Documentation/ABI/stable/sysfs-driver-usb-usbtmc index e960cd027e1e..a9e123ba32cd 100644 --- a/Documentation/ABI/stable/sysfs-driver-usb-usbtmc +++ b/Documentation/ABI/stable/sysfs-driver-usb-usbtmc @@ -25,38 +25,3 @@ Description: 4.2.2. The files are read only. - - -What: /sys/bus/usb/drivers/usbtmc/*/TermChar -Date: August 2008 -Contact: Greg Kroah-Hartman -Description: - This file is the TermChar value to be sent to the USB TMC - device as described by the document, "Universal Serial Bus Test - and Measurement Class Specification - (USBTMC) Revision 1.0" as published by the USB-IF. - - Note that the TermCharEnabled file determines if this value is - sent to the device or not. - - -What: /sys/bus/usb/drivers/usbtmc/*/TermCharEnabled -Date: August 2008 -Contact: Greg Kroah-Hartman -Description: - This file determines if the TermChar is to be sent to the - device on every transaction or not. For more details about - this, please see the document, "Universal Serial Bus Test and - Measurement Class Specification (USBTMC) Revision 1.0" as - published by the USB-IF. - - -What: /sys/bus/usb/drivers/usbtmc/*/auto_abort -Date: August 2008 -Contact: Greg Kroah-Hartman -Description: - This file determines if the transaction of the USB TMC - device is to be automatically aborted if there is any error. - For more details about this, please see the document, - "Universal Serial Bus Test and Measurement Class Specification - (USBTMC) Revision 1.0" as published by the USB-IF. diff --git a/Documentation/ABI/testing/configfs-stp-policy-p_sys-t b/Documentation/ABI/testing/configfs-stp-policy-p_sys-t new file mode 100644 index 000000000000..b290d1c00dcf --- /dev/null +++ b/Documentation/ABI/testing/configfs-stp-policy-p_sys-t @@ -0,0 +1,41 @@ +What: /config/stp-policy/:p_sys-t.//uuid +Date: June 2018 +KernelVersion: 4.19 +Description: + UUID source identifier string, RW. + Default value is randomly generated at the mkdir time. + Data coming from trace sources that use this will be + tagged with this UUID in the MIPI SyS-T packet stream, to + allow the decoder to discern between different sources + within the same master/channel range, and identify the + higher level decoders that may be needed for each source. + +What: /config/stp-policy/:p_sys-t.//do_len +Date: June 2018 +KernelVersion: 4.19 +Description: + Include payload length in the MIPI SyS-T header, boolean. + If enabled, the SyS-T protocol encoder will include payload + length in each packet's metadata. This is normally redundant + if the underlying transport protocol supports marking message + boundaries (which STP does), so this is off by default. + +What: /config/stp-policy/:p_sys-t.//ts_interval +Date: June 2018 +KernelVersion: 4.19 +Description: + Time interval in milliseconds. Include a timestamp in the + MIPI SyS-T packet metadata, if this many milliseconds have + passed since the previous packet from this source. Zero is + the default and stands for "never send the timestamp". + +What: /config/stp-policy/:p_sys-t.//clocksync_interval +Date: June 2018 +KernelVersion: 4.19 +Description: + Time interval in milliseconds. Send a CLOCKSYNC packet if + this many milliseconds have passed since the previous + CLOCKSYNC packet from this source. Zero is the default and + stands for "never send the CLOCKSYNC". It makes sense to + use this option with sources that generate constant and/or + periodic data, like stm_heartbeat. diff --git a/Documentation/ABI/testing/configfs-usb-gadget-uvc b/Documentation/ABI/testing/configfs-usb-gadget-uvc index 9281e2aa38df..809765bd9573 100644 --- a/Documentation/ABI/testing/configfs-usb-gadget-uvc +++ b/Documentation/ABI/testing/configfs-usb-gadget-uvc @@ -12,6 +12,10 @@ Date: Dec 2014 KernelVersion: 4.0 Description: Control descriptors + All attributes read only: + bInterfaceNumber - USB interface number for this + streaming interface + What: /config/usb-gadget/gadget/functions/uvc.name/control/class Date: Dec 2014 KernelVersion: 4.0 @@ -109,6 +113,10 @@ Date: Dec 2014 KernelVersion: 4.0 Description: Streaming descriptors + All attributes read only: + bInterfaceNumber - USB interface number for this + streaming interface + What: /config/usb-gadget/gadget/functions/uvc.name/streaming/class Date: Dec 2014 KernelVersion: 4.0 @@ -160,6 +168,10 @@ Description: Specific MJPEG format descriptors All attributes read only, except bmaControls and bDefaultFrameIndex: + bFormatIndex - unique id for this format descriptor; + only defined after parent header is + linked into the streaming class; + read-only bmaControls - this format's data for bmaControls in the streaming header bmInterfaceFlags - specifies interlace information, @@ -177,6 +189,10 @@ Date: Dec 2014 KernelVersion: 4.0 Description: Specific MJPEG frame descriptors + bFrameIndex - unique id for this framedescriptor; + only defined after parent format is + linked into the streaming header; + read-only dwFrameInterval - indicates how frame interval can be programmed; a number of values separated by newline can be specified @@ -204,6 +220,10 @@ Date: Dec 2014 KernelVersion: 4.0 Description: Specific uncompressed format descriptors + bFormatIndex - unique id for this format descriptor; + only defined after parent header is + linked into the streaming class; + read-only bmaControls - this format's data for bmaControls in the streaming header bmInterfaceFlags - specifies interlace information, @@ -224,6 +244,10 @@ Date: Dec 2014 KernelVersion: 4.0 Description: Specific uncompressed frame descriptors + bFrameIndex - unique id for this framedescriptor; + only defined after parent format is + linked into the streaming header; + read-only dwFrameInterval - indicates how frame interval can be programmed; a number of values separated by newline can be specified diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index a5b4f223641d..8127a08e366d 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -199,7 +199,7 @@ Description: What: /sys/bus/iio/devices/iio:deviceX/in_positionrelative_x_raw What: /sys/bus/iio/devices/iio:deviceX/in_positionrelative_y_raw -KernelVersion: 4.18 +KernelVersion: 4.19 Contact: linux-iio@vger.kernel.org Description: Relative position in direction x or y on a pad (may be diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 44d4b2be92fd..8bfee557e50e 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -323,3 +323,27 @@ Description: This is similar to /sys/bus/pci/drivers_autoprobe, but affects only the VFs associated with a specific PF. + +What: /sys/bus/pci/devices/.../p2pmem/size +Date: November 2017 +Contact: Logan Gunthorpe +Description: + If the device has any Peer-to-Peer memory registered, this + file contains the total amount of memory that the device + provides (in decimal). + +What: /sys/bus/pci/devices/.../p2pmem/available +Date: November 2017 +Contact: Logan Gunthorpe +Description: + If the device has any Peer-to-Peer memory registered, this + file contains the amount of memory that has not been + allocated (in decimal). + +What: /sys/bus/pci/devices/.../p2pmem/published +Date: November 2017 +Contact: Logan Gunthorpe +Description: + If the device has any Peer-to-Peer memory registered, this + file contains a '1' if the memory has been published for + use outside the driver that owns the device. diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb index 08d456e07b53..559baa5c418c 100644 --- a/Documentation/ABI/testing/sysfs-bus-usb +++ b/Documentation/ABI/testing/sysfs-bus-usb @@ -189,6 +189,16 @@ Description: The file will read "hotplug", "wired" and "not used" if the information is available, and "unknown" otherwise. +What: /sys/bus/usb/devices/.../(hub interface)/portX/location +Date: October 2018 +Contact: Bjørn Mork +Description: + Some platforms provide usb port physical location through + firmware. This is used by the kernel to pair up logical ports + mapping to the same physical connector. The attribute exposes the + raw location value as a hex integer. + + What: /sys/bus/usb/devices/.../(hub interface)/portX/quirks Date: May 2018 Contact: Nicolas Boichat @@ -219,7 +229,14 @@ Description: ports and report them to the kernel. This attribute is to expose the number of over-current situation occurred on a specific port to user space. This file will contain an unsigned 32 bit value - which wraps to 0 after its maximum is reached. + which wraps to 0 after its maximum is reached. This file supports + poll() for monitoring changes to this value in user space. + + Any time this value changes the corresponding hub device will send a + udev event with the following attributes: + + OVER_CURRENT_PORT=/sys/bus/usb/devices/.../(hub interface)/portX + OVER_CURRENT_COUNT=[current value of this sysfs attribute] What: /sys/bus/usb/devices/.../(hub interface)/portX/usb3_lpm_permit Date: November 2015 diff --git a/Documentation/ABI/testing/sysfs-bus-vmbus b/Documentation/ABI/testing/sysfs-bus-vmbus new file mode 100644 index 000000000000..91e6c065973c --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-vmbus @@ -0,0 +1,21 @@ +What: /sys/bus/vmbus/devices/.../driver_override +Date: August 2019 +Contact: Stephen Hemminger +Description: + This file allows the driver for a device to be specified which + will override standard static and dynamic ID matching. When + specified, only a driver with a name matching the value written + to driver_override will have an opportunity to bind to the + device. The override is specified by writing a string to the + driver_override file (echo uio_hv_generic > driver_override) and + may be cleared with an empty string (echo > driver_override). + This returns the device to standard matching rules binding. + Writing to driver_override does not automatically unbind the + device from its current driver or make any attempt to + automatically load the specified driver. If no driver with a + matching name is currently loaded in the kernel, the device + will not bind to any driver. This also allows devices to + opt-out of driver binding using a driver_override name such as + "none". Only a single driver may be specified in the override, + there is no support for parsing delimiters. + diff --git a/Documentation/ABI/testing/sysfs-class-lcd-s6e63m0 b/Documentation/ABI/testing/sysfs-class-lcd-s6e63m0 deleted file mode 100644 index ae0a2d3dcc07..000000000000 --- a/Documentation/ABI/testing/sysfs-class-lcd-s6e63m0 +++ /dev/null @@ -1,27 +0,0 @@ -sysfs interface for the S6E63M0 AMOLED LCD panel driver -------------------------------------------------------- - -What: /sys/class/lcd//gamma_mode -Date: May, 2010 -KernelVersion: v2.6.35 -Contact: dri-devel@lists.freedesktop.org -Description: - (RW) Read or write the gamma mode. Following three modes are - supported: - 0 - gamma value 2.2, - 1 - gamma value 1.9 and - 2 - gamma value 1.7. - - -What: /sys/class/lcd//gamma_table -Date: May, 2010 -KernelVersion: v2.6.35 -Contact: dri-devel@lists.freedesktop.org -Description: - (RO) Displays the size of the gamma table i.e. the number of - gamma modes available. - -This is a backlight lcd driver. These interfaces are an extension to the API -documented in Documentation/ABI/testing/sysfs-class-lcd and in -Documentation/ABI/stable/sysfs-class-backlight (under -/sys/class/backlight//). diff --git a/Documentation/ABI/testing/sysfs-class-led-driver-sc27xx b/Documentation/ABI/testing/sysfs-class-led-driver-sc27xx new file mode 100644 index 000000000000..45b1e605d355 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-led-driver-sc27xx @@ -0,0 +1,22 @@ +What: /sys/class/leds//hw_pattern +Date: September 2018 +KernelVersion: 4.20 +Description: + Specify a hardware pattern for the SC27XX LED. For the SC27XX + LED controller, it only supports 4 stages to make a single + hardware pattern, which is used to configure the rise time, + high time, fall time and low time for the breathing mode. + + For the breathing mode, the SC27XX LED only expects one brightness + for the high stage. To be compatible with the hardware pattern + format, we should set brightness as 0 for rise stage, fall + stage and low stage. + + Min stage duration: 125 ms + Max stage duration: 31875 ms + + Since the stage duration step is 125 ms, the duration should be + a multiplier of 125, like 125ms, 250ms, 375ms, 500ms ... 31875ms. + + Thus the format of the hardware pattern values should be: + "0 rise_duration brightness high_duration 0 fall_duration 0 low_duration". diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-pattern b/Documentation/ABI/testing/sysfs-class-led-trigger-pattern new file mode 100644 index 000000000000..1e5d172e0646 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-led-trigger-pattern @@ -0,0 +1,82 @@ +What: /sys/class/leds//pattern +Date: September 2018 +KernelVersion: 4.20 +Description: + Specify a software pattern for the LED, that supports altering + the brightness for the specified duration with one software + timer. It can do gradual dimming and step change of brightness. + + The pattern is given by a series of tuples, of brightness and + duration (ms). The LED is expected to traverse the series and + each brightness value for the specified duration. Duration of + 0 means brightness should immediately change to new value, and + writing malformed pattern deactivates any active one. + + 1. For gradual dimming, the dimming interval now is set as 50 + milliseconds. So the tuple with duration less than dimming + interval (50ms) is treated as a step change of brightness, + i.e. the subsequent brightness will be applied without adding + intervening dimming intervals. + + The gradual dimming format of the software pattern values should be: + "brightness_1 duration_1 brightness_2 duration_2 brightness_3 + duration_3 ...". For example: + + echo 0 1000 255 2000 > pattern + + It will make the LED go gradually from zero-intensity to max (255) + intensity in 1000 milliseconds, then back to zero intensity in 2000 + milliseconds: + + LED brightness + ^ + 255-| / \ / \ / + | / \ / \ / + | / \ / \ / + | / \ / \ / + 0-| / \/ \/ + +---0----1----2----3----4----5----6------------> time (s) + + 2. To make the LED go instantly from one brightness value to another, + we should use zero-time lengths (the brightness must be same as + the previous tuple's). So the format should be: + "brightness_1 duration_1 brightness_1 0 brightness_2 duration_2 + brightness_2 0 ...". For example: + + echo 0 1000 0 0 255 2000 255 0 > pattern + + It will make the LED stay off for one second, then stay at max brightness + for two seconds: + + LED brightness + ^ + 255-| +---------+ +---------+ + | | | | | + | | | | | + | | | | | + 0-| -----+ +----+ +---- + +---0----1----2----3----4----5----6------------> time (s) + +What: /sys/class/leds//hw_pattern +Date: September 2018 +KernelVersion: 4.20 +Description: + Specify a hardware pattern for the LED, for LED hardware that + supports autonomously controlling brightness over time, according + to some preprogrammed hardware patterns. It deactivates any active + software pattern. + + Since different LED hardware can have different semantics of + hardware patterns, each driver is expected to provide its own + description for the hardware patterns in their ABI documentation + file. + +What: /sys/class/leds//repeat +Date: September 2018 +KernelVersion: 4.20 +Description: + Specify a pattern repeat number. -1 means repeat indefinitely, + other negative numbers and number 0 are invalid. + + This file will always return the originally written repeat + number. diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index 2f1788111cd9..664a8f6a634f 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -91,6 +91,24 @@ Description: stacked (e.g: VLAN interfaces) but still have the same MAC address as their parent device. +What: /sys/class/net//dev_port +Date: February 2014 +KernelVersion: 3.15 +Contact: netdev@vger.kernel.org +Description: + Indicates the port number of this network device, formatted + as a decimal value. Some NICs have multiple independent ports + on the same PCI bus, device and function. This attribute allows + userspace to distinguish the respective interfaces. + + Note: some device drivers started to use 'dev_id' for this + purpose since long before 3.15 and have not adopted the new + attribute ever since. To query the port number, some tools look + exclusively at 'dev_port', while others only consult 'dev_id'. + If a network device has multiple client adapter ports as + described in the previous paragraph and does not set this + attribute to its port number, it's a kernel bug. + What: /sys/class/net//dormant Date: March 2006 KernelVersion: 2.6.17 @@ -117,7 +135,7 @@ Description: full: full duplex Note: This attribute is only valid for interfaces that implement - the ethtool get_settings method (mostly Ethernet). + the ethtool get_link_ksettings method (mostly Ethernet). What: /sys/class/net//flags Date: April 2005 @@ -224,7 +242,7 @@ Description: an integer representing the link speed in Mbits/sec. Note: this attribute is only valid for interfaces that implement - the ethtool get_settings method (mostly Ethernet ). + the ethtool get_link_ksettings method (mostly Ethernet). What: /sys/class/net//tx_queue_len Date: April 2005 diff --git a/Documentation/ABI/testing/sysfs-class-net-dsa b/Documentation/ABI/testing/sysfs-class-net-dsa new file mode 100644 index 000000000000..f240221e071e --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-net-dsa @@ -0,0 +1,7 @@ +What: /sys/class/net//tagging +Date: August 2018 +KernelVersion: 4.20 +Contact: netdev@vger.kernel.org +Description: + String indicating the type of tagging protocol used by the + DSA slave network device. diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 94a24aedcdb2..3ac41774ad3c 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -121,7 +121,22 @@ What: /sys/fs/f2fs//idle_interval Date: January 2016 Contact: "Jaegeuk Kim" Description: - Controls the idle timing. + Controls the idle timing for all paths other than + discard and gc path. + +What: /sys/fs/f2fs//discard_idle_interval +Date: September 2018 +Contact: "Chao Yu" +Contact: "Sahitya Tummala" +Description: + Controls the idle timing for discard path. + +What: /sys/fs/f2fs//gc_idle_interval +Date: September 2018 +Contact: "Chao Yu" +Contact: "Sahitya Tummala" +Description: + Controls the idle timing for gc path. What: /sys/fs/f2fs//iostat_enable Date: August 2017 diff --git a/Documentation/ABI/testing/sysfs-platform-lg-laptop b/Documentation/ABI/testing/sysfs-platform-lg-laptop new file mode 100644 index 000000000000..cf47749b19df --- /dev/null +++ b/Documentation/ABI/testing/sysfs-platform-lg-laptop @@ -0,0 +1,35 @@ +What: /sys/devices/platform/lg-laptop/reader_mode +Date: October 2018 +KernelVersion: 4.20 +Contact: "Matan Ziv-Av +Description: + Control reader mode. 1 means on, 0 means off. + +What: /sys/devices/platform/lg-laptop/fn_lock +Date: October 2018 +KernelVersion: 4.20 +Contact: "Matan Ziv-Av +Description: + Control FN lock mode. 1 means on, 0 means off. + +What: /sys/devices/platform/lg-laptop/battery_care_limit +Date: October 2018 +KernelVersion: 4.20 +Contact: "Matan Ziv-Av +Description: + Maximal battery charge level. Accepted values are 80 or 100. + +What: /sys/devices/platform/lg-laptop/fan_mode +Date: October 2018 +KernelVersion: 4.20 +Contact: "Matan Ziv-Av +Description: + Control fan mode. 1 for performance mode, 0 for silent mode. + +What: /sys/devices/platform/lg-laptop/usb_charge +Date: October 2018 +KernelVersion: 4.20 +Contact: "Matan Ziv-Av +Description: + Control USB port charging when device is turned off. + 1 means on, 0 means off. diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 2f813d644c69..18b7dc929234 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -99,7 +99,7 @@ Description: this file, the suspend image will be as small as possible. Reading from this file will display the current image size - limit, which is set to 500 MB by default. + limit, which is set to around 2/5 of available RAM by default. What: /sys/power/pm_trace Date: August 2006 diff --git a/Documentation/PCI/00-INDEX b/Documentation/PCI/00-INDEX deleted file mode 100644 index 206b1d5c1e71..000000000000 --- a/Documentation/PCI/00-INDEX +++ /dev/null @@ -1,26 +0,0 @@ -00-INDEX - - this file -acpi-info.txt - - info on how PCI host bridges are represented in ACPI -MSI-HOWTO.txt - - the Message Signaled Interrupts (MSI) Driver Guide HOWTO and FAQ. -PCIEBUS-HOWTO.txt - - a guide describing the PCI Express Port Bus driver -pci-error-recovery.txt - - info on PCI error recovery -pci-iov-howto.txt - - the PCI Express I/O Virtualization HOWTO -pci.txt - - info on the PCI subsystem for device driver authors -pcieaer-howto.txt - - the PCI Express Advanced Error Reporting Driver Guide HOWTO -endpoint/pci-endpoint.txt - - guide to add endpoint controller driver and endpoint function driver. -endpoint/pci-endpoint-cfs.txt - - guide to use configfs to configure the PCI endpoint function. -endpoint/pci-test-function.txt - - specification of *PCI test* function device. -endpoint/pci-test-howto.txt - - userguide for PCI endpoint test function. -endpoint/function/binding/ - - binding documentation for PCI endpoint function diff --git a/Documentation/PCI/endpoint/pci-test-howto.txt b/Documentation/PCI/endpoint/pci-test-howto.txt index e40cf0fb58d7..040479f437a5 100644 --- a/Documentation/PCI/endpoint/pci-test-howto.txt +++ b/Documentation/PCI/endpoint/pci-test-howto.txt @@ -99,17 +99,20 @@ Note that the devices listed here correspond to the value populated in 1.4 above 2.2 Using Endpoint Test function Device pcitest.sh added in tools/pci/ can be used to run all the default PCI endpoint -tests. Before pcitest.sh can be used pcitest.c should be compiled using the -following commands. +tests. To compile this tool the following commands should be used: - cd - make headers_install ARCH=arm - arm-linux-gnueabihf-gcc -Iusr/include tools/pci/pcitest.c -o pcitest - cp pcitest /usr/sbin/ - cp tools/pci/pcitest.sh + # cd + # make -C tools/pci + +or if you desire to compile and install in your system: + + # cd + # make -C tools/pci install + +The tool and script will be located in /usr/bin/ 2.2.1 pcitest.sh Output - # ./pcitest.sh + # pcitest.sh BAR tests BAR0: OKAY diff --git a/Documentation/PCI/pci-error-recovery.txt b/Documentation/PCI/pci-error-recovery.txt index 688b69121e82..0b6bb3ef449e 100644 --- a/Documentation/PCI/pci-error-recovery.txt +++ b/Documentation/PCI/pci-error-recovery.txt @@ -110,7 +110,7 @@ The actual steps taken by a platform to recover from a PCI error event will be platform-dependent, but will follow the general sequence described below. -STEP 0: Error Event: ERR_NONFATAL +STEP 0: Error Event ------------------- A PCI bus error is detected by the PCI hardware. On powerpc, the slot is isolated, in that all I/O is blocked: all reads return 0xffffffff, @@ -228,7 +228,13 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume Operations). If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform proceeds to STEP 4 (Slot Reset) -STEP 3: Slot Reset +STEP 3: Link Reset +------------------ +The platform resets the link. This is a PCI-Express specific step +and is done whenever a fatal error has been detected that can be +"solved" by resetting the link. + +STEP 4: Slot Reset ------------------ In response to a return value of PCI_ERS_RESULT_NEED_RESET, the @@ -314,7 +320,7 @@ Failure). >>> However, it probably should. -STEP 4: Resume Operations +STEP 5: Resume Operations ------------------------- The platform will call the resume() callback on all affected device drivers if all drivers on the segment have returned @@ -326,7 +332,7 @@ a result code. At this point, if a new error happens, the platform will restart a new error recovery sequence. -STEP 5: Permanent Failure +STEP 6: Permanent Failure ------------------------- A "permanent failure" has occurred, and the platform cannot recover the device. The platform will call error_detected() with a @@ -349,27 +355,6 @@ errors. See the discussion in powerpc/eeh-pci-error-recovery.txt for additional detail on real-life experience of the causes of software errors. -STEP 0: Error Event: ERR_FATAL -------------------- -PCI bus error is detected by the PCI hardware. On powerpc, the slot is -isolated, in that all I/O is blocked: all reads return 0xffffffff, all -writes are ignored. - -STEP 1: Remove devices --------------------- -Platform removes the devices depending on the error agent, it could be -this port for all subordinates or upstream component (likely downstream -port) - -STEP 2: Reset link --------------------- -The platform resets the link. This is a PCI-Express specific step and is -done whenever a fatal error has been detected that can be "solved" by -resetting the link. - -STEP 3: Re-enumerate the devices --------------------- -Initiates the re-enumeration. Conclusion; General Remarks --------------------------- diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX deleted file mode 100644 index f46980c060aa..000000000000 --- a/Documentation/RCU/00-INDEX +++ /dev/null @@ -1,34 +0,0 @@ -00-INDEX - - This file -arrayRCU.txt - - Using RCU to Protect Read-Mostly Arrays -checklist.txt - - Review Checklist for RCU Patches -listRCU.txt - - Using RCU to Protect Read-Mostly Linked Lists -lockdep.txt - - RCU and lockdep checking -lockdep-splat.txt - - RCU Lockdep splats explained. -NMI-RCU.txt - - Using RCU to Protect Dynamic NMI Handlers -rcu_dereference.txt - - Proper care and feeding of return values from rcu_dereference() -rcubarrier.txt - - RCU and Unloadable Modules -rculist_nulls.txt - - RCU list primitives for use with SLAB_TYPESAFE_BY_RCU -rcuref.txt - - Reference-count design for elements of lists/arrays protected by RCU -rcu.txt - - RCU Concepts -RTFP.txt - - List of RCU papers (bibliography) going back to 1980. -stallwarn.txt - - RCU CPU stall warnings (module parameter rcu_cpu_stall_suppress) -torture.txt - - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST) -UP.txt - - RCU on Uniprocessor Systems -whatisRCU.txt - - What is RCU? diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index f5120a00f511..1d2051c0c3fc 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -1227,9 +1227,11 @@ to overflow the counter, this approach corrects the CPU enters the idle loop from process context.

The ->dynticks field counts the corresponding -CPU's transitions to and from dyntick-idle mode, so that this counter -has an even value when the CPU is in dyntick-idle mode and an odd -value otherwise. +CPU's transitions to and from either dyntick-idle or user mode, so +that this counter has an even value when the CPU is in dyntick-idle +mode or user mode and an odd value otherwise. The transitions to/from +user mode need to be counted for user mode adaptive-ticks support +(see timers/NO_HZ.txt).

The ->rcu_need_heavy_qs field is used to record the fact that the RCU core code would really like to @@ -1372,8 +1374,7 @@ that is, if the CPU is currently idle. Accessor Functions

The following listing shows the -rcu_get_root(), rcu_for_each_node_breadth_first, -rcu_for_each_nonleaf_node_breadth_first(), and +rcu_get_root(), rcu_for_each_node_breadth_first and rcu_for_each_leaf_node() function and macros:

@@ -1386,13 +1387,9 @@ Accessor Functions
   7   for ((rnp) = &(rsp)->node[0]; \
   8        (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
   9
- 10 #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
- 11   for ((rnp) = &(rsp)->node[0]; \
- 12        (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
- 13
- 14 #define rcu_for_each_leaf_node(rsp, rnp) \
- 15   for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
- 16        (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+ 10 #define rcu_for_each_leaf_node(rsp, rnp) \
+ 11   for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
+ 12        (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
 

The rcu_get_root() simply returns a pointer to the @@ -1405,10 +1402,7 @@ macro takes advantage of the layout of the rcu_node structures in the rcu_state structure's ->node[] array, performing a breadth-first traversal by simply traversing the array in order. -The rcu_for_each_nonleaf_node_breadth_first() macro operates -similarly, but traverses only the first part of the array, thus excluding -the leaf rcu_node structures. -Finally, the rcu_for_each_leaf_node() macro traverses only +Similarly, the rcu_for_each_leaf_node() macro traverses only the last part of the array, thus traversing only the leaf rcu_node structures. @@ -1416,15 +1410,14 @@ the last part of the array, thus traversing only the leaf   Quick Quiz: - What do rcu_for_each_nonleaf_node_breadth_first() and + What does rcu_for_each_leaf_node() do if the rcu_node tree contains only a single node? Answer: In the single-node case, - rcu_for_each_nonleaf_node_breadth_first() is a no-op - and rcu_for_each_leaf_node() traverses the single node. + rcu_for_each_leaf_node() traverses the single node.   diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html index 7394f034be65..e62c7c34a369 100644 --- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html +++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html @@ -12,10 +12,9 @@ high efficiency and minimal disturbance, expedited grace periods accept lower efficiency and significant disturbance to attain shorter latencies.

-There are three flavors of RCU (RCU-bh, RCU-preempt, and RCU-sched), -but only two flavors of expedited grace periods because the RCU-bh -expedited grace period maps onto the RCU-sched expedited grace period. -Each of the remaining two implementations is covered in its own section. +There are two flavors of RCU (RCU-preempt and RCU-sched), with an earlier +third RCU-bh flavor having been implemented in terms of the other two. +Each of the two implementations is covered in its own section.

  1. @@ -158,7 +157,7 @@ whether or not the current CPU is in an RCU read-side critical section. The best that sync_sched_exp_handler() can do is to check for idle, on the off-chance that the CPU went idle while the IPI was in flight. -If the CPU is idle, then tt>sync_sched_exp_handler() reports +If the CPU is idle, then sync_sched_exp_handler() reports the quiescent state.

    diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 49690228b1c6..43c4e2f05f40 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1306,8 +1306,6 @@ doing so would degrade real-time response.

    This non-requirement appeared with preemptible RCU. -If you need a grace period that waits on non-preemptible code regions, use -RCU-sched.

    Parallelism Facts of Life

    @@ -2165,14 +2163,9 @@ however, this is not a panacea because there would be severe restrictions on what operations those callbacks could invoke.

    -Perhaps surprisingly, synchronize_rcu(), -synchronize_rcu_bh() -(discussed below), -synchronize_sched(), +Perhaps surprisingly, synchronize_rcu() and synchronize_rcu_expedited(), -synchronize_rcu_bh_expedited(), and -synchronize_sched_expedited() -will all operate normally +will operate normally during very early boot, the reason being that there is only one CPU and preemption is disabled. This means that the call synchronize_rcu() (or friends) @@ -2269,12 +2262,23 @@ Thankfully, RCU update-side primitives, including The name notwithstanding, some Linux-kernel architectures can have nested NMIs, which RCU must handle correctly. Andy Lutomirski -surprised me +surprised me with this requirement; he also kindly surprised me with -an algorithm +an algorithm that meets this requirement. +

    +Furthermore, NMI handlers can be interrupted by what appear to RCU +to be normal interrupts. +One way that this can happen is for code that directly invokes +rcu_irq_enter() and rcu_irq_exit() to be called +from an NMI handler. +This astonishing fact of life prompted the current code structure, +which has rcu_irq_enter() invoking rcu_nmi_enter() +and rcu_irq_exit() invoking rcu_nmi_exit(). +And yes, I also learned of this requirement the hard way. +

    Loadable Modules

    @@ -2394,30 +2398,9 @@ when invoked from a CPU-hotplug notifier.

    RCU depends on the scheduler, and the scheduler uses RCU to protect some of its data structures. -This means the scheduler is forbidden from acquiring -the runqueue locks and the priority-inheritance locks -in the middle of an outermost RCU read-side critical section unless either -(1) it releases them before exiting that same -RCU read-side critical section, or -(2) interrupts are disabled across -that entire RCU read-side critical section. -This same prohibition also applies (recursively!) to any lock that is acquired -while holding any lock to which this prohibition applies. -Adhering to this rule prevents preemptible RCU from invoking -rcu_read_unlock_special() while either runqueue or -priority-inheritance locks are held, thus avoiding deadlock. - -

    -Prior to v4.4, it was only necessary to disable preemption across -RCU read-side critical sections that acquired scheduler locks. -In v4.4, expedited grace periods started using IPIs, and these -IPIs could force a rcu_read_unlock() to take the slowpath. -Therefore, this expedited-grace-period change required disabling of -interrupts, not just preemption. - -

    -For RCU's part, the preemptible-RCU rcu_read_unlock() -implementation must be written carefully to avoid similar deadlocks. +The preemptible-RCU rcu_read_unlock() +implementation must therefore be written carefully to avoid deadlocks +involving the scheduler's runqueue and priority-inheritance locks. In particular, rcu_read_unlock() must tolerate an interrupt where the interrupt handler invokes both rcu_read_lock() and rcu_read_unlock(). @@ -2426,7 +2409,7 @@ negative nesting levels to avoid destructive recursion via interrupt handler's use of RCU.

    -This pair of mutual scheduler-RCU requirements came as a +This scheduler-RCU requirement came as a complete surprise.

    @@ -2437,9 +2420,28 @@ when running context-switch-heavy workloads when built with CONFIG_NO_HZ_FULL=y did come as a surprise [PDF]. RCU has made good progress towards meeting this requirement, even -for context-switch-have CONFIG_NO_HZ_FULL=y workloads, +for context-switch-heavy CONFIG_NO_HZ_FULL=y workloads, but there is room for further improvement. +

    +In the past, it was forbidden to disable interrupts across an +rcu_read_unlock() unless that interrupt-disabled region +of code also included the matching rcu_read_lock(). +Violating this restriction could result in deadlocks involving the +scheduler's runqueue and priority-inheritance spinlocks. +This restriction was lifted when interrupt-disabled calls to +rcu_read_unlock() started deferring the reporting of +the resulting RCU-preempt quiescent state until the end of that +interrupts-disabled region. +This deferred reporting means that the scheduler's runqueue and +priority-inheritance locks cannot be held while reporting an RCU-preempt +quiescent state, which lifts the earlier restriction, at least from +a deadlock perspective. +Unfortunately, real-time systems using RCU priority boosting may +need this restriction to remain in effect because deferred +quiescent-state reporting also defers deboosting, which in turn +degrades real-time latencies. +

    Tracing and RCU

    @@ -2850,15 +2852,22 @@ The other four flavors are listed below, with requirements for each described in a separate section.

      -
    1. Bottom-Half Flavor -
    2. Sched Flavor +
    3. Bottom-Half Flavor (Historical) +
    4. Sched Flavor (Historical)
    5. Sleepable RCU
    6. Tasks RCU -
    7. - Waiting for Multiple Grace Periods
    -

    Bottom-Half Flavor

    +

    Bottom-Half Flavor (Historical)

    + +

    +The RCU-bh flavor of RCU has since been expressed in terms of +the other RCU flavors as part of a consolidation of the three +flavors into a single flavor. +The read-side API remains, and continues to disable softirq and to +be accounted for by lockdep. +Much of the material in this section is therefore strictly historical +in nature.

    The softirq-disable (AKA “bottom-half”, @@ -2918,8 +2927,20 @@ includes call_rcu_bh(), rcu_barrier_bh(), and rcu_read_lock_bh_held(). +However, the update-side APIs are now simple wrappers for other RCU +flavors, namely RCU-sched in CONFIG_PREEMPT=n kernels and RCU-preempt +otherwise. + +

    Sched Flavor (Historical)

    -

    Sched Flavor

    +

    +The RCU-sched flavor of RCU has since been expressed in terms of +the other RCU flavors as part of a consolidation of the three +flavors into a single flavor. +The read-side API remains, and continues to disable preemption and to +be accounted for by lockdep. +Much of the material in this section is therefore strictly historical +in nature.

    Before preemptible RCU, waiting for an RCU grace period had the @@ -3139,94 +3160,14 @@ The tasks-RCU API is quite compact, consisting only of call_rcu_tasks(), synchronize_rcu_tasks(), and rcu_barrier_tasks(). - -

    -Waiting for Multiple Grace Periods

    - -

    -Perhaps you have an RCU protected data structure that is accessed from -RCU read-side critical sections, from softirq handlers, and from -hardware interrupt handlers. -That is three flavors of RCU, the normal flavor, the bottom-half flavor, -and the sched flavor. -How to wait for a compound grace period? - -

    -The best approach is usually to “just say no!” and -insert rcu_read_lock() and rcu_read_unlock() -around each RCU read-side critical section, regardless of what -environment it happens to be in. -But suppose that some of the RCU read-side critical sections are -on extremely hot code paths, and that use of CONFIG_PREEMPT=n -is not a viable option, so that rcu_read_lock() and -rcu_read_unlock() are not free. -What then? - -

    -You could wait on all three grace periods in succession, as follows: - -

    -
    - 1 synchronize_rcu();
    - 2 synchronize_rcu_bh();
    - 3 synchronize_sched();
    -
    -
    - -

    -This works, but triples the update-side latency penalty. -In cases where this is not acceptable, synchronize_rcu_mult() -may be used to wait on all three flavors of grace period concurrently: - -

    -
    - 1 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched);
    -
    -
    - -

    -But what if it is necessary to also wait on SRCU? -This can be done as follows: - -

    -
    - 1 static void call_my_srcu(struct rcu_head *head,
    - 2        void (*func)(struct rcu_head *head))
    - 3 {
    - 4   call_srcu(&my_srcu, head, func);
    - 5 }
    - 6
    - 7 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched, call_my_srcu);
    -
    -
    - -

    -If you needed to wait on multiple different flavors of SRCU -(but why???), you would need to create a wrapper function resembling -call_my_srcu() for each SRCU flavor. - - - - - - - - -
     
    Quick Quiz:
    - But what if I need to wait for multiple RCU flavors, but I also need - the grace periods to be expedited? -
    Answer:
    - If you are using expedited grace periods, there should be less penalty - for waiting on them in succession. - But if that is nevertheless a problem, you can use workqueues - or multiple kthreads to wait on the various expedited grace - periods concurrently. -
     
    - -

    -Again, it is usually better to adjust the RCU read-side critical sections -to use a single flavor of RCU, but when this is not feasible, you can use -synchronize_rcu_mult(). +In CONFIG_PREEMPT=n kernels, trampolines cannot be preempted, +so these APIs map to +call_rcu(), +synchronize_rcu(), and +rcu_barrier(), respectively. +In CONFIG_PREEMPT=y kernels, trampolines can be preempted, +and these three APIs are therefore implemented by separate functions +that check for voluntary context switches.

    Possible Future Changes

    @@ -3237,12 +3178,6 @@ If this becomes a serious problem, it will be necessary to rework the grace-period state machine so as to avoid the need for the additional latency. -

    -Expedited grace periods scan the CPUs, so their latency and overhead -increases with increasing numbers of CPUs. -If this becomes a serious problem on large systems, it will be necessary -to do some redesign to avoid this scalability problem. -

    RCU disables CPU hotplug in a few places, perhaps most notably in the rcu_barrier() operations. @@ -3287,11 +3222,6 @@ Please note that arrangements that require RCU to remap CPU numbers will require extremely good demonstration of need and full exploration of alternatives. -

    -There is an embarrassingly large number of flavors of RCU, and this -number has been increasing over time. -Perhaps it will be possible to combine some at some future date. -

    RCU's various kthreads are reasonably recent additions. It is quite likely that adjustments will be required to more gracefully diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt index 7d4ae110c2c9..721b3e426515 100644 --- a/Documentation/RCU/rcu.txt +++ b/Documentation/RCU/rcu.txt @@ -87,7 +87,3 @@ o Where can I find more information on RCU? See the RTFP.txt file in this directory. Or point your browser at http://www.rdrop.com/users/paulmck/RCU/. - -o What are all these files in this directory? - - See 00-INDEX for the list. diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index f99cf11b314b..491043fd976f 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -16,12 +16,9 @@ o A CPU looping in an RCU read-side critical section. o A CPU looping with interrupts disabled. -o A CPU looping with preemption disabled. This condition can - result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh - stalls. +o A CPU looping with preemption disabled. -o A CPU looping with bottom halves disabled. This condition can - result in RCU-sched and RCU-bh stalls. +o A CPU looping with bottom halves disabled. o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel without invoking schedule(). If the looping in the kernel is @@ -87,9 +84,9 @@ o A hardware failure. This is quite unlikely, but has occurred This resulted in a series of RCU CPU stall warnings, eventually leading the realization that the CPU had failed. -The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall -warning. Note that SRCU does -not- have CPU stall warnings. Please note -that RCU only detects CPU stalls when there is a grace period in progress. +The RCU, RCU-sched, and RCU-tasks implementations have CPU stall warning. +Note that SRCU does -not- have CPU stall warnings. Please note that +RCU only detects CPU stalls when there is a grace period in progress. No grace period, no CPU stall warnings. To diagnose the cause of the stall, inspect the stack traces. diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index c2a7facf7ff9..86d82f7f3500 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -934,7 +934,8 @@ c. Do you need to treat NMI handlers, hardirq handlers, d. Do you need RCU grace periods to complete even in the face of softirq monopolization of one or more of the CPUs? For example, is your code subject to network-based denial-of-service - attacks? If so, you need RCU-bh. + attacks? If so, you should disable softirq across your readers, + for example, by using rcu_read_lock_bh(). e. Is your workload too update-intensive for normal use of RCU, but inappropriate for other synchronization mechanisms? diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt new file mode 100644 index 000000000000..b8ca28b60215 --- /dev/null +++ b/Documentation/accounting/psi.txt @@ -0,0 +1,73 @@ +================================ +PSI - Pressure Stall Information +================================ + +:Date: April, 2018 +:Author: Johannes Weiner + +When CPU, memory or IO devices are contended, workloads experience +latency spikes, throughput losses, and run the risk of OOM kills. + +Without an accurate measure of such contention, users are forced to +either play it safe and under-utilize their hardware resources, or +roll the dice and frequently suffer the disruptions resulting from +excessive overcommit. + +The psi feature identifies and quantifies the disruptions caused by +such resource crunches and the time impact it has on complex workloads +or even entire systems. + +Having an accurate measure of productivity losses caused by resource +scarcity aids users in sizing workloads to hardware--or provisioning +hardware according to workload demand. + +As psi aggregates this information in realtime, systems can be managed +dynamically using techniques such as load shedding, migrating jobs to +other systems or data centers, or strategically pausing or killing low +priority or restartable batch jobs. + +This allows maximizing hardware utilization without sacrificing +workload health or risking major disruptions such as OOM kills. + +Pressure interface +================== + +Pressure information for each resource is exported through the +respective file in /proc/pressure/ -- cpu, memory, and io. + +The format for CPU is as such: + +some avg10=0.00 avg60=0.00 avg300=0.00 total=0 + +and for memory and IO: + +some avg10=0.00 avg60=0.00 avg300=0.00 total=0 +full avg10=0.00 avg60=0.00 avg300=0.00 total=0 + +The "some" line indicates the share of time in which at least some +tasks are stalled on a given resource. + +The "full" line indicates the share of time in which all non-idle +tasks are stalled on a given resource simultaneously. In this state +actual CPU cycles are going to waste, and a workload that spends +extended time in this state is considered to be thrashing. This has +severe impact on performance, and it's useful to distinguish this +situation from a state where some tasks are stalled but the CPU is +still doing productive work. As such, time spent in this subset of the +stall state is tracked separately and exported in the "full" averages. + +The ratios are tracked as recent trends over ten, sixty, and three +hundred second windows, which gives insight into short term events as +well as medium and long term trends. The total absolute stall time is +tracked and exported as well, to allow detection of latency spikes +which wouldn't necessarily make a dent in the time averages, or to +average trends over custom time frames. + +Cgroup2 interface +================= + +In a system with a CONFIG_CGROUP=y kernel and the cgroup2 filesystem +mounted, pressure stall information is also tracked for tasks grouped +into cgroups. Each subdirectory in the cgroupfs mountpoint contains +cpu.pressure, memory.pressure, and io.pressure files; the format is +the same as the /proc/pressure/ files. diff --git a/Documentation/admin-guide/LSM/Yama.rst b/Documentation/admin-guide/LSM/Yama.rst index 13468ea696b7..d0a060de3973 100644 --- a/Documentation/admin-guide/LSM/Yama.rst +++ b/Documentation/admin-guide/LSM/Yama.rst @@ -64,8 +64,8 @@ The sysctl settings (writable only with ``CAP_SYS_PTRACE``) are: Using ``PTRACE_TRACEME`` is unchanged. 2 - admin-only attach: - only processes with ``CAP_SYS_PTRACE`` may use ptrace - with ``PTRACE_ATTACH``, or through children calling ``PTRACE_TRACEME``. + only processes with ``CAP_SYS_PTRACE`` may use ptrace, either with + ``PTRACE_ATTACH`` or through children calling ``PTRACE_TRACEME``. 3 - no attach: no processes may use ptrace with ``PTRACE_ATTACH`` nor via diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst index 15ea785b2dfa..0797eec76be1 100644 --- a/Documentation/admin-guide/README.rst +++ b/Documentation/admin-guide/README.rst @@ -51,8 +51,7 @@ Documentation - There are various README files in the Documentation/ subdirectory: these typically contain kernel-specific installation notes for some - drivers for example. See Documentation/00-INDEX for a list of what - is contained in each file. Please read the + drivers for example. Please read the :ref:`Documentation/process/changes.rst ` file, as it contains information about the problems, which may result by upgrading your kernel. diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 184193bcb262..476722b7b636 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -966,6 +966,12 @@ All time durations are in microseconds. $PERIOD duration. "max" for $MAX indicates no limit. If only one number is written, $MAX is updated. + cpu.pressure + A read-only nested-key file which exists on non-root cgroups. + + Shows pressure stall information for CPU. See + Documentation/accounting/psi.txt for details. + Memory ------ @@ -1127,6 +1133,10 @@ PAGE_SIZE multiple when read back. disk readahead. For now OOM in memory cgroup kills tasks iff shortage has happened inside page fault. + This event is not raised if the OOM killer is not + considered as an option, e.g. for failed high-order + allocations. + oom_kill The number of processes belonging to this cgroup killed by any kind of OOM killer. @@ -1271,6 +1281,12 @@ PAGE_SIZE multiple when read back. higher than the limit for an extended period of time. This reduces the impact on the workload and memory management. + memory.pressure + A read-only nested-key file which exists on non-root cgroups. + + Shows pressure stall information for memory. See + Documentation/accounting/psi.txt for details. + Usage Guidelines ~~~~~~~~~~~~~~~~ @@ -1408,6 +1424,12 @@ IO Interface Files 8:16 rbps=2097152 wbps=max riops=max wiops=max + io.pressure + A read-only nested-key file which exists on non-root cgroups. + + Shows pressure stall information for IO. See + Documentation/accounting/psi.txt for details. + Writeback ~~~~~~~~~ diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 92eb1f42240d..81d1d5a74728 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -856,6 +856,11 @@ causing system reset or hang due to sending INIT from AP to BSP. + disable_counter_freezing [HW] + Disable Intel PMU counter freezing feature. + The feature only exists starting from + Arch Perfmon v4 (Skylake and newer). + disable_ddw [PPC/PSERIES] Disable Dynamic DMA Window support. Use this if to workaround buggy firmware. @@ -1063,7 +1068,7 @@ earlyprintk=serial[,0x...[,baudrate]] earlyprintk=ttySn[,baudrate] earlyprintk=dbgp[debugController#] - earlyprintk=pciserial,bus:device.function[,baudrate] + earlyprintk=pciserial[,force],bus:device.function[,baudrate] earlyprintk=xdbc[xhciController#] earlyprintk is useful when the kernel crashes before @@ -1095,6 +1100,10 @@ The sclp output can only be used on s390. + The optional "force" to "pciserial" enables use of a + PCI device even when its classcode is not of the + UART class. + edac_report= [HW,EDAC] Control how to report EDAC event Format: {"on" | "off" | "force"} on: enable EDAC to report H/W event. May be overridden @@ -1385,6 +1394,11 @@ hvc_iucv_allow= [S390] Comma-separated list of z/VM user IDs. If specified, z/VM IUCV HVC accepts connections from listed z/VM user IDs only. + + hv_nopvspin [X86,HYPER_V] Disables the paravirt spinlock optimizations + which allow the hypervisor to 'idle' the + guest on lock contention. + keep_bootcon [KNL] Do not unregister boot console at start. This is only useful for debugging when something happens in the window @@ -1749,12 +1763,24 @@ nobypass [PPC/POWERNV] Disable IOMMU bypass, using IOMMU for PCI devices. + iommu.strict= [ARM64] Configure TLB invalidation behaviour + Format: { "0" | "1" } + 0 - Lazy mode. + Request that DMA unmap operations use deferred + invalidation of hardware TLBs, for increased + throughput at the cost of reduced device isolation. + Will fall back to strict mode if not supported by + the relevant IOMMU driver. + 1 - Strict mode (default). + DMA unmap operations invalidate IOMMU hardware TLBs + synchronously. + iommu.passthrough= [ARM64] Configure DMA to bypass the IOMMU by default. Format: { "0" | "1" } 0 - Use IOMMU translation for DMA. 1 - Bypass the IOMMU for DMA. - unset - Use IOMMU translation for DMA. + unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH. io7= [HW] IO7 for Marvel based alpha systems See comment before marvel_specify_io7 in @@ -2274,6 +2300,8 @@ ltpc= [NET] Format: ,, + lsm.debug [SECURITY] Enable LSM initialization debugging output. + machvec= [IA-64] Force the use of a particular machine-vector (machvec) in a generic kernel. Example: machvec=hpzx1_swiotlb @@ -2404,7 +2432,7 @@ seconds. Use this parameter to check at some other rate. 0 disables periodic checking. - memtest= [KNL,X86,ARM] Enable memtest + memtest= [KNL,X86,ARM,PPC] Enable memtest Format: default : 0 Specifies the number of memtest passes to be @@ -3540,14 +3568,14 @@ In kernels built with CONFIG_RCU_NOCB_CPU=y, set the specified list of CPUs to be no-callback CPUs. - Invocation of these CPUs' RCU callbacks will - be offloaded to "rcuox/N" kthreads created for - that purpose, where "x" is "b" for RCU-bh, "p" - for RCU-preempt, and "s" for RCU-sched, and "N" - is the CPU number. This reduces OS jitter on the - offloaded CPUs, which can be useful for HPC and - real-time workloads. It can also improve energy - efficiency for asymmetric multiprocessors. + Invocation of these CPUs' RCU callbacks will be + offloaded to "rcuox/N" kthreads created for that + purpose, where "x" is "p" for RCU-preempt, and + "s" for RCU-sched, and "N" is the CPU number. + This reduces OS jitter on the offloaded CPUs, + which can be useful for HPC and real-time + workloads. It can also improve energy efficiency + for asymmetric multiprocessors. rcu_nocb_poll [KNL] Rather than requiring that offloaded CPUs @@ -3601,7 +3629,14 @@ Set required age in jiffies for a given grace period before RCU starts soliciting quiescent-state help from - rcu_note_context_switch(). + rcu_note_context_switch(). If not specified, the + kernel will calculate a value based on the most + recent settings of rcutree.jiffies_till_first_fqs + and rcutree.jiffies_till_next_fqs. + This calculated value may be viewed in + rcutree.jiffies_to_sched_qs. Any attempt to + set rcutree.jiffies_to_sched_qs will be + cheerfully overwritten. rcutree.jiffies_till_first_fqs= [KNL] Set delay from grace-period initialization to @@ -3869,12 +3904,6 @@ rcupdate.rcu_self_test= [KNL] Run the RCU early boot self tests - rcupdate.rcu_self_test_bh= [KNL] - Run the RCU bh early boot self tests - - rcupdate.rcu_self_test_sched= [KNL] - Run the RCU sched early boot self tests - rdinit= [KNL] Format: Run specified binary instead of /init from the ramdisk, @@ -4610,7 +4639,8 @@ usbcore.old_scheme_first= [USB] Start with the old device initialization - scheme (default 0 = off). + scheme, applies only to low and full-speed devices + (default 0 = off). usbcore.usbfs_memory_mb= [USB] Memory limit (in MB) for buffers allocated by @@ -4825,6 +4855,18 @@ This is actually a boot loader parameter; the value is passed to the kernel using a special protocol. + vm_debug[=options] [KNL] Available with CONFIG_DEBUG_VM=y. + May slow down system boot speed, especially when + enabled on systems with a large amount of memory. + All options are enabled by default, and this + interface is meant to allow for selectively + enabling or disabling specific virtual memory + debugging features. + + Available options are: + P Enable page structure init time poisoning + - Disable all of the above options + vmalloc=nn[KMG] [KNL,BOOT] Forces the vmalloc area to have an exact size of . This can be used to increase the minimum size (128MB on x86). It can also be used to diff --git a/Documentation/admin-guide/l1tf.rst b/Documentation/admin-guide/l1tf.rst index bae52b845de0..b85dd80510b0 100644 --- a/Documentation/admin-guide/l1tf.rst +++ b/Documentation/admin-guide/l1tf.rst @@ -553,7 +553,7 @@ When nested virtualization is in use, three operating systems are involved: the bare metal hypervisor, the nested hypervisor and the nested virtual machine. VMENTER operations from the nested hypervisor into the nested guest will always be processed by the bare metal hypervisor. If KVM is the -bare metal hypervisor it wiil: +bare metal hypervisor it will: - Flush the L1D cache on every switch from the nested hypervisor to the nested virtual machine, so that the nested hypervisor's secrets are not diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index ceead68c2df7..8edb35f11317 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -29,6 +29,7 @@ the Linux memory management. hugetlbpage idle_page_tracking ksm + memory-hotplug numa_memory_policy pagemap soft-dirty diff --git a/Documentation/memory-hotplug.txt b/Documentation/admin-guide/mm/memory-hotplug.rst similarity index 73% rename from Documentation/memory-hotplug.txt rename to Documentation/admin-guide/mm/memory-hotplug.rst index 7f49ebf3ddb2..5c4432c96c4b 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -1,47 +1,29 @@ +.. _admin_guide_memory_hotplug: + ============== Memory Hotplug ============== :Created: Jul 28 2007 -:Updated: Add description of notifier of memory hotplug: Oct 11 2007 +:Updated: Add some details about locking internals: Aug 20 2018 This document is about memory hotplug including how-to-use and current status. Because Memory Hotplug is still under development, contents of this text will be changed often. -.. CONTENTS - - 1. Introduction - 1.1 purpose of memory hotplug - 1.2. Phases of memory hotplug - 1.3. Unit of Memory online/offline operation - 2. Kernel Configuration - 3. sysfs files for memory hotplug - 4. Physical memory hot-add phase - 4.1 Hardware(Firmware) Support - 4.2 Notify memory hot-add event by hand - 5. Logical Memory hot-add phase - 5.1. State of memory - 5.2. How to online memory - 6. Logical memory remove - 6.1 Memory offline and ZONE_MOVABLE - 6.2. How to offline memory - 7. Physical memory remove - 8. Memory hotplug event notifier - 9. Future Work List - +.. contents:: :local: .. note:: (1) x86_64's has special implementation for memory hotplug. This text does not describe it. - (2) This text assumes that sysfs is mounted at /sys. + (2) This text assumes that sysfs is mounted at ``/sys``. Introduction ============ -purpose of memory hotplug +Purpose of memory hotplug ------------------------- Memory Hotplug allows users to increase/decrease the amount of memory. @@ -57,7 +39,6 @@ hardware which supports memory power management. Linux memory hotplug is designed for both purpose. - Phases of memory hotplug ------------------------ @@ -92,7 +73,6 @@ phase by hand. (However, if you writes udev's hotplug scripts for memory hotplug, these phases can be execute in seamless way.) - Unit of Memory online/offline operation --------------------------------------- @@ -107,10 +87,9 @@ unit upon which memory online/offline operations are to be performed. The default size of a memory block is the same as memory section size unless an architecture specifies otherwise. (see :ref:`memory_hotplug_sysfs_files`.) -To determine the size (in bytes) of a memory block please read this file: - -/sys/devices/system/memory/block_size_bytes +To determine the size (in bytes) of a memory block please read this file:: + /sys/devices/system/memory/block_size_bytes Kernel Configuration ==================== @@ -119,22 +98,22 @@ To use memory hotplug feature, kernel must be compiled with following config options. - For all memory hotplug: - - Memory model -> Sparse Memory (CONFIG_SPARSEMEM) - - Allow for memory hot-add (CONFIG_MEMORY_HOTPLUG) + - Memory model -> Sparse Memory (``CONFIG_SPARSEMEM``) + - Allow for memory hot-add (``CONFIG_MEMORY_HOTPLUG``) - To enable memory removal, the following are also necessary: - - Allow for memory hot remove (CONFIG_MEMORY_HOTREMOVE) - - Page Migration (CONFIG_MIGRATION) + - Allow for memory hot remove (``CONFIG_MEMORY_HOTREMOVE``) + - Page Migration (``CONFIG_MIGRATION``) - For ACPI memory hotplug, the following are also necessary: - - Memory hotplug (under ACPI Support menu) (CONFIG_ACPI_HOTPLUG_MEMORY) + - Memory hotplug (under ACPI Support menu) (``CONFIG_ACPI_HOTPLUG_MEMORY``) - This option can be kernel module. - As a related configuration, if your box has a feature of NUMA-node hotplug via ACPI, then this option is necessary too. - ACPI0004,PNP0A05 and PNP0A06 Container Driver (under ACPI Support menu) - (CONFIG_ACPI_CONTAINER). + (``CONFIG_ACPI_CONTAINER``). This option can be kernel module too. @@ -145,10 +124,11 @@ sysfs files for memory hotplug ============================== All memory blocks have their device information in sysfs. Each memory block -is described under /sys/devices/system/memory as: +is described under ``/sys/devices/system/memory`` as:: /sys/devices/system/memory/memoryXXX - (XXX is the memory block id.) + +where XXX is the memory block id. For the memory block covered by the sysfs directory. It is expected that all memory sections in this range are present and no memory holes exist in the @@ -157,7 +137,7 @@ the existence of one should not affect the hotplug capabilities of the memory block. For example, assume 1GiB memory block size. A device for a memory starting at -0x100000000 is /sys/device/system/memory/memory4:: +0x100000000 is ``/sys/device/system/memory/memory4``:: (0x100000000 / 1Gib = 4) @@ -165,11 +145,11 @@ This device covers address range [0x100000000 ... 0x140000000) Under each memory block, you can see 5 files: -- /sys/devices/system/memory/memoryXXX/phys_index -- /sys/devices/system/memory/memoryXXX/phys_device -- /sys/devices/system/memory/memoryXXX/state -- /sys/devices/system/memory/memoryXXX/removable -- /sys/devices/system/memory/memoryXXX/valid_zones +- ``/sys/devices/system/memory/memoryXXX/phys_index`` +- ``/sys/devices/system/memory/memoryXXX/phys_device`` +- ``/sys/devices/system/memory/memoryXXX/state`` +- ``/sys/devices/system/memory/memoryXXX/removable`` +- ``/sys/devices/system/memory/memoryXXX/valid_zones`` =================== ============================================================ ``phys_index`` read-only and contains memory block id, same as XXX. @@ -207,13 +187,15 @@ Under each memory block, you can see 5 files: These directories/files appear after physical memory hotplug phase. If CONFIG_NUMA is enabled the memoryXXX/ directories can also be accessed -via symbolic links located in the /sys/devices/system/node/node* directories. +via symbolic links located in the ``/sys/devices/system/node/node*`` directories. + +For example:: -For example: -/sys/devices/system/node/node0/memory9 -> ../../memory/memory9 + /sys/devices/system/node/node0/memory9 -> ../../memory/memory9 -A backlink will also be created: -/sys/devices/system/memory/memory9/node0 -> ../../node/node0 +A backlink will also be created:: + + /sys/devices/system/memory/memory9/node0 -> ../../node/node0 .. _memory_hotplug_physical_mem: @@ -240,7 +222,6 @@ If firmware supports NUMA-node hotplug, and defines an object _HID "ACPI0004", calls hotplug code for all of objects which are defined in it. If memory device is found, memory hotplug code will be called. - Notify memory hot-add event by hand ----------------------------------- @@ -251,8 +232,9 @@ CONFIG_ARCH_MEMORY_PROBE and can be configured on powerpc, sh, and x86 if hotplug is supported, although for x86 this should be handled by ACPI notification. -Probe interface is located at -/sys/devices/system/memory/probe +Probe interface is located at:: + + /sys/devices/system/memory/probe You can tell the physical address of new memory to the kernel by:: @@ -263,7 +245,6 @@ memory_block_size] memory range is hot-added. In this case, hotplug script is not called (in current implementation). You'll have to online memory by yourself. Please see :ref:`memory_hotplug_how_to_online_memory`. - Logical Memory hot-add phase ============================ @@ -301,7 +282,7 @@ This sets a global policy and impacts all memory blocks that will subsequently be hotplugged. Currently offline blocks keep their state. It is possible, under certain circumstances, that some memory blocks will be added but will fail to online. User space tools can check their "state" files -(/sys/devices/system/memory/memoryXXX/state) and try to online them manually. +(``/sys/devices/system/memory/memoryXXX/state``) and try to online them manually. If the automatic onlining wasn't requested, failed, or some memory block was offlined it is possible to change the individual block's state by writing to the @@ -334,8 +315,6 @@ available memory will be increased. This may be changed in future. - - Logical memory remove ===================== @@ -413,87 +392,45 @@ Need more implementation yet.... - Notification completion of remove works by OS to firmware. - Guard from remove if not yet. -Memory hotplug event notifier -============================= - -Hotplugging events are sent to a notification queue. - -There are six types of notification defined in include/linux/memory.h: - -MEM_GOING_ONLINE - Generated before new memory becomes available in order to be able to - prepare subsystems to handle memory. The page allocator is still unable - to allocate from the new memory. - -MEM_CANCEL_ONLINE - Generated if MEMORY_GOING_ONLINE fails. - -MEM_ONLINE - Generated when memory has successfully brought online. The callback may - allocate pages from the new memory. - -MEM_GOING_OFFLINE - Generated to begin the process of offlining memory. Allocations are no - longer possible from the memory but some of the memory to be offlined - is still in use. The callback can be used to free memory known to a - subsystem from the indicated memory block. - -MEM_CANCEL_OFFLINE - Generated if MEMORY_GOING_OFFLINE fails. Memory is available again from - the memory block that we attempted to offline. - -MEM_OFFLINE - Generated after offlining memory is complete. - -A callback routine can be registered by calling:: - - hotplug_memory_notifier(callback_func, priority) - -Callback functions with higher values of priority are called before callback -functions with lower values. -A callback function must have the following prototype:: +Locking Internals +================= - int callback_func( - struct notifier_block *self, unsigned long action, void *arg); +When adding/removing memory that uses memory block devices (i.e. ordinary RAM), +the device_hotplug_lock should be held to: -The first argument of the callback function (self) is a pointer to the block -of the notifier chain that points to the callback function itself. -The second argument (action) is one of the event types described above. -The third argument (arg) passes a pointer of struct memory_notify:: +- synchronize against online/offline requests (e.g. via sysfs). This way, memory + block devices can only be accessed (.online/.state attributes) by user + space once memory has been fully added. And when removing memory, we + know nobody is in critical sections. +- synchronize against CPU hotplug and similar (e.g. relevant for ACPI and PPC) - struct memory_notify { - unsigned long start_pfn; - unsigned long nr_pages; - int status_change_nid_normal; - int status_change_nid_high; - int status_change_nid; - } +Especially, there is a possible lock inversion that is avoided using +device_hotplug_lock when adding memory and user space tries to online that +memory faster than expected: -- start_pfn is start_pfn of online/offline memory. -- nr_pages is # of pages of online/offline memory. -- status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask - is (will be) set/clear, if this is -1, then nodemask status is not changed. -- status_change_nid_high is set node id when N_HIGH_MEMORY of nodemask - is (will be) set/clear, if this is -1, then nodemask status is not changed. -- status_change_nid is set node id when N_MEMORY of nodemask is (will be) - set/clear. It means a new(memoryless) node gets new memory by online and a - node loses all memory. If this is -1, then nodemask status is not changed. +- device_online() will first take the device_lock(), followed by + mem_hotplug_lock +- add_memory_resource() will first take the mem_hotplug_lock, followed by + the device_lock() (while creating the devices, during bus_add_device()). - If status_changed_nid* >= 0, callback should create/discard structures for the - node if necessary. +As the device is visible to user space before taking the device_lock(), this +can result in a lock inversion. -The callback routine shall return one of the values -NOTIFY_DONE, NOTIFY_OK, NOTIFY_BAD, NOTIFY_STOP -defined in include/linux/notifier.h +onlining/offlining of memory should be done via device_online()/ +device_offline() - to make sure it is properly synchronized to actions +via sysfs. Holding device_hotplug_lock is advised (to e.g. protect online_type) -NOTIFY_DONE and NOTIFY_OK have no effect on the further processing. +When adding/removing/onlining/offlining memory or adding/removing +heterogeneous/device memory, we should always hold the mem_hotplug_lock in +write mode to serialise memory hotplug (e.g. access to global/zone +variables). -NOTIFY_BAD is used as response to the MEM_GOING_ONLINE, MEM_GOING_OFFLINE, -MEM_ONLINE, or MEM_OFFLINE action to cancel hotplugging. It stops -further processing of the notification queue. +In addition, mem_hotplug_lock (in contrast to device_hotplug_lock) in read +mode allows for a quite efficient get_online_mems/put_online_mems +implementation, so code accessing memory can protect from that memory +vanishing. -NOTIFY_STOP stops further processing of the notification queue. Future Work =========== diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst index 8f1d3de449b5..ac6f5c597a56 100644 --- a/Documentation/admin-guide/pm/intel_pstate.rst +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -465,6 +465,13 @@ Next, the following policy attributes have special meaning if policy for the time interval between the last two invocations of the driver's utilization update callback by the CPU scheduler for that CPU. +One more policy attribute is present if the `HWP feature is enabled in the +processor `_: + +``base_frequency`` + Shows the base frequency of the CPU. Any frequency above this will be + in the turbo frequency range. + The meaning of these attributes in the `passive mode `_ is the same as for other scaling drivers. diff --git a/Documentation/admin-guide/security-bugs.rst b/Documentation/admin-guide/security-bugs.rst index 30491d91e93d..164bf71149fd 100644 --- a/Documentation/admin-guide/security-bugs.rst +++ b/Documentation/admin-guide/security-bugs.rst @@ -26,23 +26,34 @@ information is helpful. Any exploit code is very helpful and will not be released without consent from the reporter unless it has already been made public. -Disclosure ----------- - -The goal of the Linux kernel security team is to work with the bug -submitter to understand and fix the bug. We prefer to publish the fix as -soon as possible, but try to avoid public discussion of the bug itself -and leave that to others. - -Publishing the fix may be delayed when the bug or the fix is not yet -fully understood, the solution is not well-tested or for vendor -coordination. However, we expect these delays to be short, measurable in -days, not weeks or months. A release date is negotiated by the security -team working with the bug submitter as well as vendors. However, the -kernel security team holds the final say when setting a timeframe. The -timeframe varies from immediate (esp. if it's already publicly known bug) -to a few weeks. As a basic default policy, we expect report date to -release date to be on the order of 7 days. +Disclosure and embargoed information +------------------------------------ + +The security list is not a disclosure channel. For that, see Coordination +below. + +Once a robust fix has been developed, our preference is to release the +fix in a timely fashion, treating it no differently than any of the other +thousands of changes and fixes the Linux kernel project releases every +month. + +However, at the request of the reporter, we will postpone releasing the +fix for up to 5 business days after the date of the report or after the +embargo has lifted; whichever comes first. The only exception to that +rule is if the bug is publicly known, in which case the preference is to +release the fix as soon as it's available. + +Whilst embargoed information may be shared with trusted individuals in +order to develop a fix, such information will not be published alongside +the fix or on any other disclosure channel without the permission of the +reporter. This includes but is not limited to the original bug report +and followup discussions (if any), exploits, CVE information or the +identity of the reporter. + +In other words our only interest is in getting bugs fixed. All other +information submitted to the security list and any followup discussions +of the report are treated confidentially even after the embargo has been +lifted, in perpetuity. Coordination ------------ @@ -68,7 +79,7 @@ may delay the bug handling. If a reporter wishes to have a CVE identifier assigned ahead of public disclosure, they will need to contact the private linux-distros list, described above. When such a CVE identifier is known before a patch is provided, it is desirable to mention it in the commit -message, though. +message if the reporter agrees. Non-disclosure agreements ------------------------- diff --git a/Documentation/arm/00-INDEX b/Documentation/arm/00-INDEX deleted file mode 100644 index b6e69fd371c4..000000000000 --- a/Documentation/arm/00-INDEX +++ /dev/null @@ -1,50 +0,0 @@ -00-INDEX - - this file -Booting - - requirements for booting -CCN.txt - - Cache Coherent Network ring-bus and perf PMU driver. -Interrupts - - ARM Interrupt subsystem documentation -IXP4xx - - Intel IXP4xx Network processor. -Netwinder - - Netwinder specific documentation -Porting - - Symbol definitions for porting Linux to a new ARM machine. -Setup - - Kernel initialization parameters on ARM Linux -README - - General ARM documentation -SA1100/ - - SA1100 documentation -Samsung-S3C24XX/ - - S3C24XX ARM Linux Overview -SPEAr/ - - ST SPEAr platform Linux Overview -VFP/ - - Release notes for Linux Kernel Vector Floating Point support code -cluster-pm-race-avoidance.txt - - Algorithm for CPU and Cluster setup/teardown -empeg/ - - Ltd's Empeg MP3 Car Audio Player -firmware.txt - - Secure firmware registration and calling. -kernel_mode_neon.txt - - How to use NEON instructions in kernel mode -kernel_user_helpers.txt - - Helper functions in kernel space made available for userspace. -mem_alignment - - alignment abort handler documentation -memory.txt - - description of the virtual memory layout -nwfpe/ - - NWFPE floating point emulator documentation -swp_emulation - - SWP/SWPB emulation handler/logging description -tcm.txt - - ARM Tightly Coupled Memory -uefi.txt - - [U]EFI configuration and runtime services documentation -vlocks.txt - - Voting locks, low-level mechanism relying on memory system atomic writes. diff --git a/Documentation/arm/Samsung/Bootloader-interface.txt b/Documentation/arm/Samsung/Bootloader-interface.txt index ed494ac0beb2..d17ed518a7ea 100644 --- a/Documentation/arm/Samsung/Bootloader-interface.txt +++ b/Documentation/arm/Samsung/Bootloader-interface.txt @@ -26,6 +26,7 @@ Offset Value Purpose 0x20 0xfcba0d10 (Magic cookie) AFTR 0x24 exynos_cpu_resume_ns AFTR 0x28 + 4*cpu 0x8 (Magic cookie, Exynos3250) AFTR +0x28 0x0 or last value during resume (Exynos542x) System suspend 2. Secure mode diff --git a/Documentation/arm64/elf_hwcaps.txt b/Documentation/arm64/elf_hwcaps.txt index d6aff2c5e9e2..ea819ae024dd 100644 --- a/Documentation/arm64/elf_hwcaps.txt +++ b/Documentation/arm64/elf_hwcaps.txt @@ -78,11 +78,11 @@ HWCAP_EVTSTRM HWCAP_AES - Functionality implied by ID_AA64ISAR1_EL1.AES == 0b0001. + Functionality implied by ID_AA64ISAR0_EL1.AES == 0b0001. HWCAP_PMULL - Functionality implied by ID_AA64ISAR1_EL1.AES == 0b0010. + Functionality implied by ID_AA64ISAR0_EL1.AES == 0b0010. HWCAP_SHA1 @@ -153,7 +153,7 @@ HWCAP_ASIMDDP HWCAP_SHA512 - Functionality implied by ID_AA64ISAR0_EL1.SHA2 == 0b0002. + Functionality implied by ID_AA64ISAR0_EL1.SHA2 == 0b0010. HWCAP_SVE @@ -173,8 +173,12 @@ HWCAP_USCAT HWCAP_ILRCPC - Functionality implied by ID_AA64ISR1_EL1.LRCPC == 0b0002. + Functionality implied by ID_AA64ISAR1_EL1.LRCPC == 0b0010. HWCAP_FLAGM Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0001. + +HWCAP_SSBS + + Functionality implied by ID_AA64PFR1_EL1.SSBS == 0b0010. diff --git a/Documentation/arm64/hugetlbpage.txt b/Documentation/arm64/hugetlbpage.txt new file mode 100644 index 000000000000..cfae87dc653b --- /dev/null +++ b/Documentation/arm64/hugetlbpage.txt @@ -0,0 +1,38 @@ +HugeTLBpage on ARM64 +==================== + +Hugepage relies on making efficient use of TLBs to improve performance of +address translations. The benefit depends on both - + + - the size of hugepages + - size of entries supported by the TLBs + +The ARM64 port supports two flavours of hugepages. + +1) Block mappings at the pud/pmd level +-------------------------------------- + +These are regular hugepages where a pmd or a pud page table entry points to a +block of memory. Regardless of the supported size of entries in TLB, block +mappings reduce the depth of page table walk needed to translate hugepage +addresses. + +2) Using the Contiguous bit +--------------------------- + +The architecture provides a contiguous bit in the translation table entries +(D4.5.3, ARM DDI 0487C.a) that hints to the MMU to indicate that it is one of a +contiguous set of entries that can be cached in a single TLB entry. + +The contiguous bit is used in Linux to increase the mapping size at the pmd and +pte (last) level. The number of supported contiguous entries varies by page size +and level of the page table. + + +The following hugepage sizes are supported - + + CONT PTE PMD CONT PMD PUD + -------- --- -------- --- + 4K: 64K 2M 32M 1G + 16K: 2M 32M 1G + 64K: 2M 512M 16G diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt index 3b2f2dd82225..76ccded8b74c 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.txt @@ -56,6 +56,7 @@ stable kernels. | ARM | Cortex-A72 | #853709 | N/A | | ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 | | ARM | Cortex-A55 | #1024718 | ARM64_ERRATUM_1024718 | +| ARM | Cortex-A76 | #1188873 | ARM64_ERRATUM_1188873 | | ARM | MMU-500 | #841119,#826419 | N/A | | | | | | | Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 | diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX deleted file mode 100644 index 8d55b4bbb5e2..000000000000 --- a/Documentation/block/00-INDEX +++ /dev/null @@ -1,34 +0,0 @@ -00-INDEX - - This file -bfq-iosched.txt - - BFQ IO scheduler and its tunables -biodoc.txt - - Notes on the Generic Block Layer Rewrite in Linux 2.5 -biovecs.txt - - Immutable biovecs and biovec iterators -capability.txt - - Generic Block Device Capability (/sys/block//capability) -cfq-iosched.txt - - CFQ IO scheduler tunables -cmdline-partition.txt - - how to specify block device partitions on kernel command line -data-integrity.txt - - Block data integrity -deadline-iosched.txt - - Deadline IO scheduler tunables -ioprio.txt - - Block io priorities (in CFQ scheduler) -pr.txt - - Block layer support for Persistent Reservations -null_blk.txt - - Null block for block-layer benchmarking. -queue-sysfs.txt - - Queue's sysfs entries -request.txt - - The members of struct request (in include/linux/blkdev.h) -stat.txt - - Block layer statistics in /sys/block//stat -switching-sched.txt - - Switching I/O schedulers at runtime -writeback_cache_control.txt - - Control of volatile write back caches diff --git a/Documentation/blockdev/00-INDEX b/Documentation/blockdev/00-INDEX deleted file mode 100644 index c08df56dd91b..000000000000 --- a/Documentation/blockdev/00-INDEX +++ /dev/null @@ -1,18 +0,0 @@ -00-INDEX - - this file -README.DAC960 - - info on Mylex DAC960/DAC1100 PCI RAID Controller Driver for Linux. -cciss.txt - - info, major/minor #'s for Compaq's SMART Array Controllers. -cpqarray.txt - - info on using Compaq's SMART2 Intelligent Disk Array Controllers. -floppy.txt - - notes and driver options for the floppy disk driver. -mflash.txt - - info on mGine m(g)flash driver for linux. -nbd.txt - - info on a TCP implementation of a network block device. -paride.txt - - information about the parallel port IDE subsystem. -ramdisk.txt - - short guide on how to set up and use the RAM disk. diff --git a/Documentation/blockdev/README.DAC960 b/Documentation/blockdev/README.DAC960 deleted file mode 100644 index bd85fb9dc6e5..000000000000 --- a/Documentation/blockdev/README.DAC960 +++ /dev/null @@ -1,756 +0,0 @@ - Linux Driver for Mylex DAC960/AcceleRAID/eXtremeRAID PCI RAID Controllers - - Version 2.2.11 for Linux 2.2.19 - Version 2.4.11 for Linux 2.4.12 - - PRODUCTION RELEASE - - 11 October 2001 - - Leonard N. Zubkoff - Dandelion Digital - lnz@dandelion.com - - Copyright 1998-2001 by Leonard N. Zubkoff - - - INTRODUCTION - -Mylex, Inc. designs and manufactures a variety of high performance PCI RAID -controllers. Mylex Corporation is located at 34551 Ardenwood Blvd., Fremont, -California 94555, USA and can be reached at 510.796.6100 or on the World Wide -Web at http://www.mylex.com. Mylex Technical Support can be reached by -electronic mail at mylexsup@us.ibm.com, by voice at 510.608.2400, or by FAX at -510.745.7715. Contact information for offices in Europe and Japan is available -on their Web site. - -The latest information on Linux support for DAC960 PCI RAID Controllers, as -well as the most recent release of this driver, will always be available from -my Linux Home Page at URL "http://www.dandelion.com/Linux/". The Linux DAC960 -driver supports all current Mylex PCI RAID controllers including the new -eXtremeRAID 2000/3000 and AcceleRAID 352/170/160 models which have an entirely -new firmware interface from the older eXtremeRAID 1100, AcceleRAID 150/200/250, -and DAC960PJ/PG/PU/PD/PL. See below for a complete controller list as well as -minimum firmware version requirements. For simplicity, in most places this -documentation refers to DAC960 generically rather than explicitly listing all -the supported models. - -Driver bug reports should be sent via electronic mail to "lnz@dandelion.com". -Please include with the bug report the complete configuration messages reported -by the driver at startup, along with any subsequent system messages relevant to -the controller's operation, and a detailed description of your system's -hardware configuration. Driver bugs are actually quite rare; if you encounter -problems with disks being marked offline, for example, please contact Mylex -Technical Support as the problem is related to the hardware configuration -rather than the Linux driver. - -Please consult the RAID controller documentation for detailed information -regarding installation and configuration of the controllers. This document -primarily provides information specific to the Linux support. - - - DRIVER FEATURES - -The DAC960 RAID controllers are supported solely as high performance RAID -controllers, not as interfaces to arbitrary SCSI devices. The Linux DAC960 -driver operates at the block device level, the same level as the SCSI and IDE -drivers. Unlike other RAID controllers currently supported on Linux, the -DAC960 driver is not dependent on the SCSI subsystem, and hence avoids all the -complexity and unnecessary code that would be associated with an implementation -as a SCSI driver. The DAC960 driver is designed for as high a performance as -possible with no compromises or extra code for compatibility with lower -performance devices. The DAC960 driver includes extensive error logging and -online configuration management capabilities. Except for initial configuration -of the controller and adding new disk drives, most everything can be handled -from Linux while the system is operational. - -The DAC960 driver is architected to support up to 8 controllers per system. -Each DAC960 parallel SCSI controller can support up to 15 disk drives per -channel, for a maximum of 60 drives on a four channel controller; the fibre -channel eXtremeRAID 3000 controller supports up to 125 disk drives per loop for -a total of 250 drives. The drives installed on a controller are divided into -one or more "Drive Groups", and then each Drive Group is subdivided further -into 1 to 32 "Logical Drives". Each Logical Drive has a specific RAID Level -and caching policy associated with it, and it appears to Linux as a single -block device. Logical Drives are further subdivided into up to 7 partitions -through the normal Linux and PC disk partitioning schemes. Logical Drives are -also known as "System Drives", and Drive Groups are also called "Packs". Both -terms are in use in the Mylex documentation; I have chosen to standardize on -the more generic "Logical Drive" and "Drive Group". - -DAC960 RAID disk devices are named in the style of the obsolete Device File -System (DEVFS). The device corresponding to Logical Drive D on Controller C -is referred to as /dev/rd/cCdD, and the partitions are called /dev/rd/cCdDp1 -through /dev/rd/cCdDp7. For example, partition 3 of Logical Drive 5 on -Controller 2 is referred to as /dev/rd/c2d5p3. Note that unlike with SCSI -disks the device names will not change in the event of a disk drive failure. -The DAC960 driver is assigned major numbers 48 - 55 with one major number per -controller. The 8 bits of minor number are divided into 5 bits for the Logical -Drive and 3 bits for the partition. - - - SUPPORTED DAC960/AcceleRAID/eXtremeRAID PCI RAID CONTROLLERS - -The following list comprises the supported DAC960, AcceleRAID, and eXtremeRAID -PCI RAID Controllers as of the date of this document. It is recommended that -anyone purchasing a Mylex PCI RAID Controller not in the following table -contact the author beforehand to verify that it is or will be supported. - -eXtremeRAID 3000 - 1 Wide Ultra-2/LVD SCSI channel - 2 External Fibre FC-AL channels - 233MHz StrongARM SA 110 Processor - 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots) - 32MB/64MB ECC SDRAM Memory - -eXtremeRAID 2000 - 4 Wide Ultra-160 LVD SCSI channels - 233MHz StrongARM SA 110 Processor - 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots) - 32MB/64MB ECC SDRAM Memory - -AcceleRAID 352 - 2 Wide Ultra-160 LVD SCSI channels - 100MHz Intel i960RN RISC Processor - 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots) - 32MB/64MB ECC SDRAM Memory - -AcceleRAID 170 - 1 Wide Ultra-160 LVD SCSI channel - 100MHz Intel i960RM RISC Processor - 16MB/32MB/64MB ECC SDRAM Memory - -AcceleRAID 160 (AcceleRAID 170LP) - 1 Wide Ultra-160 LVD SCSI channel - 100MHz Intel i960RS RISC Processor - Built in 16M ECC SDRAM Memory - PCI Low Profile Form Factor - fit for 2U height - -eXtremeRAID 1100 (DAC1164P) - 3 Wide Ultra-2/LVD SCSI channels - 233MHz StrongARM SA 110 Processor - 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots) - 16MB/32MB/64MB Parity SDRAM Memory with Battery Backup - -AcceleRAID 250 (DAC960PTL1) - Uses onboard Symbios SCSI chips on certain motherboards - Also includes one onboard Wide Ultra-2/LVD SCSI Channel - 66MHz Intel i960RD RISC Processor - 4MB/8MB/16MB/32MB/64MB/128MB ECC EDO Memory - -AcceleRAID 200 (DAC960PTL0) - Uses onboard Symbios SCSI chips on certain motherboards - Includes no onboard SCSI Channels - 66MHz Intel i960RD RISC Processor - 4MB/8MB/16MB/32MB/64MB/128MB ECC EDO Memory - -AcceleRAID 150 (DAC960PRL) - Uses onboard Symbios SCSI chips on certain motherboards - Also includes one onboard Wide Ultra-2/LVD SCSI Channel - 33MHz Intel i960RP RISC Processor - 4MB Parity EDO Memory - -DAC960PJ 1/2/3 Wide Ultra SCSI-3 Channels - 66MHz Intel i960RD RISC Processor - 4MB/8MB/16MB/32MB/64MB/128MB ECC EDO Memory - -DAC960PG 1/2/3 Wide Ultra SCSI-3 Channels - 33MHz Intel i960RP RISC Processor - 4MB/8MB ECC EDO Memory - -DAC960PU 1/2/3 Wide Ultra SCSI-3 Channels - Intel i960CF RISC Processor - 4MB/8MB EDRAM or 2MB/4MB/8MB/16MB/32MB DRAM Memory - -DAC960PD 1/2/3 Wide Fast SCSI-2 Channels - Intel i960CF RISC Processor - 4MB/8MB EDRAM or 2MB/4MB/8MB/16MB/32MB DRAM Memory - -DAC960PL 1/2/3 Wide Fast SCSI-2 Channels - Intel i960 RISC Processor - 2MB/4MB/8MB/16MB/32MB DRAM Memory - -DAC960P 1/2/3 Wide Fast SCSI-2 Channels - Intel i960 RISC Processor - 2MB/4MB/8MB/16MB/32MB DRAM Memory - -For the eXtremeRAID 2000/3000 and AcceleRAID 352/170/160, firmware version -6.00-01 or above is required. - -For the eXtremeRAID 1100, firmware version 5.06-0-52 or above is required. - -For the AcceleRAID 250, 200, and 150, firmware version 4.06-0-57 or above is -required. - -For the DAC960PJ and DAC960PG, firmware version 4.06-0-00 or above is required. - -For the DAC960PU, DAC960PD, DAC960PL, and DAC960P, either firmware version -3.51-0-04 or above is required (for dual Flash ROM controllers), or firmware -version 2.73-0-00 or above is required (for single Flash ROM controllers) - -Please note that not all SCSI disk drives are suitable for use with DAC960 -controllers, and only particular firmware versions of any given model may -actually function correctly. Similarly, not all motherboards have a BIOS that -properly initializes the AcceleRAID 250, AcceleRAID 200, AcceleRAID 150, -DAC960PJ, and DAC960PG because the Intel i960RD/RP is a multi-function device. -If in doubt, contact Mylex RAID Technical Support (mylexsup@us.ibm.com) to -verify compatibility. Mylex makes available a hard disk compatibility list at -http://www.mylex.com/support/hdcomp/hd-lists.html. - - - DRIVER INSTALLATION - -This distribution was prepared for Linux kernel version 2.2.19 or 2.4.12. - -To install the DAC960 RAID driver, you may use the following commands, -replacing "/usr/src" with wherever you keep your Linux kernel source tree: - - cd /usr/src - tar -xvzf DAC960-2.2.11.tar.gz (or DAC960-2.4.11.tar.gz) - mv README.DAC960 linux/Documentation - mv DAC960.[ch] linux/drivers/block - patch -p0 < DAC960.patch (if DAC960.patch is included) - cd linux - make config - make bzImage (or zImage) - -Then install "arch/x86/boot/bzImage" or "arch/x86/boot/zImage" as your -standard kernel, run lilo if appropriate, and reboot. - -To create the necessary devices in /dev, the "make_rd" script included in -"DAC960-Utilities.tar.gz" from http://www.dandelion.com/Linux/ may be used. -LILO 21 and FDISK v2.9 include DAC960 support; also included in this archive -are patches to LILO 20 and FDISK v2.8 that add DAC960 support, along with -statically linked executables of LILO and FDISK. This modified version of LILO -will allow booting from a DAC960 controller and/or mounting the root file -system from a DAC960. - -Red Hat Linux 6.0 and SuSE Linux 6.1 include support for Mylex PCI RAID -controllers. Installing directly onto a DAC960 may be problematic from other -Linux distributions until their installation utilities are updated. - - - INSTALLATION NOTES - -Before installing Linux or adding DAC960 logical drives to an existing Linux -system, the controller must first be configured to provide one or more logical -drives using the BIOS Configuration Utility or DACCF. Please note that since -there are only at most 6 usable partitions on each logical drive, systems -requiring more partitions should subdivide a drive group into multiple logical -drives, each of which can have up to 6 usable partitions. Also, note that with -large disk arrays it is advisable to enable the 8GB BIOS Geometry (255/63) -rather than accepting the default 2GB BIOS Geometry (128/32); failing to so do -will cause the logical drive geometry to have more than 65535 cylinders which -will make it impossible for FDISK to be used properly. The 8GB BIOS Geometry -can be enabled by configuring the DAC960 BIOS, which is accessible via Alt-M -during the BIOS initialization sequence. - -For maximum performance and the most efficient E2FSCK performance, it is -recommended that EXT2 file systems be built with a 4KB block size and 16 block -stride to match the DAC960 controller's 64KB default stripe size. The command -"mke2fs -b 4096 -R stride=16 " is appropriate. Unless there will be a -large number of small files on the file systems, it is also beneficial to add -the "-i 16384" option to increase the bytes per inode parameter thereby -reducing the file system metadata. Finally, on systems that will only be run -with Linux 2.2 or later kernels it is beneficial to enable sparse superblocks -with the "-s 1" option. - - - DAC960 ANNOUNCEMENTS MAILING LIST - -The DAC960 Announcements Mailing List provides a forum for informing Linux -users of new driver releases and other announcements regarding Linux support -for DAC960 PCI RAID Controllers. To join the mailing list, send a message to -"dac960-announce-request@dandelion.com" with the line "subscribe" in the -message body. - - - CONTROLLER CONFIGURATION AND STATUS MONITORING - -The DAC960 RAID controllers running firmware 4.06 or above include a Background -Initialization facility so that system downtime is minimized both for initial -installation and subsequent configuration of additional storage. The BIOS -Configuration Utility (accessible via Alt-R during the BIOS initialization -sequence) is used to quickly configure the controller, and then the logical -drives that have been created are available for immediate use even while they -are still being initialized by the controller. The primary need for online -configuration and status monitoring is then to avoid system downtime when disk -drives fail and must be replaced. Mylex's online monitoring and configuration -utilities are being ported to Linux and will become available at some point in -the future. Note that with a SAF-TE (SCSI Accessed Fault-Tolerant Enclosure) -enclosure, the controller is able to rebuild failed drives automatically as -soon as a drive replacement is made available. - -The primary interfaces for controller configuration and status monitoring are -special files created in the /proc/rd/... hierarchy along with the normal -system console logging mechanism. Whenever the system is operating, the DAC960 -driver queries each controller for status information every 10 seconds, and -checks for additional conditions every 60 seconds. The initial status of each -controller is always available for controller N in /proc/rd/cN/initial_status, -and the current status as of the last status monitoring query is available in -/proc/rd/cN/current_status. In addition, status changes are also logged by the -driver to the system console and will appear in the log files maintained by -syslog. The progress of asynchronous rebuild or consistency check operations -is also available in /proc/rd/cN/current_status, and progress messages are -logged to the system console at most every 60 seconds. - -Starting with the 2.2.3/2.0.3 versions of the driver, the status information -available in /proc/rd/cN/initial_status and /proc/rd/cN/current_status has been -augmented to include the vendor, model, revision, and serial number (if -available) for each physical device found connected to the controller: - -***** DAC960 RAID Driver Version 2.2.3 of 19 August 1999 ***** -Copyright 1998-1999 by Leonard N. Zubkoff -Configuring Mylex DAC960PRL PCI RAID Controller - Firmware Version: 4.07-0-07, Channels: 1, Memory Size: 16MB - PCI Bus: 1, Device: 4, Function: 1, I/O Address: Unassigned - PCI Address: 0xFE300000 mapped at 0xA0800000, IRQ Channel: 21 - Controller Queue Depth: 128, Maximum Blocks per Command: 128 - Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33 - Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63 - SAF-TE Enclosure Management Enabled - Physical Devices: - 0:0 Vendor: IBM Model: DRVS09D Revision: 0270 - Serial Number: 68016775HA - Disk Status: Online, 17928192 blocks - 0:1 Vendor: IBM Model: DRVS09D Revision: 0270 - Serial Number: 68004E53HA - Disk Status: Online, 17928192 blocks - 0:2 Vendor: IBM Model: DRVS09D Revision: 0270 - Serial Number: 13013935HA - Disk Status: Online, 17928192 blocks - 0:3 Vendor: IBM Model: DRVS09D Revision: 0270 - Serial Number: 13016897HA - Disk Status: Online, 17928192 blocks - 0:4 Vendor: IBM Model: DRVS09D Revision: 0270 - Serial Number: 68019905HA - Disk Status: Online, 17928192 blocks - 0:5 Vendor: IBM Model: DRVS09D Revision: 0270 - Serial Number: 68012753HA - Disk Status: Online, 17928192 blocks - 0:6 Vendor: ESG-SHV Model: SCA HSBP M6 Revision: 0.61 - Logical Drives: - /dev/rd/c0d0: RAID-5, Online, 89640960 blocks, Write Thru - No Rebuild or Consistency Check in Progress - -To simplify the monitoring process for custom software, the special file -/proc/rd/status returns "OK" when all DAC960 controllers in the system are -operating normally and no failures have occurred, or "ALERT" if any logical -drives are offline or critical or any non-standby physical drives are dead. - -Configuration commands for controller N are available via the special file -/proc/rd/cN/user_command. A human readable command can be written to this -special file to initiate a configuration operation, and the results of the -operation can then be read back from the special file in addition to being -logged to the system console. The shell command sequence - - echo "" > /proc/rd/c0/user_command - cat /proc/rd/c0/user_command - -is typically used to execute configuration commands. The configuration -commands are: - - flush-cache - - The "flush-cache" command flushes the controller's cache. The system - automatically flushes the cache at shutdown or if the driver module is - unloaded, so this command is only needed to be certain a write back cache - is flushed to disk before the system is powered off by a command to a UPS. - Note that the flush-cache command also stops an asynchronous rebuild or - consistency check, so it should not be used except when the system is being - halted. - - kill : - - The "kill" command marks the physical drive : as DEAD. - This command is provided primarily for testing, and should not be used - during normal system operation. - - make-online : - - The "make-online" command changes the physical drive : - from status DEAD to status ONLINE. In cases where multiple physical drives - have been killed simultaneously, this command may be used to bring all but - one of them back online, after which a rebuild to the final drive is - necessary. - - Warning: make-online should only be used on a dead physical drive that is - an active part of a drive group, never on a standby drive. The command - should never be used on a dead drive that is part of a critical logical - drive; rebuild should be used if only a single drive is dead. - - make-standby : - - The "make-standby" command changes physical drive : - from status DEAD to status STANDBY. It should only be used in cases where - a dead drive was replaced after an automatic rebuild was performed onto a - standby drive. It cannot be used to add a standby drive to the controller - configuration if one was not created initially; the BIOS Configuration - Utility must be used for that currently. - - rebuild : - - The "rebuild" command initiates an asynchronous rebuild onto physical drive - :. It should only be used when a dead drive has been - replaced. - - check-consistency - - The "check-consistency" command initiates an asynchronous consistency check - of with automatic restoration. It can be used - whenever it is desired to verify the consistency of the redundancy - information. - - cancel-rebuild - cancel-consistency-check - - The "cancel-rebuild" and "cancel-consistency-check" commands cancel any - rebuild or consistency check operations previously initiated. - - - EXAMPLE I - DRIVE FAILURE WITHOUT A STANDBY DRIVE - -The following annotated logs demonstrate the controller configuration and and -online status monitoring capabilities of the Linux DAC960 Driver. The test -configuration comprises 6 1GB Quantum Atlas I disk drives on two channels of a -DAC960PJ controller. The physical drives are configured into a single drive -group without a standby drive, and the drive group has been configured into two -logical drives, one RAID-5 and one RAID-6. Note that these logs are from an -earlier version of the driver and the messages have changed somewhat with newer -releases, but the functionality remains similar. First, here is the current -status of the RAID configuration: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status -***** DAC960 RAID Driver Version 2.0.0 of 23 March 1999 ***** -Copyright 1998-1999 by Leonard N. Zubkoff -Configuring Mylex DAC960PJ PCI RAID Controller - Firmware Version: 4.06-0-08, Channels: 3, Memory Size: 8MB - PCI Bus: 0, Device: 19, Function: 1, I/O Address: Unassigned - PCI Address: 0xFD4FC000 mapped at 0x8807000, IRQ Channel: 9 - Controller Queue Depth: 128, Maximum Blocks per Command: 128 - Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33 - Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63 - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Online, 2201600 blocks - 1:2 - Disk: Online, 2201600 blocks - 1:3 - Disk: Online, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Online, 5498880 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Online, 3305472 blocks, Write Thru - No Rebuild or Consistency Check in Progress - -gwynedd:/u/lnz# cat /proc/rd/status -OK - -The above messages indicate that everything is healthy, and /proc/rd/status -returns "OK" indicating that there are no problems with any DAC960 controller -in the system. For demonstration purposes, while I/O is active Physical Drive -1:1 is now disconnected, simulating a drive failure. The failure is noted by -the driver within 10 seconds of the controller's having detected it, and the -driver logs the following console status messages indicating that Logical -Drives 0 and 1 are now CRITICAL as a result of Physical Drive 1:1 being DEAD: - -DAC960#0: Physical Drive 1:2 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02 -DAC960#0: Physical Drive 1:3 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02 -DAC960#0: Physical Drive 1:1 killed because of timeout on SCSI command -DAC960#0: Physical Drive 1:1 is now DEAD -DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now CRITICAL -DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now CRITICAL - -The Sense Keys logged here are just Check Condition / Unit Attention conditions -arising from a SCSI bus reset that is forced by the controller during its error -recovery procedures. Concurrently with the above, the driver status available -from /proc/rd also reflects the drive failure. The status message in -/proc/rd/status has changed from "OK" to "ALERT": - -gwynedd:/u/lnz# cat /proc/rd/status -ALERT - -and /proc/rd/c0/current_status has been updated: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status - ... - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Dead, 2201600 blocks - 1:2 - Disk: Online, 2201600 blocks - 1:3 - Disk: Online, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Critical, 5498880 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Critical, 3305472 blocks, Write Thru - No Rebuild or Consistency Check in Progress - -Since there are no standby drives configured, the system can continue to access -the logical drives in a performance degraded mode until the failed drive is -replaced and a rebuild operation completed to restore the redundancy of the -logical drives. Once Physical Drive 1:1 is replaced with a properly -functioning drive, or if the physical drive was killed without having failed -(e.g., due to electrical problems on the SCSI bus), the user can instruct the -controller to initiate a rebuild operation onto the newly replaced drive: - -gwynedd:/u/lnz# echo "rebuild 1:1" > /proc/rd/c0/user_command -gwynedd:/u/lnz# cat /proc/rd/c0/user_command -Rebuild of Physical Drive 1:1 Initiated - -The echo command instructs the controller to initiate an asynchronous rebuild -operation onto Physical Drive 1:1, and the status message that results from the -operation is then available for reading from /proc/rd/c0/user_command, as well -as being logged to the console by the driver. - -Within 10 seconds of this command the driver logs the initiation of the -asynchronous rebuild operation: - -DAC960#0: Rebuild of Physical Drive 1:1 Initiated -DAC960#0: Physical Drive 1:1 Error Log: Sense Key = 6, ASC = 29, ASCQ = 01 -DAC960#0: Physical Drive 1:1 is now WRITE-ONLY -DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 1% completed - -and /proc/rd/c0/current_status is updated: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status - ... - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Write-Only, 2201600 blocks - 1:2 - Disk: Online, 2201600 blocks - 1:3 - Disk: Online, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Critical, 5498880 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Critical, 3305472 blocks, Write Thru - Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 6% completed - -As the rebuild progresses, the current status in /proc/rd/c0/current_status is -updated every 10 seconds: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status - ... - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Write-Only, 2201600 blocks - 1:2 - Disk: Online, 2201600 blocks - 1:3 - Disk: Online, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Critical, 5498880 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Critical, 3305472 blocks, Write Thru - Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 15% completed - -and every minute a progress message is logged to the console by the driver: - -DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 32% completed -DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 63% completed -DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 94% completed -DAC960#0: Rebuild in Progress: Logical Drive 1 (/dev/rd/c0d1) 94% completed - -Finally, the rebuild completes successfully. The driver logs the status of the -logical and physical drives and the rebuild completion: - -DAC960#0: Rebuild Completed Successfully -DAC960#0: Physical Drive 1:1 is now ONLINE -DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now ONLINE -DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now ONLINE - -/proc/rd/c0/current_status is updated: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status - ... - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Online, 2201600 blocks - 1:2 - Disk: Online, 2201600 blocks - 1:3 - Disk: Online, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Online, 5498880 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Online, 3305472 blocks, Write Thru - Rebuild Completed Successfully - -and /proc/rd/status indicates that everything is healthy once again: - -gwynedd:/u/lnz# cat /proc/rd/status -OK - - - EXAMPLE II - DRIVE FAILURE WITH A STANDBY DRIVE - -The following annotated logs demonstrate the controller configuration and and -online status monitoring capabilities of the Linux DAC960 Driver. The test -configuration comprises 6 1GB Quantum Atlas I disk drives on two channels of a -DAC960PJ controller. The physical drives are configured into a single drive -group with a standby drive, and the drive group has been configured into two -logical drives, one RAID-5 and one RAID-6. Note that these logs are from an -earlier version of the driver and the messages have changed somewhat with newer -releases, but the functionality remains similar. First, here is the current -status of the RAID configuration: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status -***** DAC960 RAID Driver Version 2.0.0 of 23 March 1999 ***** -Copyright 1998-1999 by Leonard N. Zubkoff -Configuring Mylex DAC960PJ PCI RAID Controller - Firmware Version: 4.06-0-08, Channels: 3, Memory Size: 8MB - PCI Bus: 0, Device: 19, Function: 1, I/O Address: Unassigned - PCI Address: 0xFD4FC000 mapped at 0x8807000, IRQ Channel: 9 - Controller Queue Depth: 128, Maximum Blocks per Command: 128 - Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33 - Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63 - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Online, 2201600 blocks - 1:2 - Disk: Online, 2201600 blocks - 1:3 - Disk: Standby, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Online, 4399104 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Online, 2754560 blocks, Write Thru - No Rebuild or Consistency Check in Progress - -gwynedd:/u/lnz# cat /proc/rd/status -OK - -The above messages indicate that everything is healthy, and /proc/rd/status -returns "OK" indicating that there are no problems with any DAC960 controller -in the system. For demonstration purposes, while I/O is active Physical Drive -1:2 is now disconnected, simulating a drive failure. The failure is noted by -the driver within 10 seconds of the controller's having detected it, and the -driver logs the following console status messages: - -DAC960#0: Physical Drive 1:1 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02 -DAC960#0: Physical Drive 1:3 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02 -DAC960#0: Physical Drive 1:2 killed because of timeout on SCSI command -DAC960#0: Physical Drive 1:2 is now DEAD -DAC960#0: Physical Drive 1:2 killed because it was removed -DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now CRITICAL -DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now CRITICAL - -Since a standby drive is configured, the controller automatically begins -rebuilding onto the standby drive: - -DAC960#0: Physical Drive 1:3 is now WRITE-ONLY -DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 4% completed - -Concurrently with the above, the driver status available from /proc/rd also -reflects the drive failure and automatic rebuild. The status message in -/proc/rd/status has changed from "OK" to "ALERT": - -gwynedd:/u/lnz# cat /proc/rd/status -ALERT - -and /proc/rd/c0/current_status has been updated: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status - ... - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Online, 2201600 blocks - 1:2 - Disk: Dead, 2201600 blocks - 1:3 - Disk: Write-Only, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Critical, 4399104 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Critical, 2754560 blocks, Write Thru - Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 4% completed - -As the rebuild progresses, the current status in /proc/rd/c0/current_status is -updated every 10 seconds: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status - ... - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Online, 2201600 blocks - 1:2 - Disk: Dead, 2201600 blocks - 1:3 - Disk: Write-Only, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Critical, 4399104 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Critical, 2754560 blocks, Write Thru - Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 40% completed - -and every minute a progress message is logged on the console by the driver: - -DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 40% completed -DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 76% completed -DAC960#0: Rebuild in Progress: Logical Drive 1 (/dev/rd/c0d1) 66% completed -DAC960#0: Rebuild in Progress: Logical Drive 1 (/dev/rd/c0d1) 84% completed - -Finally, the rebuild completes successfully. The driver logs the status of the -logical and physical drives and the rebuild completion: - -DAC960#0: Rebuild Completed Successfully -DAC960#0: Physical Drive 1:3 is now ONLINE -DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now ONLINE -DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now ONLINE - -/proc/rd/c0/current_status is updated: - -***** DAC960 RAID Driver Version 2.0.0 of 23 March 1999 ***** -Copyright 1998-1999 by Leonard N. Zubkoff -Configuring Mylex DAC960PJ PCI RAID Controller - Firmware Version: 4.06-0-08, Channels: 3, Memory Size: 8MB - PCI Bus: 0, Device: 19, Function: 1, I/O Address: Unassigned - PCI Address: 0xFD4FC000 mapped at 0x8807000, IRQ Channel: 9 - Controller Queue Depth: 128, Maximum Blocks per Command: 128 - Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33 - Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63 - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Online, 2201600 blocks - 1:2 - Disk: Dead, 2201600 blocks - 1:3 - Disk: Online, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Online, 4399104 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Online, 2754560 blocks, Write Thru - Rebuild Completed Successfully - -and /proc/rd/status indicates that everything is healthy once again: - -gwynedd:/u/lnz# cat /proc/rd/status -OK - -Note that the absence of a viable standby drive does not create an "ALERT" -status. Once dead Physical Drive 1:2 has been replaced, the controller must be -told that this has occurred and that the newly replaced drive should become the -new standby drive: - -gwynedd:/u/lnz# echo "make-standby 1:2" > /proc/rd/c0/user_command -gwynedd:/u/lnz# cat /proc/rd/c0/user_command -Make Standby of Physical Drive 1:2 Succeeded - -The echo command instructs the controller to make Physical Drive 1:2 into a -standby drive, and the status message that results from the operation is then -available for reading from /proc/rd/c0/user_command, as well as being logged to -the console by the driver. Within 60 seconds of this command the driver logs: - -DAC960#0: Physical Drive 1:2 Error Log: Sense Key = 6, ASC = 29, ASCQ = 01 -DAC960#0: Physical Drive 1:2 is now STANDBY -DAC960#0: Make Standby of Physical Drive 1:2 Succeeded - -and /proc/rd/c0/current_status is updated: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status - ... - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Online, 2201600 blocks - 1:2 - Disk: Standby, 2201600 blocks - 1:3 - Disk: Online, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Online, 4399104 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Online, 2754560 blocks, Write Thru - Rebuild Completed Successfully diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 875b2b56b87f..3c1b5ab54bc0 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -190,7 +190,7 @@ whitespace: notify_free Depending on device usage scenario it may account a) the number of pages freed because of swap slot free notifications or b) the number of pages freed because of - REQ_DISCARD requests sent by bio. The former ones are + REQ_OP_DISCARD requests sent by bio. The former ones are sent to a swap block device when a swap slot is freed, which implies that this disk is being used as a swap disk. The latter ones are sent by filesystem mounted with diff --git a/Documentation/cdrom/00-INDEX b/Documentation/cdrom/00-INDEX deleted file mode 100644 index 433edf23dc49..000000000000 --- a/Documentation/cdrom/00-INDEX +++ /dev/null @@ -1,11 +0,0 @@ -00-INDEX - - this file (info on CD-ROMs and Linux) -Makefile - - only used to generate TeX output from the documentation. -cdrom-standard.tex - - LaTeX document on standardizing the CD-ROM programming interface. -ide-cd - - info on setting up and using ATAPI (aka IDE) CD-ROMs. -packet-writing.txt - - Info on the CDRW packet writing module - diff --git a/Documentation/cgroup-v1/00-INDEX b/Documentation/cgroup-v1/00-INDEX deleted file mode 100644 index 13e0c85e7b35..000000000000 --- a/Documentation/cgroup-v1/00-INDEX +++ /dev/null @@ -1,26 +0,0 @@ -00-INDEX - - this file -blkio-controller.txt - - Description for Block IO Controller, implementation and usage details. -cgroups.txt - - Control Groups definition, implementation details, examples and API. -cpuacct.txt - - CPU Accounting Controller; account CPU usage for groups of tasks. -cpusets.txt - - documents the cpusets feature; assign CPUs and Mem to a set of tasks. -admin-guide/devices.rst - - Device Whitelist Controller; description, interface and security. -freezer-subsystem.txt - - checkpointing; rationale to not use signals, interface. -hugetlb.txt - - HugeTLB Controller implementation and usage details. -memcg_test.txt - - Memory Resource Controller; implementation details. -memory.txt - - Memory Resource Controller; design, accounting, interface, testing. -net_cls.txt - - Network classifier cgroups details and usages. -net_prio.txt - - Network priority cgroups details and usages. -pids.txt - - Process number cgroups details and usages. diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt index af618171e0eb..9bdb7fd03f83 100644 --- a/Documentation/cgroup-v1/rdma.txt +++ b/Documentation/cgroup-v1/rdma.txt @@ -27,7 +27,7 @@ cgroup. Currently user space applications can easily take away all the rdma verb specific resources such as AH, CQ, QP, MR etc. Due to which other applications in other cgroup or kernel space ULPs may not even get chance to allocate any -rdma resources. This can leads to service unavailability. +rdma resources. This can lead to service unavailability. Therefore RDMA controller is needed through which resource consumption of processes can be limited. Through this controller different rdma diff --git a/Documentation/conf.py b/Documentation/conf.py index ede67ccafc29..72647a38b5c2 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -259,7 +259,7 @@ latex_elements = { 'papersize': 'a4paper', # The font size ('10pt', '11pt' or '12pt'). -'pointsize': '8pt', +'pointsize': '11pt', # Latex figure (float) alignment #'figure_align': 'htbp', @@ -272,8 +272,8 @@ latex_elements = { 'preamble': ''' % Use some font with UTF-8 support with XeLaTeX \\usepackage{fontspec} - \\setsansfont{DejaVu Serif} - \\setromanfont{DejaVu Sans} + \\setsansfont{DejaVu Sans} + \\setromanfont{DejaVu Serif} \\setmonofont{DejaVu Sans Mono} ''' diff --git a/Documentation/core-api/boot-time-mm.rst b/Documentation/core-api/boot-time-mm.rst index 03cb1643f46f..e5ec9f1a563d 100644 --- a/Documentation/core-api/boot-time-mm.rst +++ b/Documentation/core-api/boot-time-mm.rst @@ -5,54 +5,23 @@ Boot time memory management Early system initialization cannot use "normal" memory management simply because it is not set up yet. But there is still need to allocate memory for various data structures, for instance for the -physical page allocator. To address this, a specialized allocator -called the :ref:`Boot Memory Allocator `, or bootmem, was -introduced. Several years later PowerPC developers added a "Logical -Memory Blocks" allocator, which was later adopted by other -architectures and renamed to :ref:`memblock `. There is also -a compatibility layer called `nobootmem` that translates bootmem -allocation interfaces to memblock calls. +physical page allocator. -The selection of the early allocator is done using -``CONFIG_NO_BOOTMEM`` and ``CONFIG_HAVE_MEMBLOCK`` kernel -configuration options. These options are enabled or disabled -statically by the architectures' Kconfig files. - -* Architectures that rely only on bootmem select - ``CONFIG_NO_BOOTMEM=n && CONFIG_HAVE_MEMBLOCK=n``. -* The users of memblock with the nobootmem compatibility layer set - ``CONFIG_NO_BOOTMEM=y && CONFIG_HAVE_MEMBLOCK=y``. -* And for those that use both memblock and bootmem the configuration - includes ``CONFIG_NO_BOOTMEM=n && CONFIG_HAVE_MEMBLOCK=y``. - -Whichever allocator is used, it is the responsibility of the -architecture specific initialization to set it up in -:c:func:`setup_arch` and tear it down in :c:func:`mem_init` functions. +A specialized allocator called ``memblock`` performs the +boot time memory management. The architecture specific initialization +must set it up in :c:func:`setup_arch` and tear it down in +:c:func:`mem_init` functions. Once the early memory management is available it offers a variety of functions and macros for memory allocations. The allocation request may be directed to the first (and probably the only) node or to a particular node in a NUMA system. There are API variants that panic -when an allocation fails and those that don't. And more recent and -advanced memblock even allows controlling its own behaviour. - -.. _bootmem: - -Bootmem -======= - -(mostly stolen from Mel Gorman's "Understanding the Linux Virtual -Memory Manager" `book`_) - -.. _book: https://www.kernel.org/doc/gorman/ - -.. kernel-doc:: mm/bootmem.c - :doc: bootmem overview +when an allocation fails and those that don't. -.. _memblock: +Memblock also offers a variety of APIs that control its own behaviour. -Memblock -======== +Memblock Overview +================= .. kernel-doc:: mm/memblock.c :doc: memblock overview @@ -61,26 +30,6 @@ Memblock Functions and structures ======================== -Common API ----------- - -The functions that are described in this section are available -regardless of what early memory manager is enabled. - -.. kernel-doc:: mm/nobootmem.c - -Bootmem specific API --------------------- - -These interfaces available only with bootmem, i.e when ``CONFIG_NO_BOOTMEM=n`` - -.. kernel-doc:: include/linux/bootmem.h -.. kernel-doc:: mm/bootmem.c - :nodocs: - -Memblock specific API ---------------------- - Here is the description of memblock data structures, functions and macros. Some of them are actually internal, but since they are documented it would be silly to omit them. Besides, reading the @@ -89,4 +38,4 @@ really happens under the hood. .. kernel-doc:: include/linux/memblock.h .. kernel-doc:: mm/memblock.c - :nodocs: + :functions: diff --git a/Documentation/core-api/gfp_mask-from-fs-io.rst b/Documentation/core-api/gfp_mask-from-fs-io.rst index e0df8f416582..e7c32a8de126 100644 --- a/Documentation/core-api/gfp_mask-from-fs-io.rst +++ b/Documentation/core-api/gfp_mask-from-fs-io.rst @@ -1,3 +1,5 @@ +.. _gfp_mask_from_fs_io: + ================================= GFP masks used from FS/IO context ================================= diff --git a/Documentation/core-api/idr.rst b/Documentation/core-api/idr.rst index d351e880a2f6..a2738050c4f0 100644 --- a/Documentation/core-api/idr.rst +++ b/Documentation/core-api/idr.rst @@ -1,4 +1,4 @@ -.. SPDX-License-Identifier: CC-BY-SA-4.0 +.. SPDX-License-Identifier: GPL-2.0+ ============= ID Allocation diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 26b735cefb93..3adee82be311 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -21,16 +21,20 @@ Core utilities local_ops workqueue genericirq + xarray flexible-arrays librs genalloc errseq printk-formats circular-buffers + memory-allocation mm-api gfp_mask-from-fs-io timekeeping boot-time-mm + memory-hotplug + Interfaces for kernel debugging =============================== diff --git a/Documentation/core-api/memory-allocation.rst b/Documentation/core-api/memory-allocation.rst new file mode 100644 index 000000000000..f8bb9aa120c4 --- /dev/null +++ b/Documentation/core-api/memory-allocation.rst @@ -0,0 +1,122 @@ +======================= +Memory Allocation Guide +======================= + +Linux provides a variety of APIs for memory allocation. You can +allocate small chunks using `kmalloc` or `kmem_cache_alloc` families, +large virtually contiguous areas using `vmalloc` and its derivatives, +or you can directly request pages from the page allocator with +`alloc_pages`. It is also possible to use more specialized allocators, +for instance `cma_alloc` or `zs_malloc`. + +Most of the memory allocation APIs use GFP flags to express how that +memory should be allocated. The GFP acronym stands for "get free +pages", the underlying memory allocation function. + +Diversity of the allocation APIs combined with the numerous GFP flags +makes the question "How should I allocate memory?" not that easy to +answer, although very likely you should use + +:: + + kzalloc(, GFP_KERNEL); + +Of course there are cases when other allocation APIs and different GFP +flags must be used. + +Get Free Page flags +=================== + +The GFP flags control the allocators behavior. They tell what memory +zones can be used, how hard the allocator should try to find free +memory, whether the memory can be accessed by the userspace etc. The +:ref:`Documentation/core-api/mm-api.rst ` provides +reference documentation for the GFP flags and their combinations and +here we briefly outline their recommended usage: + + * Most of the time ``GFP_KERNEL`` is what you need. Memory for the + kernel data structures, DMAable memory, inode cache, all these and + many other allocations types can use ``GFP_KERNEL``. Note, that + using ``GFP_KERNEL`` implies ``GFP_RECLAIM``, which means that + direct reclaim may be triggered under memory pressure; the calling + context must be allowed to sleep. + * If the allocation is performed from an atomic context, e.g interrupt + handler, use ``GFP_NOWAIT``. This flag prevents direct reclaim and + IO or filesystem operations. Consequently, under memory pressure + ``GFP_NOWAIT`` allocation is likely to fail. Allocations which + have a reasonable fallback should be using ``GFP_NOWARN``. + * If you think that accessing memory reserves is justified and the kernel + will be stressed unless allocation succeeds, you may use ``GFP_ATOMIC``. + * Untrusted allocations triggered from userspace should be a subject + of kmem accounting and must have ``__GFP_ACCOUNT`` bit set. There + is the handy ``GFP_KERNEL_ACCOUNT`` shortcut for ``GFP_KERNEL`` + allocations that should be accounted. + * Userspace allocations should use either of the ``GFP_USER``, + ``GFP_HIGHUSER`` or ``GFP_HIGHUSER_MOVABLE`` flags. The longer + the flag name the less restrictive it is. + + ``GFP_HIGHUSER_MOVABLE`` does not require that allocated memory + will be directly accessible by the kernel and implies that the + data is movable. + + ``GFP_HIGHUSER`` means that the allocated memory is not movable, + but it is not required to be directly accessible by the kernel. An + example may be a hardware allocation that maps data directly into + userspace but has no addressing limitations. + + ``GFP_USER`` means that the allocated memory is not movable and it + must be directly accessible by the kernel. + +You may notice that quite a few allocations in the existing code +specify ``GFP_NOIO`` or ``GFP_NOFS``. Historically, they were used to +prevent recursion deadlocks caused by direct memory reclaim calling +back into the FS or IO paths and blocking on already held +resources. Since 4.12 the preferred way to address this issue is to +use new scope APIs described in +:ref:`Documentation/core-api/gfp_mask-from-fs-io.rst `. + +Other legacy GFP flags are ``GFP_DMA`` and ``GFP_DMA32``. They are +used to ensure that the allocated memory is accessible by hardware +with limited addressing capabilities. So unless you are writing a +driver for a device with such restrictions, avoid using these flags. +And even with hardware with restrictions it is preferable to use +`dma_alloc*` APIs. + +Selecting memory allocator +========================== + +The most straightforward way to allocate memory is to use a function +from the :c:func:`kmalloc` family. And, to be on the safe size it's +best to use routines that set memory to zero, like +:c:func:`kzalloc`. If you need to allocate memory for an array, there +are :c:func:`kmalloc_array` and :c:func:`kcalloc` helpers. + +The maximal size of a chunk that can be allocated with `kmalloc` is +limited. The actual limit depends on the hardware and the kernel +configuration, but it is a good practice to use `kmalloc` for objects +smaller than page size. + +For large allocations you can use :c:func:`vmalloc` and +:c:func:`vzalloc`, or directly request pages from the page +allocator. The memory allocated by `vmalloc` and related functions is +not physically contiguous. + +If you are not sure whether the allocation size is too large for +`kmalloc`, it is possible to use :c:func:`kvmalloc` and its +derivatives. It will try to allocate memory with `kmalloc` and if the +allocation fails it will be retried with `vmalloc`. There are +restrictions on which GFP flags can be used with `kvmalloc`; please +see :c:func:`kvmalloc_node` reference documentation. Note that +`kvmalloc` may return memory that is not physically contiguous. + +If you need to allocate many identical objects you can use the slab +cache allocator. The cache should be set up with +:c:func:`kmem_cache_create` before it can be used. Afterwards +:c:func:`kmem_cache_alloc` and its convenience wrappers can allocate +memory from that cache. + +When the allocated memory is no longer needed it must be freed. You +can use :c:func:`kvfree` for the memory allocated with `kmalloc`, +`vmalloc` and `kvmalloc`. The slab caches should be freed with +:c:func:`kmem_cache_free`. And don't forget to destroy the cache with +:c:func:`kmem_cache_destroy`. diff --git a/Documentation/core-api/memory-hotplug.rst b/Documentation/core-api/memory-hotplug.rst new file mode 100644 index 000000000000..de7467e48067 --- /dev/null +++ b/Documentation/core-api/memory-hotplug.rst @@ -0,0 +1,125 @@ +.. _memory_hotplug: + +============== +Memory hotplug +============== + +Memory hotplug event notifier +============================= + +Hotplugging events are sent to a notification queue. + +There are six types of notification defined in ``include/linux/memory.h``: + +MEM_GOING_ONLINE + Generated before new memory becomes available in order to be able to + prepare subsystems to handle memory. The page allocator is still unable + to allocate from the new memory. + +MEM_CANCEL_ONLINE + Generated if MEM_GOING_ONLINE fails. + +MEM_ONLINE + Generated when memory has successfully brought online. The callback may + allocate pages from the new memory. + +MEM_GOING_OFFLINE + Generated to begin the process of offlining memory. Allocations are no + longer possible from the memory but some of the memory to be offlined + is still in use. The callback can be used to free memory known to a + subsystem from the indicated memory block. + +MEM_CANCEL_OFFLINE + Generated if MEM_GOING_OFFLINE fails. Memory is available again from + the memory block that we attempted to offline. + +MEM_OFFLINE + Generated after offlining memory is complete. + +A callback routine can be registered by calling:: + + hotplug_memory_notifier(callback_func, priority) + +Callback functions with higher values of priority are called before callback +functions with lower values. + +A callback function must have the following prototype:: + + int callback_func( + struct notifier_block *self, unsigned long action, void *arg); + +The first argument of the callback function (self) is a pointer to the block +of the notifier chain that points to the callback function itself. +The second argument (action) is one of the event types described above. +The third argument (arg) passes a pointer of struct memory_notify:: + + struct memory_notify { + unsigned long start_pfn; + unsigned long nr_pages; + int status_change_nid_normal; + int status_change_nid_high; + int status_change_nid; + } + +- start_pfn is start_pfn of online/offline memory. +- nr_pages is # of pages of online/offline memory. +- status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask + is (will be) set/clear, if this is -1, then nodemask status is not changed. +- status_change_nid_high is set node id when N_HIGH_MEMORY of nodemask + is (will be) set/clear, if this is -1, then nodemask status is not changed. +- status_change_nid is set node id when N_MEMORY of nodemask is (will be) + set/clear. It means a new(memoryless) node gets new memory by online and a + node loses all memory. If this is -1, then nodemask status is not changed. + + If status_changed_nid* >= 0, callback should create/discard structures for the + node if necessary. + +The callback routine shall return one of the values +NOTIFY_DONE, NOTIFY_OK, NOTIFY_BAD, NOTIFY_STOP +defined in ``include/linux/notifier.h`` + +NOTIFY_DONE and NOTIFY_OK have no effect on the further processing. + +NOTIFY_BAD is used as response to the MEM_GOING_ONLINE, MEM_GOING_OFFLINE, +MEM_ONLINE, or MEM_OFFLINE action to cancel hotplugging. It stops +further processing of the notification queue. + +NOTIFY_STOP stops further processing of the notification queue. + +Locking Internals +================= + +When adding/removing memory that uses memory block devices (i.e. ordinary RAM), +the device_hotplug_lock should be held to: + +- synchronize against online/offline requests (e.g. via sysfs). This way, memory + block devices can only be accessed (.online/.state attributes) by user + space once memory has been fully added. And when removing memory, we + know nobody is in critical sections. +- synchronize against CPU hotplug and similar (e.g. relevant for ACPI and PPC) + +Especially, there is a possible lock inversion that is avoided using +device_hotplug_lock when adding memory and user space tries to online that +memory faster than expected: + +- device_online() will first take the device_lock(), followed by + mem_hotplug_lock +- add_memory_resource() will first take the mem_hotplug_lock, followed by + the device_lock() (while creating the devices, during bus_add_device()). + +As the device is visible to user space before taking the device_lock(), this +can result in a lock inversion. + +onlining/offlining of memory should be done via device_online()/ +device_offline() - to make sure it is properly synchronized to actions +via sysfs. Holding device_hotplug_lock is advised (to e.g. protect online_type) + +When adding/removing/onlining/offlining memory or adding/removing +heterogeneous/device memory, we should always hold the mem_hotplug_lock in +write mode to serialise memory hotplug (e.g. access to global/zone +variables). + +In addition, mem_hotplug_lock (in contrast to device_hotplug_lock) in read +mode allows for a quite efficient get_online_mems/put_online_mems +implementation, so code accessing memory can protect from that memory +vanishing. diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 46ae3537fb12..5ce1ec1dd066 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -14,6 +14,8 @@ User Space Memory Access .. kernel-doc:: mm/util.c :functions: get_user_pages_fast +.. _mm-api-gfp-flags: + Memory Allocation Controls ========================== diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 25dc591cb110..ff48b55040ef 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -376,15 +376,15 @@ correctness of the format string and va_list arguments. Passed by reference. -kobjects --------- +Device tree nodes +----------------- :: %pOF[fnpPcCF] -For printing kobject based structs (device nodes). Default behaviour is +For printing device tree node structures. Default behaviour is equivalent to %pOFf. - f - device node full_name @@ -420,9 +420,8 @@ struct clk %pC pll1 %pCn pll1 -For printing struct clk structures. %pC and %pCn print the name -(Common Clock Framework) or address (legacy clock framework) of the -structure. +For printing struct clk structures. %pC and %pCn print the name of the clock +(Common Clock Framework) or a unique 32-bit ID (legacy clock framework). Passed by reference. diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst new file mode 100644 index 000000000000..a4e705108f42 --- /dev/null +++ b/Documentation/core-api/xarray.rst @@ -0,0 +1,435 @@ +.. SPDX-License-Identifier: GPL-2.0+ + +====== +XArray +====== + +:Author: Matthew Wilcox + +Overview +======== + +The XArray is an abstract data type which behaves like a very large array +of pointers. It meets many of the same needs as a hash or a conventional +resizable array. Unlike a hash, it allows you to sensibly go to the +next or previous entry in a cache-efficient manner. In contrast to a +resizable array, there is no need to copy data or change MMU mappings in +order to grow the array. It is more memory-efficient, parallelisable +and cache friendly than a doubly-linked list. It takes advantage of +RCU to perform lookups without locking. + +The XArray implementation is efficient when the indices used are densely +clustered; hashing the object and using the hash as the index will not +perform well. The XArray is optimised for small indices, but still has +good performance with large indices. If your index can be larger than +``ULONG_MAX`` then the XArray is not the data type for you. The most +important user of the XArray is the page cache. + +Each non-``NULL`` entry in the array has three bits associated with +it called marks. Each mark may be set or cleared independently of +the others. You can iterate over entries which are marked. + +Normal pointers may be stored in the XArray directly. They must be 4-byte +aligned, which is true for any pointer returned from :c:func:`kmalloc` and +:c:func:`alloc_page`. It isn't true for arbitrary user-space pointers, +nor for function pointers. You can store pointers to statically allocated +objects, as long as those objects have an alignment of at least 4. + +You can also store integers between 0 and ``LONG_MAX`` in the XArray. +You must first convert it into an entry using :c:func:`xa_mk_value`. +When you retrieve an entry from the XArray, you can check whether it is +a value entry by calling :c:func:`xa_is_value`, and convert it back to +an integer by calling :c:func:`xa_to_value`. + +Some users want to store tagged pointers instead of using the marks +described above. They can call :c:func:`xa_tag_pointer` to create an +entry with a tag, :c:func:`xa_untag_pointer` to turn a tagged entry +back into an untagged pointer and :c:func:`xa_pointer_tag` to retrieve +the tag of an entry. Tagged pointers use the same bits that are used +to distinguish value entries from normal pointers, so each user must +decide whether they want to store value entries or tagged pointers in +any particular XArray. + +The XArray does not support storing :c:func:`IS_ERR` pointers as some +conflict with value entries or internal entries. + +An unusual feature of the XArray is the ability to create entries which +occupy a range of indices. Once stored to, looking up any index in +the range will return the same entry as looking up any other index in +the range. Setting a mark on one index will set it on all of them. +Storing to any index will store to all of them. Multi-index entries can +be explicitly split into smaller entries, or storing ``NULL`` into any +entry will cause the XArray to forget about the range. + +Normal API +========== + +Start by initialising an XArray, either with :c:func:`DEFINE_XARRAY` +for statically allocated XArrays or :c:func:`xa_init` for dynamically +allocated ones. A freshly-initialised XArray contains a ``NULL`` +pointer at every index. + +You can then set entries using :c:func:`xa_store` and get entries +using :c:func:`xa_load`. xa_store will overwrite any entry with the +new entry and return the previous entry stored at that index. You can +use :c:func:`xa_erase` instead of calling :c:func:`xa_store` with a +``NULL`` entry. There is no difference between an entry that has never +been stored to and one that has most recently had ``NULL`` stored to it. + +You can conditionally replace an entry at an index by using +:c:func:`xa_cmpxchg`. Like :c:func:`cmpxchg`, it will only succeed if +the entry at that index has the 'old' value. It also returns the entry +which was at that index; if it returns the same entry which was passed as +'old', then :c:func:`xa_cmpxchg` succeeded. + +If you want to only store a new entry to an index if the current entry +at that index is ``NULL``, you can use :c:func:`xa_insert` which +returns ``-EEXIST`` if the entry is not empty. + +You can enquire whether a mark is set on an entry by using +:c:func:`xa_get_mark`. If the entry is not ``NULL``, you can set a mark +on it by using :c:func:`xa_set_mark` and remove the mark from an entry by +calling :c:func:`xa_clear_mark`. You can ask whether any entry in the +XArray has a particular mark set by calling :c:func:`xa_marked`. + +You can copy entries out of the XArray into a plain array by calling +:c:func:`xa_extract`. Or you can iterate over the present entries in +the XArray by calling :c:func:`xa_for_each`. You may prefer to use +:c:func:`xa_find` or :c:func:`xa_find_after` to move to the next present +entry in the XArray. + +Calling :c:func:`xa_store_range` stores the same entry in a range +of indices. If you do this, some of the other operations will behave +in a slightly odd way. For example, marking the entry at one index +may result in the entry being marked at some, but not all of the other +indices. Storing into one index may result in the entry retrieved by +some, but not all of the other indices changing. + +Finally, you can remove all entries from an XArray by calling +:c:func:`xa_destroy`. If the XArray entries are pointers, you may wish +to free the entries first. You can do this by iterating over all present +entries in the XArray using the :c:func:`xa_for_each` iterator. + +ID assignment +------------- + +You can call :c:func:`xa_alloc` to store the entry at any unused index +in the XArray. If you need to modify the array from interrupt context, +you can use :c:func:`xa_alloc_bh` or :c:func:`xa_alloc_irq` to disable +interrupts while allocating the ID. Unlike :c:func:`xa_store`, allocating +a ``NULL`` pointer does not delete an entry. Instead it reserves an +entry like :c:func:`xa_reserve` and you can release it using either +:c:func:`xa_erase` or :c:func:`xa_release`. To use ID assignment, the +XArray must be defined with :c:func:`DEFINE_XARRAY_ALLOC`, or initialised +by passing ``XA_FLAGS_ALLOC`` to :c:func:`xa_init_flags`, + +Memory allocation +----------------- + +The :c:func:`xa_store`, :c:func:`xa_cmpxchg`, :c:func:`xa_alloc`, +:c:func:`xa_reserve` and :c:func:`xa_insert` functions take a gfp_t +parameter in case the XArray needs to allocate memory to store this entry. +If the entry is being deleted, no memory allocation needs to be performed, +and the GFP flags specified will be ignored. + +It is possible for no memory to be allocatable, particularly if you pass +a restrictive set of GFP flags. In that case, the functions return a +special value which can be turned into an errno using :c:func:`xa_err`. +If you don't need to know exactly which error occurred, using +:c:func:`xa_is_err` is slightly more efficient. + +Locking +------- + +When using the Normal API, you do not have to worry about locking. +The XArray uses RCU and an internal spinlock to synchronise access: + +No lock needed: + * :c:func:`xa_empty` + * :c:func:`xa_marked` + +Takes RCU read lock: + * :c:func:`xa_load` + * :c:func:`xa_for_each` + * :c:func:`xa_find` + * :c:func:`xa_find_after` + * :c:func:`xa_extract` + * :c:func:`xa_get_mark` + +Takes xa_lock internally: + * :c:func:`xa_store` + * :c:func:`xa_insert` + * :c:func:`xa_erase` + * :c:func:`xa_erase_bh` + * :c:func:`xa_erase_irq` + * :c:func:`xa_cmpxchg` + * :c:func:`xa_store_range` + * :c:func:`xa_alloc` + * :c:func:`xa_alloc_bh` + * :c:func:`xa_alloc_irq` + * :c:func:`xa_destroy` + * :c:func:`xa_set_mark` + * :c:func:`xa_clear_mark` + +Assumes xa_lock held on entry: + * :c:func:`__xa_store` + * :c:func:`__xa_insert` + * :c:func:`__xa_erase` + * :c:func:`__xa_cmpxchg` + * :c:func:`__xa_alloc` + * :c:func:`__xa_set_mark` + * :c:func:`__xa_clear_mark` + +If you want to take advantage of the lock to protect the data structures +that you are storing in the XArray, you can call :c:func:`xa_lock` +before calling :c:func:`xa_load`, then take a reference count on the +object you have found before calling :c:func:`xa_unlock`. This will +prevent stores from removing the object from the array between looking +up the object and incrementing the refcount. You can also use RCU to +avoid dereferencing freed memory, but an explanation of that is beyond +the scope of this document. + +The XArray does not disable interrupts or softirqs while modifying +the array. It is safe to read the XArray from interrupt or softirq +context as the RCU lock provides enough protection. + +If, for example, you want to store entries in the XArray in process +context and then erase them in softirq context, you can do that this way:: + + void foo_init(struct foo *foo) + { + xa_init_flags(&foo->array, XA_FLAGS_LOCK_BH); + } + + int foo_store(struct foo *foo, unsigned long index, void *entry) + { + int err; + + xa_lock_bh(&foo->array); + err = xa_err(__xa_store(&foo->array, index, entry, GFP_KERNEL)); + if (!err) + foo->count++; + xa_unlock_bh(&foo->array); + return err; + } + + /* foo_erase() is only called from softirq context */ + void foo_erase(struct foo *foo, unsigned long index) + { + xa_lock(&foo->array); + __xa_erase(&foo->array, index); + foo->count--; + xa_unlock(&foo->array); + } + +If you are going to modify the XArray from interrupt or softirq context, +you need to initialise the array using :c:func:`xa_init_flags`, passing +``XA_FLAGS_LOCK_IRQ`` or ``XA_FLAGS_LOCK_BH``. + +The above example also shows a common pattern of wanting to extend the +coverage of the xa_lock on the store side to protect some statistics +associated with the array. + +Sharing the XArray with interrupt context is also possible, either +using :c:func:`xa_lock_irqsave` in both the interrupt handler and process +context, or :c:func:`xa_lock_irq` in process context and :c:func:`xa_lock` +in the interrupt handler. Some of the more common patterns have helper +functions such as :c:func:`xa_erase_bh` and :c:func:`xa_erase_irq`. + +Sometimes you need to protect access to the XArray with a mutex because +that lock sits above another mutex in the locking hierarchy. That does +not entitle you to use functions like :c:func:`__xa_erase` without taking +the xa_lock; the xa_lock is used for lockdep validation and will be used +for other purposes in the future. + +The :c:func:`__xa_set_mark` and :c:func:`__xa_clear_mark` functions are also +available for situations where you look up an entry and want to atomically +set or clear a mark. It may be more efficient to use the advanced API +in this case, as it will save you from walking the tree twice. + +Advanced API +============ + +The advanced API offers more flexibility and better performance at the +cost of an interface which can be harder to use and has fewer safeguards. +No locking is done for you by the advanced API, and you are required +to use the xa_lock while modifying the array. You can choose whether +to use the xa_lock or the RCU lock while doing read-only operations on +the array. You can mix advanced and normal operations on the same array; +indeed the normal API is implemented in terms of the advanced API. The +advanced API is only available to modules with a GPL-compatible license. + +The advanced API is based around the xa_state. This is an opaque data +structure which you declare on the stack using the :c:func:`XA_STATE` +macro. This macro initialises the xa_state ready to start walking +around the XArray. It is used as a cursor to maintain the position +in the XArray and let you compose various operations together without +having to restart from the top every time. + +The xa_state is also used to store errors. You can call +:c:func:`xas_error` to retrieve the error. All operations check whether +the xa_state is in an error state before proceeding, so there's no need +for you to check for an error after each call; you can make multiple +calls in succession and only check at a convenient point. The only +errors currently generated by the XArray code itself are ``ENOMEM`` and +``EINVAL``, but it supports arbitrary errors in case you want to call +:c:func:`xas_set_err` yourself. + +If the xa_state is holding an ``ENOMEM`` error, calling :c:func:`xas_nomem` +will attempt to allocate more memory using the specified gfp flags and +cache it in the xa_state for the next attempt. The idea is that you take +the xa_lock, attempt the operation and drop the lock. The operation +attempts to allocate memory while holding the lock, but it is more +likely to fail. Once you have dropped the lock, :c:func:`xas_nomem` +can try harder to allocate more memory. It will return ``true`` if it +is worth retrying the operation (i.e. that there was a memory error *and* +more memory was allocated). If it has previously allocated memory, and +that memory wasn't used, and there is no error (or some error that isn't +``ENOMEM``), then it will free the memory previously allocated. + +Internal Entries +---------------- + +The XArray reserves some entries for its own purposes. These are never +exposed through the normal API, but when using the advanced API, it's +possible to see them. Usually the best way to handle them is to pass them +to :c:func:`xas_retry`, and retry the operation if it returns ``true``. + +.. flat-table:: + :widths: 1 1 6 + + * - Name + - Test + - Usage + + * - Node + - :c:func:`xa_is_node` + - An XArray node. May be visible when using a multi-index xa_state. + + * - Sibling + - :c:func:`xa_is_sibling` + - A non-canonical entry for a multi-index entry. The value indicates + which slot in this node has the canonical entry. + + * - Retry + - :c:func:`xa_is_retry` + - This entry is currently being modified by a thread which has the + xa_lock. The node containing this entry may be freed at the end + of this RCU period. You should restart the lookup from the head + of the array. + + * - Zero + - :c:func:`xa_is_zero` + - Zero entries appear as ``NULL`` through the Normal API, but occupy + an entry in the XArray which can be used to reserve the index for + future use. + +Other internal entries may be added in the future. As far as possible, they +will be handled by :c:func:`xas_retry`. + +Additional functionality +------------------------ + +The :c:func:`xas_create_range` function allocates all the necessary memory +to store every entry in a range. It will set ENOMEM in the xa_state if +it cannot allocate memory. + +You can use :c:func:`xas_init_marks` to reset the marks on an entry +to their default state. This is usually all marks clear, unless the +XArray is marked with ``XA_FLAGS_TRACK_FREE``, in which case mark 0 is set +and all other marks are clear. Replacing one entry with another using +:c:func:`xas_store` will not reset the marks on that entry; if you want +the marks reset, you should do that explicitly. + +The :c:func:`xas_load` will walk the xa_state as close to the entry +as it can. If you know the xa_state has already been walked to the +entry and need to check that the entry hasn't changed, you can use +:c:func:`xas_reload` to save a function call. + +If you need to move to a different index in the XArray, call +:c:func:`xas_set`. This resets the cursor to the top of the tree, which +will generally make the next operation walk the cursor to the desired +spot in the tree. If you want to move to the next or previous index, +call :c:func:`xas_next` or :c:func:`xas_prev`. Setting the index does +not walk the cursor around the array so does not require a lock to be +held, while moving to the next or previous index does. + +You can search for the next present entry using :c:func:`xas_find`. This +is the equivalent of both :c:func:`xa_find` and :c:func:`xa_find_after`; +if the cursor has been walked to an entry, then it will find the next +entry after the one currently referenced. If not, it will return the +entry at the index of the xa_state. Using :c:func:`xas_next_entry` to +move to the next present entry instead of :c:func:`xas_find` will save +a function call in the majority of cases at the expense of emitting more +inline code. + +The :c:func:`xas_find_marked` function is similar. If the xa_state has +not been walked, it will return the entry at the index of the xa_state, +if it is marked. Otherwise, it will return the first marked entry after +the entry referenced by the xa_state. The :c:func:`xas_next_marked` +function is the equivalent of :c:func:`xas_next_entry`. + +When iterating over a range of the XArray using :c:func:`xas_for_each` +or :c:func:`xas_for_each_marked`, it may be necessary to temporarily stop +the iteration. The :c:func:`xas_pause` function exists for this purpose. +After you have done the necessary work and wish to resume, the xa_state +is in an appropriate state to continue the iteration after the entry +you last processed. If you have interrupts disabled while iterating, +then it is good manners to pause the iteration and reenable interrupts +every ``XA_CHECK_SCHED`` entries. + +The :c:func:`xas_get_mark`, :c:func:`xas_set_mark` and +:c:func:`xas_clear_mark` functions require the xa_state cursor to have +been moved to the appropriate location in the xarray; they will do +nothing if you have called :c:func:`xas_pause` or :c:func:`xas_set` +immediately before. + +You can call :c:func:`xas_set_update` to have a callback function +called each time the XArray updates a node. This is used by the page +cache workingset code to maintain its list of nodes which contain only +shadow entries. + +Multi-Index Entries +------------------- + +The XArray has the ability to tie multiple indices together so that +operations on one index affect all indices. For example, storing into +any index will change the value of the entry retrieved from any index. +Setting or clearing a mark on any index will set or clear the mark +on every index that is tied together. The current implementation +only allows tying ranges which are aligned powers of two together; +eg indices 64-127 may be tied together, but 2-6 may not be. This may +save substantial quantities of memory; for example tying 512 entries +together will save over 4kB. + +You can create a multi-index entry by using :c:func:`XA_STATE_ORDER` +or :c:func:`xas_set_order` followed by a call to :c:func:`xas_store`. +Calling :c:func:`xas_load` with a multi-index xa_state will walk the +xa_state to the right location in the tree, but the return value is not +meaningful, potentially being an internal entry or ``NULL`` even when there +is an entry stored within the range. Calling :c:func:`xas_find_conflict` +will return the first entry within the range or ``NULL`` if there are no +entries in the range. The :c:func:`xas_for_each_conflict` iterator will +iterate over every entry which overlaps the specified range. + +If :c:func:`xas_load` encounters a multi-index entry, the xa_index +in the xa_state will not be changed. When iterating over an XArray +or calling :c:func:`xas_find`, if the initial index is in the middle +of a multi-index entry, it will not be altered. Subsequent calls +or iterations will move the index to the first index in the range. +Each entry will only be returned once, no matter how many indices it +occupies. + +Using :c:func:`xas_next` or :c:func:`xas_prev` with a multi-index xa_state +is not supported. Using either of these functions on a multi-index entry +will reveal sibling entries; these should be skipped over by the caller. + +Storing ``NULL`` into any index of a multi-index entry will set the entry +at every index to ``NULL`` and dissolve the tie. Splitting a multi-index +entry into entries occupying smaller ranges is not yet supported. + +Functions and structures +======================== + +.. kernel-doc:: include/linux/xarray.h +.. kernel-doc:: lib/xarray.c diff --git a/Documentation/crypto/asymmetric-keys.txt b/Documentation/crypto/asymmetric-keys.txt index 5969bf42562a..8763866b11cf 100644 --- a/Documentation/crypto/asymmetric-keys.txt +++ b/Documentation/crypto/asymmetric-keys.txt @@ -183,6 +183,10 @@ and looks like the following: void (*describe)(const struct key *key, struct seq_file *m); void (*destroy)(void *payload); + int (*query)(const struct kernel_pkey_params *params, + struct kernel_pkey_query *info); + int (*eds_op)(struct kernel_pkey_params *params, + const void *in, void *out); int (*verify_signature)(const struct key *key, const struct public_key_signature *sig); }; @@ -207,12 +211,22 @@ There are a number of operations defined by the subtype: asymmetric key will look after freeing the fingerprint and releasing the reference on the subtype module. - (3) verify_signature(). + (3) query(). - Optional. These are the entry points for the key usage operations. - Currently there is only the one defined. If not set, the caller will be - given -ENOTSUPP. The subtype may do anything it likes to implement an - operation, including offloading to hardware. + Mandatory. This is a function for querying the capabilities of a key. + + (4) eds_op(). + + Optional. This is the entry point for the encryption, decryption and + signature creation operations (which are distinguished by the operation ID + in the parameter struct). The subtype may do anything it likes to + implement an operation, including offloading to hardware. + + (5) verify_signature(). + + Optional. This is the entry point for signature verification. The + subtype may do anything it likes to implement an operation, including + offloading to hardware. ========================== @@ -234,6 +248,8 @@ Examples of blob formats for which parsers could be implemented include: - X.509 ASN.1 stream. - Pointer to TPM key. - Pointer to UEFI key. + - PKCS#8 private key [RFC 5208]. + - PKCS#5 encrypted private key [RFC 2898]. During key instantiation each parser in the list is tried until one doesn't return -EBADMSG. diff --git a/Documentation/dev-tools/coccinelle.rst b/Documentation/dev-tools/coccinelle.rst index 94f41c290bfc..aa14f05cabb1 100644 --- a/Documentation/dev-tools/coccinelle.rst +++ b/Documentation/dev-tools/coccinelle.rst @@ -30,18 +30,29 @@ of many distributions, e.g. : - NetBSD - FreeBSD -You can get the latest version released from the Coccinelle homepage at +Some distribution packages are obsolete and it is recommended +to use the latest version released from the Coccinelle homepage at http://coccinelle.lip6.fr/ -Once you have it, run the following command:: +Or from Github at: - ./configure +https://github.com/coccinelle/coccinelle + +Once you have it, run the following commands:: + + ./autogen + ./configure make as a regular user, and install it with:: sudo make install +More detailed installation instructions to build from source can be +found at: + +https://github.com/coccinelle/coccinelle/blob/master/install.txt + Supplemental documentation --------------------------- @@ -51,6 +62,10 @@ https://bottest.wiki.kernel.org/coccicheck The wiki documentation always refers to the linux-next version of the script. +For Semantic Patch Language(SmPL) grammar documentation refer to: + +http://coccinelle.lip6.fr/documentation.php + Using Coccinelle on the Linux kernel ------------------------------------ @@ -223,7 +238,7 @@ Since coccicheck runs through make, it naturally runs from the kernel proper dir, as such the second rule above would be implied for picking up a .cocciconfig when using ``make coccicheck``. -``make coccicheck`` also supports using M= targets.If you do not supply +``make coccicheck`` also supports using M= targets. If you do not supply any M= target, it is assumed you want to target the entire kernel. The kernel coccicheck script has:: diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index 6f653acea248..dad1bb8711e2 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -159,7 +159,7 @@ Contributing new tests (details) * If a test needs specific kernel config options enabled, add a config file in the test directory to enable them. - e.g: tools/testing/selftests/android/ion/config + e.g: tools/testing/selftests/android/config Test Harness ============ diff --git a/Documentation/device-mapper/dm-flakey.txt b/Documentation/device-mapper/dm-flakey.txt index c43030718cef..9f0e247d0877 100644 --- a/Documentation/device-mapper/dm-flakey.txt +++ b/Documentation/device-mapper/dm-flakey.txt @@ -33,6 +33,10 @@ Optional feature parameters: All write I/O is silently ignored. Read I/O is handled correctly. + error_writes: + All write I/O is failed with an error signalled. + Read I/O is handled correctly. + corrupt_bio_byte : During , replace of the data of each matching bio with . diff --git a/Documentation/device-mapper/log-writes.txt b/Documentation/device-mapper/log-writes.txt index f4ebcbaf50f3..b638d124be6a 100644 --- a/Documentation/device-mapper/log-writes.txt +++ b/Documentation/device-mapper/log-writes.txt @@ -38,7 +38,7 @@ inconsistent file system. Any REQ_FUA requests bypass this flushing mechanism and are logged as soon as they complete as those requests will obviously bypass the device cache. -Any REQ_DISCARD requests are treated like WRITE requests. Otherwise we would +Any REQ_OP_DISCARD requests are treated like WRITE requests. Otherwise we would have all the DISCARD requests, and then the WRITE requests and then the FLUSH request. Consider the following example: diff --git a/Documentation/devicetree/00-INDEX b/Documentation/devicetree/00-INDEX deleted file mode 100644 index 8c4102c6a5e7..000000000000 --- a/Documentation/devicetree/00-INDEX +++ /dev/null @@ -1,12 +0,0 @@ -Documentation for device trees, a data structure by which bootloaders pass -hardware layout to Linux in a device-independent manner, simplifying hardware -probing. This subsystem is maintained by Grant Likely - and has a mailing list at -https://lists.ozlabs.org/listinfo/devicetree-discuss - -00-INDEX - - this file -booting-without-of.txt - - Booting Linux without Open Firmware, describes history and format of device trees. -usage-model.txt - - How Linux uses DT and what DT aims to solve. \ No newline at end of file diff --git a/Documentation/devicetree/bindings/arm/al,alpine.txt b/Documentation/devicetree/bindings/arm/al,alpine.txt index f404a4f9b165..d00debe2e86f 100644 --- a/Documentation/devicetree/bindings/arm/al,alpine.txt +++ b/Documentation/devicetree/bindings/arm/al,alpine.txt @@ -14,75 +14,3 @@ compatible: must contain "al,alpine" ... } - -* CPU node: - -The Alpine platform includes cortex-a15 cores. -enable-method: must be "al,alpine-smp" to allow smp [1] - -Example: - -cpus { - #address-cells = <1>; - #size-cells = <0>; - enable-method = "al,alpine-smp"; - - cpu@0 { - compatible = "arm,cortex-a15"; - device_type = "cpu"; - reg = <0>; - }; - - cpu@1 { - compatible = "arm,cortex-a15"; - device_type = "cpu"; - reg = <1>; - }; - - cpu@2 { - compatible = "arm,cortex-a15"; - device_type = "cpu"; - reg = <2>; - }; - - cpu@3 { - compatible = "arm,cortex-a15"; - device_type = "cpu"; - reg = <3>; - }; -}; - - -* Alpine CPU resume registers - -The CPU resume register are used to define required resume address after -reset. - -Properties: -- compatible : Should contain "al,alpine-cpu-resume". -- reg : Offset and length of the register set for the device - -Example: - -cpu_resume { - compatible = "al,alpine-cpu-resume"; - reg = <0xfbff5ed0 0x30>; -}; - -* Alpine System-Fabric Service Registers - -The System-Fabric Service Registers allow various operation on CPU and -system fabric, like powering CPUs off. - -Properties: -- compatible : Should contain "al,alpine-sysfabric-service" and "syscon". -- reg : Offset and length of the register set for the device - -Example: - -nb_service { - compatible = "al,alpine-sysfabric-service", "syscon"; - reg = <0xfb070000 0x10000>; -}; - -[1] arm/cpu-enable-method/al,alpine-smp diff --git a/Documentation/devicetree/bindings/arm/amlogic.txt b/Documentation/devicetree/bindings/arm/amlogic.txt index b5c2b5c35766..4498292b833d 100644 --- a/Documentation/devicetree/bindings/arm/amlogic.txt +++ b/Documentation/devicetree/bindings/arm/amlogic.txt @@ -57,12 +57,17 @@ Boards with the Amlogic Meson AXG A113D SoC shall have the following properties: Required root node property: compatible: "amlogic,a113d", "amlogic,meson-axg"; +Boards with the Amlogic Meson G12A S905D2 SoC shall have the following properties: + Required root node property: + compatible: "amlogic,g12a"; + Board compatible values (alphabetically, grouped by SoC): - "geniatech,atv1200" (Meson6) - "minix,neo-x8" (Meson8) + - "endless,ec100" (Meson8b) - "hardkernel,odroid-c1" (Meson8b) - "tronfy,mxq" (Meson8b) @@ -101,6 +106,8 @@ Board compatible values (alphabetically, grouped by SoC): - "amlogic,s400" (Meson axg a113d) + - "amlogic,u200" (Meson g12a s905d2) + Amlogic Meson Firmware registers Interface ------------------------------------------ diff --git a/Documentation/devicetree/bindings/arm/atmel-at91.txt b/Documentation/devicetree/bindings/arm/atmel-at91.txt index 31220b54d85d..4bf1b4da7659 100644 --- a/Documentation/devicetree/bindings/arm/atmel-at91.txt +++ b/Documentation/devicetree/bindings/arm/atmel-at91.txt @@ -70,173 +70,3 @@ compatible: must be one of: - "atmel,samv71q19" - "atmel,samv71q20" - "atmel,samv71q21" - -Chipid required properties: -- compatible: Should be "atmel,sama5d2-chipid" -- reg : Should contain registers location and length - -PIT Timer required properties: -- compatible: Should be "atmel,at91sam9260-pit" -- reg: Should contain registers location and length -- interrupts: Should contain interrupt for the PIT which is the IRQ line - shared across all System Controller members. - -System Timer (ST) required properties: -- compatible: Should be "atmel,at91rm9200-st", "syscon", "simple-mfd" -- reg: Should contain registers location and length -- interrupts: Should contain interrupt for the ST which is the IRQ line - shared across all System Controller members. -- clocks: phandle to input clock. -Its subnodes can be: -- watchdog: compatible should be "atmel,at91rm9200-wdt" - -RSTC Reset Controller required properties: -- compatible: Should be "atmel,-rstc". - can be "at91sam9260" or "at91sam9g45" or "sama5d3" -- reg: Should contain registers location and length -- clocks: phandle to input clock. - -Example: - - rstc@fffffd00 { - compatible = "atmel,at91sam9260-rstc"; - reg = <0xfffffd00 0x10>; - clocks = <&clk32k>; - }; - -RAMC SDRAM/DDR Controller required properties: -- compatible: Should be "atmel,at91rm9200-sdramc", "syscon" - "atmel,at91sam9260-sdramc", - "atmel,at91sam9g45-ddramc", - "atmel,sama5d3-ddramc", -- reg: Should contain registers location and length - -Examples: - - ramc0: ramc@ffffe800 { - compatible = "atmel,at91sam9g45-ddramc"; - reg = <0xffffe800 0x200>; - }; - -SHDWC Shutdown Controller - -required properties: -- compatible: Should be "atmel,-shdwc". - can be "at91sam9260", "at91sam9rl" or "at91sam9x5". -- reg: Should contain registers location and length -- clocks: phandle to input clock. - -optional properties: -- atmel,wakeup-mode: String, operation mode of the wakeup mode. - Supported values are: "none", "high", "low", "any". -- atmel,wakeup-counter: Counter on Wake-up 0 (between 0x0 and 0xf). - -optional at91sam9260 properties: -- atmel,wakeup-rtt-timer: boolean to enable Real-time Timer Wake-up. - -optional at91sam9rl properties: -- atmel,wakeup-rtc-timer: boolean to enable Real-time Clock Wake-up. -- atmel,wakeup-rtt-timer: boolean to enable Real-time Timer Wake-up. - -optional at91sam9x5 properties: -- atmel,wakeup-rtc-timer: boolean to enable Real-time Clock Wake-up. - -Example: - - shdwc@fffffd10 { - compatible = "atmel,at91sam9260-shdwc"; - reg = <0xfffffd10 0x10>; - clocks = <&clk32k>; - }; - -SHDWC SAMA5D2-Compatible Shutdown Controller - -1) shdwc node - -required properties: -- compatible: should be "atmel,sama5d2-shdwc". -- reg: should contain registers location and length -- clocks: phandle to input clock. -- #address-cells: should be one. The cell is the wake-up input index. -- #size-cells: should be zero. - -optional properties: - -- debounce-delay-us: minimum wake-up inputs debouncer period in - microseconds. It's usually a board-related property. -- atmel,wakeup-rtc-timer: boolean to enable Real-Time Clock wake-up. - -The node contains child nodes for each wake-up input that the platform uses. - -2) input nodes - -Wake-up input nodes are usually described in the "board" part of the Device -Tree. Note also that input 0 is linked to the wake-up pin and is frequently -used. - -Required properties: -- reg: should contain the wake-up input index [0 - 15]. - -Optional properties: -- atmel,wakeup-active-high: boolean, the corresponding wake-up input described - by the child, forces the wake-up of the core power supply on a high level. - The default is to be active low. - -Example: - -On the SoC side: - shdwc@f8048010 { - compatible = "atmel,sama5d2-shdwc"; - reg = <0xf8048010 0x10>; - clocks = <&clk32k>; - #address-cells = <1>; - #size-cells = <0>; - atmel,wakeup-rtc-timer; - }; - -On the board side: - shdwc@f8048010 { - debounce-delay-us = <976>; - - input@0 { - reg = <0>; - }; - - input@1 { - reg = <1>; - atmel,wakeup-active-high; - }; - }; - -Special Function Registers (SFR) - -Special Function Registers (SFR) manage specific aspects of the integrated -memory, bridge implementations, processor and other functionality not controlled -elsewhere. - -required properties: -- compatible: Should be "atmel,-sfr", "syscon" or - "atmel,-sfrbu", "syscon" - can be "sama5d3", "sama5d4" or "sama5d2". -- reg: Should contain registers location and length - - sfr@f0038000 { - compatible = "atmel,sama5d3-sfr", "syscon"; - reg = <0xf0038000 0x60>; - }; - -Security Module (SECUMOD) - -The Security Module macrocell provides all necessary secure functions to avoid -voltage, temperature, frequency and mechanical attacks on the chip. It also -embeds secure memories that can be scrambled - -required properties: -- compatible: Should be "atmel,-secumod", "syscon". - can be "sama5d2". -- reg: Should contain registers location and length - - secumod@fc040000 { - compatible = "atmel,sama5d2-secumod", "syscon"; - reg = <0xfc040000 0x100>; - }; diff --git a/Documentation/devicetree/bindings/arm/atmel-sysregs.txt b/Documentation/devicetree/bindings/arm/atmel-sysregs.txt new file mode 100644 index 000000000000..4b96608ad692 --- /dev/null +++ b/Documentation/devicetree/bindings/arm/atmel-sysregs.txt @@ -0,0 +1,171 @@ +Atmel system registers + +Chipid required properties: +- compatible: Should be "atmel,sama5d2-chipid" +- reg : Should contain registers location and length + +PIT Timer required properties: +- compatible: Should be "atmel,at91sam9260-pit" +- reg: Should contain registers location and length +- interrupts: Should contain interrupt for the PIT which is the IRQ line + shared across all System Controller members. + +System Timer (ST) required properties: +- compatible: Should be "atmel,at91rm9200-st", "syscon", "simple-mfd" +- reg: Should contain registers location and length +- interrupts: Should contain interrupt for the ST which is the IRQ line + shared across all System Controller members. +- clocks: phandle to input clock. +Its subnodes can be: +- watchdog: compatible should be "atmel,at91rm9200-wdt" + +RSTC Reset Controller required properties: +- compatible: Should be "atmel,-rstc". + can be "at91sam9260" or "at91sam9g45" or "sama5d3" +- reg: Should contain registers location and length +- clocks: phandle to input clock. + +Example: + + rstc@fffffd00 { + compatible = "atmel,at91sam9260-rstc"; + reg = <0xfffffd00 0x10>; + clocks = <&clk32k>; + }; + +RAMC SDRAM/DDR Controller required properties: +- compatible: Should be "atmel,at91rm9200-sdramc", "syscon" + "atmel,at91sam9260-sdramc", + "atmel,at91sam9g45-ddramc", + "atmel,sama5d3-ddramc", +- reg: Should contain registers location and length + +Examples: + + ramc0: ramc@ffffe800 { + compatible = "atmel,at91sam9g45-ddramc"; + reg = <0xffffe800 0x200>; + }; + +SHDWC Shutdown Controller + +required properties: +- compatible: Should be "atmel,-shdwc". + can be "at91sam9260", "at91sam9rl" or "at91sam9x5". +- reg: Should contain registers location and length +- clocks: phandle to input clock. + +optional properties: +- atmel,wakeup-mode: String, operation mode of the wakeup mode. + Supported values are: "none", "high", "low", "any". +- atmel,wakeup-counter: Counter on Wake-up 0 (between 0x0 and 0xf). + +optional at91sam9260 properties: +- atmel,wakeup-rtt-timer: boolean to enable Real-time Timer Wake-up. + +optional at91sam9rl properties: +- atmel,wakeup-rtc-timer: boolean to enable Real-time Clock Wake-up. +- atmel,wakeup-rtt-timer: boolean to enable Real-time Timer Wake-up. + +optional at91sam9x5 properties: +- atmel,wakeup-rtc-timer: boolean to enable Real-time Clock Wake-up. + +Example: + + shdwc@fffffd10 { + compatible = "atmel,at91sam9260-shdwc"; + reg = <0xfffffd10 0x10>; + clocks = <&clk32k>; + }; + +SHDWC SAMA5D2-Compatible Shutdown Controller + +1) shdwc node + +required properties: +- compatible: should be "atmel,sama5d2-shdwc". +- reg: should contain registers location and length +- clocks: phandle to input clock. +- #address-cells: should be one. The cell is the wake-up input index. +- #size-cells: should be zero. + +optional properties: + +- debounce-delay-us: minimum wake-up inputs debouncer period in + microseconds. It's usually a board-related property. +- atmel,wakeup-rtc-timer: boolean to enable Real-Time Clock wake-up. + +The node contains child nodes for each wake-up input that the platform uses. + +2) input nodes + +Wake-up input nodes are usually described in the "board" part of the Device +Tree. Note also that input 0 is linked to the wake-up pin and is frequently +used. + +Required properties: +- reg: should contain the wake-up input index [0 - 15]. + +Optional properties: +- atmel,wakeup-active-high: boolean, the corresponding wake-up input described + by the child, forces the wake-up of the core power supply on a high level. + The default is to be active low. + +Example: + +On the SoC side: + shdwc@f8048010 { + compatible = "atmel,sama5d2-shdwc"; + reg = <0xf8048010 0x10>; + clocks = <&clk32k>; + #address-cells = <1>; + #size-cells = <0>; + atmel,wakeup-rtc-timer; + }; + +On the board side: + shdwc@f8048010 { + debounce-delay-us = <976>; + + input@0 { + reg = <0>; + }; + + input@1 { + reg = <1>; + atmel,wakeup-active-high; + }; + }; + +Special Function Registers (SFR) + +Special Function Registers (SFR) manage specific aspects of the integrated +memory, bridge implementations, processor and other functionality not controlled +elsewhere. + +required properties: +- compatible: Should be "atmel,-sfr", "syscon" or + "atmel,-sfrbu", "syscon" + can be "sama5d3", "sama5d4" or "sama5d2". +- reg: Should contain registers location and length + + sfr@f0038000 { + compatible = "atmel,sama5d3-sfr", "syscon"; + reg = <0xf0038000 0x60>; + }; + +Security Module (SECUMOD) + +The Security Module macrocell provides all necessary secure functions to avoid +voltage, temperature, frequency and mechanical attacks on the chip. It also +embeds secure memories that can be scrambled + +required properties: +- compatible: Should be "atmel,-secumod", "syscon". + can be "sama5d2". +- reg: Should contain registers location and length + + secumod@fc040000 { + compatible = "atmel,sama5d2-secumod", "syscon"; + reg = <0xfc040000 0x100>; + }; diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm2835.txt b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm2835.txt index 1e3e29a545e2..0dcc3ea5adff 100644 --- a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm2835.txt +++ b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm2835.txt @@ -42,6 +42,14 @@ Raspberry Pi Compute Module Required root node properties: compatible = "raspberrypi,compute-module", "brcm,bcm2835"; +Raspberry Pi Compute Module 3 +Required root node properties: +compatible = "raspberrypi,3-compute-module", "brcm,bcm2837"; + +Raspberry Pi Compute Module 3 Lite +Required root node properties: +compatible = "raspberrypi,3-compute-module-lite", "brcm,bcm2837"; + Raspberry Pi Zero Required root node properties: compatible = "raspberrypi,model-zero", "brcm,bcm2835"; diff --git a/Documentation/devicetree/bindings/arm/coresight.txt b/Documentation/devicetree/bindings/arm/coresight.txt index 5d1ad09bafb4..f8aff65ab921 100644 --- a/Documentation/devicetree/bindings/arm/coresight.txt +++ b/Documentation/devicetree/bindings/arm/coresight.txt @@ -54,9 +54,7 @@ its hardware characteristcs. clocks the core of that coresight component. The latter clock is optional. - * port or ports: The representation of the component's port - layout using the generic DT graph presentation found in - "bindings/graph.txt". + * port or ports: see "Graph bindings for Coresight" below. * Additional required properties for System Trace Macrocells (STM): * reg: along with the physical base address and length of the register @@ -73,7 +71,7 @@ its hardware characteristcs. AMBA markee): - "arm,coresight-replicator" - * port or ports: same as above. + * port or ports: see "Graph bindings for Coresight" below. * Optional properties for ETM/PTMs: @@ -96,6 +94,20 @@ its hardware characteristcs. * interrupts : Exactly one SPI may be listed for reporting the address error +Graph bindings for Coresight +------------------------------- + +Coresight components are interconnected to create a data path for the flow of +trace data generated from the "sources" to their collection points "sink". +Each coresight component must describe the "input" and "output" connections. +The connections must be described via generic DT graph bindings as described +by the "bindings/graph.txt", where each "port" along with an "endpoint" +component represents a hardware port and the connection. + + * All output ports must be listed inside a child node named "out-ports" + * All input ports must be listed inside a child node named "in-ports". + * Port address must match the hardware port number. + Example: 1. Sinks @@ -105,10 +117,11 @@ Example: clocks = <&oscclk6a>; clock-names = "apb_pclk"; - port { - etb_in_port: endpoint@0 { - slave-mode; - remote-endpoint = <&replicator_out_port0>; + in-ports { + port { + etb_in_port: endpoint@0 { + remote-endpoint = <&replicator_out_port0>; + }; }; }; }; @@ -119,10 +132,11 @@ Example: clocks = <&oscclk6a>; clock-names = "apb_pclk"; - port { - tpiu_in_port: endpoint@0 { - slave-mode; - remote-endpoint = <&replicator_out_port1>; + in-ports { + port { + tpiu_in_port: endpoint@0 { + remote-endpoint = <&replicator_out_port1>; + }; }; }; }; @@ -133,22 +147,16 @@ Example: clocks = <&oscclk6a>; clock-names = "apb_pclk"; - ports { - #address-cells = <1>; - #size-cells = <0>; - - /* input port */ - port@0 { - reg = <0>; + in-ports { + port { etr_in_port: endpoint { - slave-mode; remote-endpoint = <&replicator2_out_port0>; }; }; + }; - /* CATU link represented by output port */ - port@1 { - reg = <1>; + out-ports { + port { etr_out_port: endpoint { remote-endpoint = <&catu_in_port>; }; @@ -163,7 +171,7 @@ Example: */ compatible = "arm,coresight-replicator"; - ports { + out-ports { #address-cells = <1>; #size-cells = <0>; @@ -181,12 +189,11 @@ Example: remote-endpoint = <&tpiu_in_port>; }; }; + }; - /* replicator input port */ - port@2 { - reg = <0>; + in-ports { + port { replicator_in_port0: endpoint { - slave-mode; remote-endpoint = <&funnel_out_port0>; }; }; @@ -199,40 +206,36 @@ Example: clocks = <&oscclk6a>; clock-names = "apb_pclk"; - ports { - #address-cells = <1>; - #size-cells = <0>; - - /* funnel output port */ - port@0 { - reg = <0>; + out-ports { + port { funnel_out_port0: endpoint { remote-endpoint = <&replicator_in_port0>; }; }; + }; - /* funnel input ports */ - port@1 { + in-ports { + #address-cells = <1>; + #size-cells = <0>; + + port@0 { reg = <0>; funnel_in_port0: endpoint { - slave-mode; remote-endpoint = <&ptm0_out_port>; }; }; - port@2 { + port@1 { reg = <1>; funnel_in_port1: endpoint { - slave-mode; remote-endpoint = <&ptm1_out_port>; }; }; - port@3 { + port@2 { reg = <2>; funnel_in_port2: endpoint { - slave-mode; remote-endpoint = <&etm0_out_port>; }; }; @@ -248,9 +251,11 @@ Example: cpu = <&cpu0>; clocks = <&oscclk6a>; clock-names = "apb_pclk"; - port { - ptm0_out_port: endpoint { - remote-endpoint = <&funnel_in_port0>; + out-ports { + port { + ptm0_out_port: endpoint { + remote-endpoint = <&funnel_in_port0>; + }; }; }; }; @@ -262,9 +267,11 @@ Example: cpu = <&cpu1>; clocks = <&oscclk6a>; clock-names = "apb_pclk"; - port { - ptm1_out_port: endpoint { - remote-endpoint = <&funnel_in_port1>; + out-ports { + port { + ptm1_out_port: endpoint { + remote-endpoint = <&funnel_in_port1>; + }; }; }; }; @@ -278,9 +285,11 @@ Example: clocks = <&soc_smc50mhz>; clock-names = "apb_pclk"; - port { - stm_out_port: endpoint { - remote-endpoint = <&main_funnel_in_port2>; + out-ports { + port { + stm_out_port: endpoint { + remote-endpoint = <&main_funnel_in_port2>; + }; }; }; }; @@ -295,10 +304,11 @@ Example: clock-names = "apb_pclk"; interrupts = ; - port { - catu_in_port: endpoint { - slave-mode; - remote-endpoint = <&etr_out_port>; + in-ports { + port { + catu_in_port: endpoint { + remote-endpoint = <&etr_out_port>; + }; }; }; }; diff --git a/Documentation/devicetree/bindings/arm/cpu-capacity.txt b/Documentation/devicetree/bindings/arm/cpu-capacity.txt index 9b5685a1d15d..84262cdb8d29 100644 --- a/Documentation/devicetree/bindings/arm/cpu-capacity.txt +++ b/Documentation/devicetree/bindings/arm/cpu-capacity.txt @@ -59,9 +59,11 @@ mhz values (normalized w.r.t. the highest value found while parsing the DT). =========================================== Example 1 (ARM 64-bit, 6-cpu system, two clusters): -capacities-dmips-mhz are scaled w.r.t. 1024 (cpu@0 and cpu@1) -supposing cluster0@max-freq=1100 and custer1@max-freq=850, -final capacities are 1024 for cluster0 and 446 for cluster1 +The capacities-dmips-mhz or DMIPS/MHz values (scaled to 1024) +are 1024 and 578 for cluster0 and cluster1. Further normalization +is done by the operating system based on cluster0@max-freq=1100 and +custer1@max-freq=850, final capacities are 1024 for cluster0 and +446 for cluster1 (576*850/1100). cpus { #address-cells = <2>; diff --git a/Documentation/devicetree/bindings/arm/cpu-enable-method/al,alpine-smp b/Documentation/devicetree/bindings/arm/cpu-enable-method/al,alpine-smp index c2e0cc5e4cfd..35e5afb6d9ad 100644 --- a/Documentation/devicetree/bindings/arm/cpu-enable-method/al,alpine-smp +++ b/Documentation/devicetree/bindings/arm/cpu-enable-method/al,alpine-smp @@ -14,7 +14,28 @@ Related properties: (none) Note: This enable method requires valid nodes compatible with -"al,alpine-cpu-resume" and "al,alpine-nb-service"[1]. +"al,alpine-cpu-resume" and "al,alpine-nb-service". + + +* Alpine CPU resume registers + +The CPU resume register are used to define required resume address after +reset. + +Properties: +- compatible : Should contain "al,alpine-cpu-resume". +- reg : Offset and length of the register set for the device + + +* Alpine System-Fabric Service Registers + +The System-Fabric Service Registers allow various operation on CPU and +system fabric, like powering CPUs off. + +Properties: +- compatible : Should contain "al,alpine-sysfabric-service" and "syscon". +- reg : Offset and length of the register set for the device + Example: @@ -48,5 +69,12 @@ cpus { }; }; --- -[1] arm/al,alpine.txt +cpu_resume { + compatible = "al,alpine-cpu-resume"; + reg = <0xfbff5ed0 0x30>; +}; + +nb_service { + compatible = "al,alpine-sysfabric-service", "syscon"; + reg = <0xfb070000 0x10000>; +}; diff --git a/Documentation/devicetree/bindings/arm/cpus.txt b/Documentation/devicetree/bindings/arm/cpus.txt index 96dfccc0faa8..b0198a1cf403 100644 --- a/Documentation/devicetree/bindings/arm/cpus.txt +++ b/Documentation/devicetree/bindings/arm/cpus.txt @@ -276,7 +276,7 @@ described below. Usage: optional Value type: Definition: A u32 value that represents the running time dynamic - power coefficient in units of mW/MHz/uV^2. The + power coefficient in units of uW/MHz/V^2. The coefficient can either be calculated from power measurements or derived by analysis. @@ -287,7 +287,7 @@ described below. Pdyn = dynamic-power-coefficient * V^2 * f - where voltage is in uV, frequency is in MHz. + where voltage is in V, frequency is in MHz. Example 1 (dual-cluster big.LITTLE system 32-bit): diff --git a/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt b/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt new file mode 100644 index 000000000000..b5cb374dc47d --- /dev/null +++ b/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt @@ -0,0 +1,19 @@ +Freescale DCFG + +DCFG is the device configuration unit, that provides general purpose +configuration and status for the device. Such as setting the secondary +core start address and release the secondary core from holdoff and startup. + +Required properties: + - compatible: Should contain a chip-specific compatible string, + Chip-specific strings are of the form "fsl,-dcfg", + The following s are known to be supported: + ls1012a, ls1021a, ls1043a, ls1046a, ls2080a. + + - reg : should contain base address and length of DCFG memory-mapped registers + +Example: + dcfg: dcfg@1ee0000 { + compatible = "fsl,ls1021a-dcfg"; + reg = <0x0 0x1ee0000 0x0 0x10000>; + }; diff --git a/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-scfg.txt b/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-scfg.txt new file mode 100644 index 000000000000..0ab67b0b216d --- /dev/null +++ b/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-scfg.txt @@ -0,0 +1,19 @@ +Freescale SCFG + +SCFG is the supplemental configuration unit, that provides SoC specific +configuration and status registers for the chip. Such as getting PEX port +status. + +Required properties: + - compatible: Should contain a chip-specific compatible string, + Chip-specific strings are of the form "fsl,-scfg", + The following s are known to be supported: + ls1012a, ls1021a, ls1043a, ls1046a, ls2080a. + + - reg: should contain base address and length of SCFG memory-mapped registers + +Example: + scfg: scfg@1570000 { + compatible = "fsl,ls1021a-scfg"; + reg = <0x0 0x1570000 0x0 0x10000>; + }; diff --git a/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt b/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt new file mode 100644 index 000000000000..46d0af1f0872 --- /dev/null +++ b/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt @@ -0,0 +1,183 @@ +NXP i.MX System Controller Firmware (SCFW) +-------------------------------------------------------------------- + +The System Controller Firmware (SCFW) is a low-level system function +which runs on a dedicated Cortex-M core to provide power, clock, and +resource management. It exists on some i.MX8 processors. e.g. i.MX8QM +(QM, QP), and i.MX8QX (QXP, DX). + +The AP communicates with the SC using a multi-ported MU module found +in the LSIO subsystem. The current definition of this MU module provides +5 remote AP connections to the SC to support up to 5 execution environments +(TZ, HV, standard Linux, etc.). The SC side of this MU module interfaces +with the LSIO DSC IP bus. The SC firmware will communicate with this MU +using the MSI bus. + +System Controller Device Node: +============================================================ + +The scu node with the following properties shall be under the /firmware/ node. + +Required properties: +------------------- +- compatible: should be "fsl,imx-scu". +- mbox-names: should include "tx0", "tx1", "tx2", "tx3", + "rx0", "rx1", "rx2", "rx3". +- mboxes: List of phandle of 4 MU channels for tx and 4 MU channels + for rx. All 8 MU channels must be in the same MU instance. + Cross instances are not allowed. The MU instance can only + be one of LSIO MU0~M4 for imx8qxp and imx8qm. Users need + to make sure use the one which is not conflict with other + execution environments. e.g. ATF. + Note: + Channel 0 must be "tx0" or "rx0". + Channel 1 must be "tx1" or "rx1". + Channel 2 must be "tx2" or "rx2". + Channel 3 must be "tx3" or "rx3". + e.g. + mboxes = <&lsio_mu1 0 0 + &lsio_mu1 0 1 + &lsio_mu1 0 2 + &lsio_mu1 0 3 + &lsio_mu1 1 0 + &lsio_mu1 1 1 + &lsio_mu1 1 2 + &lsio_mu1 1 3>; + See Documentation/devicetree/bindings/mailbox/fsl,mu.txt + for detailed mailbox binding. + +i.MX SCU Client Device Node: +============================================================ + +Client nodes are maintained as children of the relevant IMX-SCU device node. + +Power domain bindings based on SCU Message Protocol +------------------------------------------------------------ + +This binding for the SCU power domain providers uses the generic power +domain binding[2]. + +Required properties: +- compatible: Should be "fsl,scu-pd". +- #address-cells: Should be 1. +- #size-cells: Should be 0. + +Required properties for power domain sub nodes: +- #power-domain-cells: Must be 0. + +Optional Properties: +- reg: Resource ID of this power domain. + No exist means uncontrollable by user. + See detailed Resource ID list from: + include/dt-bindings/power/imx-rsrc.h +- power-domains: phandle pointing to the parent power domain. + +Clock bindings based on SCU Message Protocol +------------------------------------------------------------ + +This binding uses the common clock binding[1]. + +Required properties: +- compatible: Should be "fsl,imx8qxp-clock". +- #clock-cells: Should be 1. Contains the Clock ID value. +- clocks: List of clock specifiers, must contain an entry for + each required entry in clock-names +- clock-names: Should include entries "xtal_32KHz", "xtal_24MHz" + +The clock consumer should specify the desired clock by having the clock +ID in its "clocks" phandle cell. + +See the full list of clock IDs from: +include/dt-bindings/clock/imx8qxp-clock.h + +Pinctrl bindings based on SCU Message Protocol +------------------------------------------------------------ + +This binding uses the i.MX common pinctrl binding[3]. + +Required properties: +- compatible: Should be "fsl,imx8qxp-iomuxc". + +Required properties for Pinctrl sub nodes: +- fsl,pins: Each entry consists of 3 integers which represents + the mux and config setting for one pin. The first 2 + integers are specified using a + PIN_FUNC_ID macro, which can be found in + . + The last integer CONFIG is the pad setting value like + pull-up on this pin. + + Please refer to i.MX8QXP Reference Manual for detailed + CONFIG settings. + +[1] Documentation/devicetree/bindings/clock/clock-bindings.txt +[2] Documentation/devicetree/bindings/power/power_domain.txt +[3] Documentation/devicetree/bindings/pinctrl/fsl,imx-pinctrl.txt + +Example (imx8qxp): +------------- +lsio_mu1: mailbox@5d1c0000 { + ... + #mbox-cells = <2>; +}; + +firmware { + scu { + compatible = "fsl,imx-scu"; + mbox-names = "tx0", "tx1", "tx2", "tx3", + "rx0", "rx1", "rx2", "rx3"; + mboxes = <&lsio_mu1 0 0 + &lsio_mu1 0 1 + &lsio_mu1 0 2 + &lsio_mu1 0 3 + &lsio_mu1 1 0 + &lsio_mu1 1 1 + &lsio_mu1 1 2 + &lsio_mu1 1 3>; + + clk: clk { + compatible = "fsl,imx8qxp-clk"; + #clock-cells = <1>; + }; + + iomuxc { + compatible = "fsl,imx8qxp-iomuxc"; + + pinctrl_lpuart0: lpuart0grp { + fsl,pins = < + SC_P_UART0_RX_ADMA_UART0_RX 0x06000020 + SC_P_UART0_TX_ADMA_UART0_TX 0x06000020 + >; + }; + ... + }; + + imx8qx-pm { + compatible = "fsl,scu-pd"; + #address-cells = <1>; + #size-cells = <0>; + + pd_dma: dma-power-domain { + #power-domain-cells = <0>; + + pd_dma_lpuart0: dma-lpuart0@57 { + reg = ; + #power-domain-cells = <0>; + power-domains = <&pd_dma>; + }; + ... + }; + ... + }; + }; +}; + +serial@5a060000 { + ... + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_lpuart0>; + clocks = <&clk IMX8QXP_UART0_CLK>, + <&clk IMX8QXP_UART0_IPG_CLK>; + clock-names = "per", "ipg"; + power-domains = <&pd_dma_lpuart0>; +}; diff --git a/Documentation/devicetree/bindings/arm/fsl.txt b/Documentation/devicetree/bindings/arm/fsl.txt index 8a1baa2b9723..5074aeecd327 100644 --- a/Documentation/devicetree/bindings/arm/fsl.txt +++ b/Documentation/devicetree/bindings/arm/fsl.txt @@ -57,6 +57,50 @@ i.MX6SLL EVK board Required root node properties: - compatible = "fsl,imx6sll-evk", "fsl,imx6sll"; +i.MX6 Quad Plus SABRE Smart Device Board +Required root node properties: + - compatible = "fsl,imx6qp-sabresd", "fsl,imx6qp"; + +i.MX6 Quad Plus SABRE Automotive Board +Required root node properties: + - compatible = "fsl,imx6qp-sabreauto", "fsl,imx6qp"; + +i.MX6 DualLite SABRE Smart Device Board +Required root node properties: + - compatible = "fsl,imx6dl-sabresd", "fsl,imx6dl"; + +i.MX6 DualLite/Solo SABRE Automotive Board +Required root node properties: + - compatible = "fsl,imx6dl-sabreauto", "fsl,imx6dl"; + +i.MX6 SoloLite EVK Board +Required root node properties: + - compatible = "fsl,imx6sl-evk", "fsl,imx6sl"; + +i.MX6 UltraLite 14x14 EVK Board +Required root node properties: + - compatible = "fsl,imx6ul-14x14-evk", "fsl,imx6ul"; + +i.MX6 UltraLiteLite 14x14 EVK Board +Required root node properties: + - compatible = "fsl,imx6ull-14x14-evk", "fsl,imx6ull"; + +i.MX6 ULZ 14x14 EVK Board +Required root node properties: + - compatible = "fsl,imx6ulz-14x14-evk", "fsl,imx6ull", "fsl,imx6ulz"; + +i.MX6 SoloX SDB Board +Required root node properties: + - compatible = "fsl,imx6sx-sdb", "fsl,imx6sx"; + +i.MX6 SoloX Sabre Auto Board +Required root node properties: + - compatible = "fsl,imx6sx-sabreauto", "fsl,imx6sx"; + +i.MX7 SabreSD Board +Required root node properties: + - compatible = "fsl,imx7d-sdb", "fsl,imx7d"; + Generic i.MX boards ------------------- @@ -101,45 +145,6 @@ Freescale LS1021A Platform Device Tree Bindings Required root node compatible properties: - compatible = "fsl,ls1021a"; -Freescale SoC-specific Device Tree Bindings -------------------------------------------- - -Freescale SCFG - SCFG is the supplemental configuration unit, that provides SoC specific -configuration and status registers for the chip. Such as getting PEX port -status. - Required properties: - - compatible: Should contain a chip-specific compatible string, - Chip-specific strings are of the form "fsl,-scfg", - The following s are known to be supported: - ls1012a, ls1021a, ls1043a, ls1046a, ls2080a. - - - reg: should contain base address and length of SCFG memory-mapped registers - -Example: - scfg: scfg@1570000 { - compatible = "fsl,ls1021a-scfg"; - reg = <0x0 0x1570000 0x0 0x10000>; - }; - -Freescale DCFG - DCFG is the device configuration unit, that provides general purpose -configuration and status for the device. Such as setting the secondary -core start address and release the secondary core from holdoff and startup. - Required properties: - - compatible: Should contain a chip-specific compatible string, - Chip-specific strings are of the form "fsl,-dcfg", - The following s are known to be supported: - ls1012a, ls1021a, ls1043a, ls1046a, ls2080a. - - - reg : should contain base address and length of DCFG memory-mapped registers - -Example: - dcfg: dcfg@1ee0000 { - compatible = "fsl,ls1021a-dcfg"; - reg = <0x0 0x1ee0000 0x0 0x10000>; - }; - Freescale ARMv8 based Layerscape SoC family Device Tree Bindings ---------------------------------------------------------------- diff --git a/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt b/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt index 199cd36fe1ba..a97f643e7d1c 100644 --- a/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt +++ b/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt @@ -8,6 +8,14 @@ HiKey960 Board Required root node properties: - compatible = "hisilicon,hi3660-hikey960", "hisilicon,hi3660"; +Hi3670 SoC +Required root node properties: + - compatible = "hisilicon,hi3670"; + +HiKey970 Board +Required root node properties: + - compatible = "hisilicon,hi3670-hikey970", "hisilicon,hi3670"; + Hi3798cv200 SoC Required root node properties: - compatible = "hisilicon,hi3798cv200"; diff --git a/Documentation/devicetree/bindings/arm/keystone/ti,sci.txt b/Documentation/devicetree/bindings/arm/keystone/ti,sci.txt index 31f5f9a104cc..b56a02c10ae6 100644 --- a/Documentation/devicetree/bindings/arm/keystone/ti,sci.txt +++ b/Documentation/devicetree/bindings/arm/keystone/ti,sci.txt @@ -45,11 +45,15 @@ Optional Properties: debug_messages - Map the Debug message region - reg: register space corresponding to the debug_messages - ti,system-reboot-controller: If system reboot can be triggered by SoC reboot +- ti,host-id: Integer value corresponding to the host ID assigned by Firmware + for identification of host processing entities such as virtual + machines Example (K2G): ------------- pmmc: pmmc { compatible = "ti,k2g-sci"; + ti,host-id = <2>; mbox-names = "rx", "tx"; mboxes= <&msgmgr &msgmgr_proxy_pmmc_rx>, <&msgmgr &msgmgr_proxy_pmmc_tx>; diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt index b404d592ce58..4e4a3c0ab9ab 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt @@ -10,6 +10,7 @@ Required Properties: - "mediatek,mt2712-apmixedsys", "syscon" - "mediatek,mt6797-apmixedsys" - "mediatek,mt7622-apmixedsys" + - "mediatek,mt7623-apmixedsys", "mediatek,mt2701-apmixedsys" - "mediatek,mt8135-apmixedsys" - "mediatek,mt8173-apmixedsys" - #clock-cells: Must be 1 diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt index 34a69ba67f13..d1606b2c3e63 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt @@ -8,6 +8,7 @@ Required Properties: - compatible: Should be one of: - "mediatek,mt2701-audsys", "syscon" - "mediatek,mt7622-audsys", "syscon" + - "mediatek,mt7623-audsys", "mediatek,mt2701-audsys", "syscon" - #clock-cells: Must be 1 The AUDSYS controller uses the common clk binding from diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,bdpsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,bdpsys.txt index 4010e37c53a0..149567a38215 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,bdpsys.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,bdpsys.txt @@ -8,6 +8,7 @@ Required Properties: - compatible: Should be: - "mediatek,mt2701-bdpsys", "syscon" - "mediatek,mt2712-bdpsys", "syscon" + - "mediatek,mt7623-bdpsys", "mediatek,mt2701-bdpsys", "syscon" - #clock-cells: Must be 1 The bdpsys controller uses the common clk binding from diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,ethsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,ethsys.txt index 8f5335b480ac..f17cfe64255d 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,ethsys.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,ethsys.txt @@ -8,6 +8,7 @@ Required Properties: - compatible: Should be: - "mediatek,mt2701-ethsys", "syscon" - "mediatek,mt7622-ethsys", "syscon" + - "mediatek,mt7623-ethsys", "mediatek,mt2701-ethsys", "syscon" - #clock-cells: Must be 1 - #reset-cells: Must be 1 diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,hifsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,hifsys.txt index f5629d64cef2..323905af82c3 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,hifsys.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,hifsys.txt @@ -9,6 +9,7 @@ Required Properties: - compatible: Should be: - "mediatek,mt2701-hifsys", "syscon" - "mediatek,mt7622-hifsys", "syscon" + - "mediatek,mt7623-hifsys", "mediatek,mt2701-hifsys", "syscon" - #clock-cells: Must be 1 The hifsys controller uses the common clk binding from diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,imgsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,imgsys.txt index 868bd51a98be..3f99672163e3 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,imgsys.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,imgsys.txt @@ -9,6 +9,7 @@ Required Properties: - "mediatek,mt2701-imgsys", "syscon" - "mediatek,mt2712-imgsys", "syscon" - "mediatek,mt6797-imgsys", "syscon" + - "mediatek,mt7623-imgsys", "mediatek,mt2701-imgsys", "syscon" - "mediatek,mt8173-imgsys", "syscon" - #clock-cells: Must be 1 diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt index 566f153f9f83..89f4272a1441 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt @@ -11,6 +11,7 @@ Required Properties: - "mediatek,mt2712-infracfg", "syscon" - "mediatek,mt6797-infracfg", "syscon" - "mediatek,mt7622-infracfg", "syscon" + - "mediatek,mt7623-infracfg", "mediatek,mt2701-infracfg", "syscon" - "mediatek,mt8135-infracfg", "syscon" - "mediatek,mt8173-infracfg", "syscon" - #clock-cells: Must be 1 diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.txt index 4eb8bbe15c01..15d977afad31 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.txt @@ -9,6 +9,7 @@ Required Properties: - "mediatek,mt2701-mmsys", "syscon" - "mediatek,mt2712-mmsys", "syscon" - "mediatek,mt6797-mmsys", "syscon" + - "mediatek,mt7623-mmsys", "mediatek,mt2701-mmsys", "syscon" - "mediatek,mt8173-mmsys", "syscon" - #clock-cells: Must be 1 diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.txt index fb58ca8c2770..6755514deb80 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.txt @@ -10,6 +10,7 @@ Required Properties: - "mediatek,mt2701-pericfg", "syscon" - "mediatek,mt2712-pericfg", "syscon" - "mediatek,mt7622-pericfg", "syscon" + - "mediatek,mt7623-pericfg", "mediatek,mt2701-pericfg", "syscon" - "mediatek,mt8135-pericfg", "syscon" - "mediatek,mt8173-pericfg", "syscon" - #clock-cells: Must be 1 diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt index 24014a7e2332..d849465b8c99 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt @@ -10,6 +10,7 @@ Required Properties: - "mediatek,mt2712-topckgen", "syscon" - "mediatek,mt6797-topckgen" - "mediatek,mt7622-topckgen" + - "mediatek,mt7623-topckgen", "mediatek,mt2701-topckgen" - "mediatek,mt8135-topckgen" - "mediatek,mt8173-topckgen" - #clock-cells: Must be 1 diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,vdecsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,vdecsys.txt index ea40d05089f8..3212afc753c8 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,vdecsys.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,vdecsys.txt @@ -9,6 +9,7 @@ Required Properties: - "mediatek,mt2701-vdecsys", "syscon" - "mediatek,mt2712-vdecsys", "syscon" - "mediatek,mt6797-vdecsys", "syscon" + - "mediatek,mt7623-vdecsys", "mediatek,mt2701-vdecsys", "syscon" - "mediatek,mt8173-vdecsys", "syscon" - #clock-cells: Must be 1 diff --git a/Documentation/devicetree/bindings/arm/msm/qcom,kpss-acc.txt b/Documentation/devicetree/bindings/arm/msm/qcom,kpss-acc.txt index 1333db9acfee..7f696362a4a1 100644 --- a/Documentation/devicetree/bindings/arm/msm/qcom,kpss-acc.txt +++ b/Documentation/devicetree/bindings/arm/msm/qcom,kpss-acc.txt @@ -21,10 +21,29 @@ PROPERTIES the register region. An optional second element specifies the base address and size of the alias register region. +- clocks: + Usage: required + Value type: + Definition: reference to the pll parents. + +- clock-names: + Usage: required + Value type: + Definition: must be "pll8_vote", "pxo". + +- clock-output-names: + Usage: optional + Value type: + Definition: Name of the output clock. Typically acpuX_aux where X is a + CPU number starting at 0. + Example: clock-controller@2088000 { compatible = "qcom,kpss-acc-v2"; reg = <0x02088000 0x1000>, <0x02008000 0x1000>; + clocks = <&gcc PLL8_VOTE>, <&gcc PXO_SRC>; + clock-names = "pll8_vote", "pxo"; + clock-output-names = "acpu0_aux"; }; diff --git a/Documentation/devicetree/bindings/arm/msm/qcom,kpss-gcc.txt b/Documentation/devicetree/bindings/arm/msm/qcom,kpss-gcc.txt new file mode 100644 index 000000000000..e628758950e1 --- /dev/null +++ b/Documentation/devicetree/bindings/arm/msm/qcom,kpss-gcc.txt @@ -0,0 +1,44 @@ +Krait Processor Sub-system (KPSS) Global Clock Controller (GCC) + +PROPERTIES + +- compatible: + Usage: required + Value type: + Definition: should be one of the following. The generic compatible + "qcom,kpss-gcc" should also be included. + "qcom,kpss-gcc-ipq8064", "qcom,kpss-gcc" + "qcom,kpss-gcc-apq8064", "qcom,kpss-gcc" + "qcom,kpss-gcc-msm8974", "qcom,kpss-gcc" + "qcom,kpss-gcc-msm8960", "qcom,kpss-gcc" + +- reg: + Usage: required + Value type: + Definition: base address and size of the register region + +- clocks: + Usage: required + Value type: + Definition: reference to the pll parents. + +- clock-names: + Usage: required + Value type: + Definition: must be "pll8_vote", "pxo". + +- clock-output-names: + Usage: required + Value type: + Definition: Name of the output clock. Typically acpu_l2_aux indicating + an L2 cache auxiliary clock. + +Example: + + l2cc: clock-controller@2011000 { + compatible = "qcom,kpss-gcc-ipq8064", "qcom,kpss-gcc"; + reg = <0x2011000 0x1000>; + clocks = <&gcc PLL8_VOTE>, <&gcc PXO_SRC>; + clock-names = "pll8_vote", "pxo"; + clock-output-names = "acpu_l2_aux"; + }; diff --git a/Documentation/devicetree/bindings/arm/msm/qcom,llcc.txt b/Documentation/devicetree/bindings/arm/msm/qcom,llcc.txt index 5e85749262ae..eaee06b2d8f2 100644 --- a/Documentation/devicetree/bindings/arm/msm/qcom,llcc.txt +++ b/Documentation/devicetree/bindings/arm/msm/qcom,llcc.txt @@ -16,11 +16,26 @@ Properties: - reg: Usage: required Value Type: - Definition: Start address and the the size of the register region. + Definition: The first element specifies the llcc base start address and + the size of the register region. The second element specifies + the llcc broadcast base address and size of the register region. + +- reg-names: + Usage: required + Value Type: + Definition: Register region names. Must be "llcc_base", "llcc_broadcast_base". + +- interrupts: + Usage: required + Definition: The interrupt is associated with the llcc edac device. + It's used for llcc cache single and double bit error detection + and reporting. Example: cache-controller@1100000 { compatible = "qcom,sdm845-llcc"; - reg = <0x1100000 0x250000>; + reg = <0x1100000 0x200000>, <0x1300000 0x50000> ; + reg-names = "llcc_base", "llcc_broadcast_base"; + interrupts = ; }; diff --git a/Documentation/devicetree/bindings/arm/rockchip.txt b/Documentation/devicetree/bindings/arm/rockchip.txt index acfd3c773dd0..0cc71236d639 100644 --- a/Documentation/devicetree/bindings/arm/rockchip.txt +++ b/Documentation/devicetree/bindings/arm/rockchip.txt @@ -5,6 +5,10 @@ Rockchip platforms device tree bindings Required root node properties: - compatible = "vamrs,ficus", "rockchip,rk3399"; +- 96boards RK3399 Rock960 (ROCK960 Consumer Edition) + Required root node properties: + - compatible = "vamrs,rock960", "rockchip,rk3399"; + - Amarula Vyasa RK3288 board Required root node properties: - compatible = "amarula,vyasa-rk3288", "rockchip,rk3288"; @@ -13,6 +17,10 @@ Rockchip platforms device tree bindings Required root node properties: - compatible = "asus,rk3288-tinker", "rockchip,rk3288"; +- Asus Tinker board S + Required root node properties: + - compatible = "asus,rk3288-tinker-s", "rockchip,rk3288"; + - Kylin RK3036 board: Required root node properties: - compatible = "rockchip,kylin-rk3036", "rockchip,rk3036"; @@ -59,6 +67,10 @@ Rockchip platforms device tree bindings Required root node properties: - compatible = "firefly,roc-rk3328-cc", "rockchip,rk3328"; +- Firefly ROC-RK3399-PC board: + Required root node properties: + - compatible = "firefly,roc-rk3399-pc", "rockchip,rk3399"; + - ChipSPARK PopMetal-RK3288 board: Required root node properties: - compatible = "chipspark,popmetal-rk3288", "rockchip,rk3288"; @@ -160,6 +172,10 @@ Rockchip platforms device tree bindings Required root node properties: - compatible = "pine64,rock64", "rockchip,rk3328"; +- Pine64 RockPro64 board: + Required root node properties: + - compatible = "pine64,rockpro64", "rockchip,rk3399"; + - Rockchip PX3 Evaluation board: Required root node properties: - compatible = "rockchip,px3-evb", "rockchip,px3", "rockchip,rk3188"; @@ -168,6 +184,10 @@ Rockchip platforms device tree bindings Required root node properties: - compatible = "rockchip,px5-evb", "rockchip,px5", "rockchip,rk3368"; +- Rockchip PX30 Evaluation board: + Required root node properties: + - compatible = "rockchip,px30-evb", "rockchip,px30"; + - Rockchip RV1108 Evaluation board Required root node properties: - compatible = "rockchip,rv1108-evb", "rockchip,rv1108"; diff --git a/Documentation/devicetree/bindings/arm/scu.txt b/Documentation/devicetree/bindings/arm/scu.txt index 08a587875996..74d0a780ce51 100644 --- a/Documentation/devicetree/bindings/arm/scu.txt +++ b/Documentation/devicetree/bindings/arm/scu.txt @@ -22,7 +22,7 @@ References: Example: -scu@a04100000 { +scu@a0410000 { compatible = "arm,cortex-a9-scu"; reg = <0xa0410000 0x100>; }; diff --git a/Documentation/devicetree/bindings/arm/secure.txt b/Documentation/devicetree/bindings/arm/secure.txt index e31303fb233a..f27bbff2c780 100644 --- a/Documentation/devicetree/bindings/arm/secure.txt +++ b/Documentation/devicetree/bindings/arm/secure.txt @@ -32,7 +32,8 @@ describe the view of Secure world using the standard bindings. These secure- bindings only need to be used where both the Secure and Normal world views need to be described in a single device tree. -Valid Secure world properties: +Valid Secure world properties +----------------------------- - secure-status : specifies whether the device is present and usable in the secure world. The combination of this with "status" allows @@ -51,3 +52,19 @@ Valid Secure world properties: status = "disabled"; secure-status = "okay"; /* S-only */ status = "disabled"; /* disabled in both */ status = "disabled"; secure-status = "disabled"; /* disabled in both */ + +The secure-chosen node +---------------------- + +Similar to the /chosen node which serves as a place for passing data +between firmware and the operating system, the /secure-chosen node may +be used to pass data to the Secure OS. Only the properties defined +below may appear in the /secure-chosen node. + +- stdout-path : specifies the device to be used by the Secure OS for + its console output. The syntax is the same as for /chosen/stdout-path. + If the /secure-chosen node exists but the stdout-path property is not + present, the Secure OS should not perform any console output. If + /secure-chosen does not exist, the Secure OS should use the value of + /chosen/stdout-path instead (that is, use the same device as the + Normal world OS). diff --git a/Documentation/devicetree/bindings/arm/shmobile.txt b/Documentation/devicetree/bindings/arm/shmobile.txt index 89b4a389fbc7..58c4256d37a3 100644 --- a/Documentation/devicetree/bindings/arm/shmobile.txt +++ b/Documentation/devicetree/bindings/arm/shmobile.txt @@ -7,6 +7,8 @@ SoCs: compatible = "renesas,emev2" - RZ/A1H (R7S72100) compatible = "renesas,r7s72100" + - RZ/A2 (R7S9210) + compatible = "renesas,r7s9210" - SH-Mobile AG5 (R8A73A00/SH73A0) compatible = "renesas,sh73a0" - R-Mobile APE6 (R8A73A40) @@ -23,6 +25,10 @@ SoCs: compatible = "renesas,r8a7745" - RZ/G1C (R8A77470) compatible = "renesas,r8a77470" + - RZ/G2M (R8A774A1) + compatible = "renesas,r8a774a1" + - RZ/G2E (R8A774C0) + compatible = "renesas,r8a774c0" - R-Car M1A (R8A77781) compatible = "renesas,r8a7778" - R-Car H1 (R8A77790) @@ -107,6 +113,8 @@ Boards: compatible = "renesas,lager", "renesas,r8a7790" - M3ULCB (R-Car Starter Kit Pro, RTP0RC7796SKBX0010SA09 (M3 ES1.0)) compatible = "renesas,m3ulcb", "renesas,r8a7796" + - M3NULCB (R-Car Starter Kit Pro, RTP0RC77965SKBX010SA00 (M3-N ES1.1)) + compatible = "renesas,m3nulcb", "renesas,r8a77965" - Marzen (R0P7779A00010S) compatible = "renesas,marzen", "renesas,r8a7779" - Porter (M2-LCDP) @@ -143,12 +151,12 @@ Boards: compatible = "renesas,wheat", "renesas,r8a7792" -Most Renesas ARM SoCs have a Product Register that allows to retrieve SoC -product and revision information. If present, a device node for this register -should be added. +Most Renesas ARM SoCs have a Product Register or Boundary Scan ID Register that +allows to retrieve SoC product and revision information. If present, a device +node for this register should be added. Required properties: - - compatible: Must be "renesas,prr". + - compatible: Must be "renesas,prr" or "renesas,bsid" - reg: Base address and length of the register block. diff --git a/Documentation/devicetree/bindings/arm/marvell/marvell,berlin.txt b/Documentation/devicetree/bindings/arm/syna.txt similarity index 89% rename from Documentation/devicetree/bindings/arm/marvell/marvell,berlin.txt rename to Documentation/devicetree/bindings/arm/syna.txt index 3bab18409b7a..2face46a5f64 100644 --- a/Documentation/devicetree/bindings/arm/marvell/marvell,berlin.txt +++ b/Documentation/devicetree/bindings/arm/syna.txt @@ -1,4 +1,9 @@ -Marvell Berlin SoC Family Device Tree Bindings +Synaptics SoC Device Tree Bindings + +According to https://www.synaptics.com/company/news/conexant-marvell +Synaptics has acquired the Multimedia Solutions Business of Marvell, so +berlin SoCs are now Synaptics' SoCs now. + --------------------------------------------------------------- Work in progress statement: @@ -13,6 +18,10 @@ stable binding/ABI. --------------------------------------------------------------- +Boards with the Synaptics AS370 SoC shall have the following properties: + Required root node property: + compatible: "syna,as370" + Boards with a SoC of the Marvell Berlin family, e.g. Armada 1500 shall have the following properties: diff --git a/Documentation/devicetree/bindings/arm/tegra.txt b/Documentation/devicetree/bindings/arm/tegra.txt index 32f62bb7006d..c59b15f64346 100644 --- a/Documentation/devicetree/bindings/arm/tegra.txt +++ b/Documentation/devicetree/bindings/arm/tegra.txt @@ -47,12 +47,17 @@ board-specific compatible values: nvidia,ventana toradex,apalis_t30 toradex,apalis_t30-eval + toradex,apalis_t30-v1.1 + toradex,apalis_t30-v1.1-eval toradex,apalis-tk1 toradex,apalis-tk1-eval - toradex,colibri_t20-512 + toradex,apalis-tk1-v1.2 + toradex,apalis-tk1-v1.2-eval + toradex,colibri_t20 + toradex,colibri_t20-eval-v3 + toradex,colibri_t20-iris toradex,colibri_t30 toradex,colibri_t30-eval-v3 - toradex,iris Trusted Foundations ------------------------------------------- diff --git a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra186-pmc.txt b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra186-pmc.txt index 5a3bf7c5a7a0..c9fd6d1de57e 100644 --- a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra186-pmc.txt +++ b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra186-pmc.txt @@ -34,3 +34,96 @@ Board DTS: pmc@c360000 { nvidia,invert-interrupt; }; + +== Pad Control == + +On Tegra SoCs a pad is a set of pins which are configured as a group. +The pin grouping is a fixed attribute of the hardware. The PMC can be +used to set pad power state and signaling voltage. A pad can be either +in active or power down mode. The support for power state and signaling +voltage configuration varies depending on the pad in question. 3.3 V and +1.8 V signaling voltages are supported on pins where software +controllable signaling voltage switching is available. + +Pad configurations are described with pin configuration nodes which +are placed under the pmc node and they are referred to by the pinctrl +client properties. For more information see +Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt. + +The following pads are present on Tegra186: +csia csib dsi mipi-bias +pex-clk-bias pex-clk3 pex-clk2 pex-clk1 +usb0 usb1 usb2 usb-bias +uart audio hsic dbg +hdmi-dp0 hdmi-dp1 pex-cntrl sdmmc2-hv +sdmmc4 cam dsib dsic +dsid csic csid csie +dsif spi ufs dmic-hv +edp sdmmc1-hv sdmmc3-hv conn +audio-hv ao-hv + +Required pin configuration properties: + - pins: A list of strings, each of which contains the name of a pad + to be configured. + +Optional pin configuration properties: + - low-power-enable: Configure the pad into power down mode + - low-power-disable: Configure the pad into active mode + - power-source: Must contain either TEGRA_IO_PAD_VOLTAGE_1V8 or + TEGRA_IO_PAD_VOLTAGE_3V3 to select between signaling voltages. + The values are defined in + include/dt-bindings/pinctrl/pinctrl-tegra-io-pad.h. + +Note: The power state can be configured on all of the above pads except + for ao-hv. Following pads have software configurable signaling + voltages: sdmmc2-hv, dmic-hv, sdmmc1-hv, sdmmc3-hv, audio-hv, + ao-hv. + +Pad configuration state example: + pmc: pmc@7000e400 { + compatible = "nvidia,tegra186-pmc"; + reg = <0 0x0c360000 0 0x10000>, + <0 0x0c370000 0 0x10000>, + <0 0x0c380000 0 0x10000>, + <0 0x0c390000 0 0x10000>; + reg-names = "pmc", "wake", "aotag", "scratch"; + + ... + + sdmmc1_3v3: sdmmc1-3v3 { + pins = "sdmmc1-hv"; + power-source = ; + }; + + sdmmc1_1v8: sdmmc1-1v8 { + pins = "sdmmc1-hv"; + power-source = ; + }; + + hdmi_off: hdmi-off { + pins = "hdmi"; + low-power-enable; + } + + hdmi_on: hdmi-on { + pins = "hdmi"; + low-power-disable; + } + }; + +Pinctrl client example: + sdmmc1: sdhci@3400000 { + ... + pinctrl-names = "sdmmc-3v3", "sdmmc-1v8"; + pinctrl-0 = <&sdmmc1_3v3>; + pinctrl-1 = <&sdmmc1_1v8>; + }; + + ... + + sor0: sor@15540000 { + ... + pinctrl-0 = <&hdmi_off>; + pinctrl-1 = <&hdmi_on>; + pinctrl-names = "hdmi-on", "hdmi-off"; + }; diff --git a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra20-pmc.txt b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra20-pmc.txt index a74b37b07e5c..cb12f33a247f 100644 --- a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra20-pmc.txt +++ b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra20-pmc.txt @@ -195,3 +195,106 @@ Example: power-domains = <&pd_audio>; ... }; + +== Pad Control == + +On Tegra SoCs a pad is a set of pins which are configured as a group. +The pin grouping is a fixed attribute of the hardware. The PMC can be +used to set pad power state and signaling voltage. A pad can be either +in active or power down mode. The support for power state and signaling +voltage configuration varies depending on the pad in question. 3.3 V and +1.8 V signaling voltages are supported on pins where software +controllable signaling voltage switching is available. + +The pad configuration state nodes are placed under the pmc node and they +are referred to by the pinctrl client properties. For more information +see Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt. +The pad name should be used as the value of the pins property in pin +configuration nodes. + +The following pads are present on Tegra124 and Tegra132: +audio bb cam comp +csia csb cse dsi +dsib dsic dsid hdmi +hsic hv lvds mipi-bias +nand pex-bias pex-clk1 pex-clk2 +pex-cntrl sdmmc1 sdmmc3 sdmmc4 +sys_ddc uart usb0 usb1 +usb2 usb_bias + +The following pads are present on Tegra210: +audio audio-hv cam csia +csib csic csid csie +csif dbg debug-nonao dmic +dp dsi dsib dsic +dsid emmc emmc2 gpio +hdmi hsic lvds mipi-bias +pex-bias pex-clk1 pex-clk2 pex-cntrl +sdmmc1 sdmmc3 spi spi-hv +uart usb0 usb1 usb2 +usb3 usb-bias + +Required pin configuration properties: + - pins: Must contain name of the pad(s) to be configured. + +Optional pin configuration properties: + - low-power-enable: Configure the pad into power down mode + - low-power-disable: Configure the pad into active mode + - power-source: Must contain either TEGRA_IO_PAD_VOLTAGE_1V8 + or TEGRA_IO_PAD_VOLTAGE_3V3 to select between signaling voltages. + The values are defined in + include/dt-bindings/pinctrl/pinctrl-tegra-io-pad.h. + +Note: The power state can be configured on all of the Tegra124 and + Tegra132 pads. None of the Tegra124 or Tegra132 pads support + signaling voltage switching. + +Note: All of the listed Tegra210 pads except pex-cntrl support power + state configuration. Signaling voltage switching is supported on + following Tegra210 pads: audio, audio-hv, cam, dbg, dmic, gpio, + pex-cntrl, sdmmc1, sdmmc3, spi, spi-hv, and uart. + +Pad configuration state example: + pmc: pmc@7000e400 { + compatible = "nvidia,tegra210-pmc"; + reg = <0x0 0x7000e400 0x0 0x400>; + clocks = <&tegra_car TEGRA210_CLK_PCLK>, <&clk32k_in>; + clock-names = "pclk", "clk32k_in"; + + ... + + sdmmc1_3v3: sdmmc1-3v3 { + pins = "sdmmc1"; + power-source = ; + }; + + sdmmc1_1v8: sdmmc1-1v8 { + pins = "sdmmc1"; + power-source = ; + }; + + hdmi_off: hdmi-off { + pins = "hdmi"; + low-power-enable; + } + + hdmi_on: hdmi-on { + pins = "hdmi"; + low-power-disable; + } + }; + +Pinctrl client example: + sdmmc1: sdhci@700b0000 { + ... + pinctrl-names = "sdmmc-3v3", "sdmmc-1v8"; + pinctrl-0 = <&sdmmc1_3v3>; + pinctrl-1 = <&sdmmc1_1v8>; + }; + ... + sor@54540000 { + ... + pinctrl-0 = <&hdmi_off>; + pinctrl-1 = <&hdmi_on>; + pinctrl-names = "hdmi-on", "hdmi-off"; + }; diff --git a/Documentation/devicetree/bindings/arm/ux500/boards.txt b/Documentation/devicetree/bindings/arm/ux500/boards.txt index 0fa429534f49..89408de55bfd 100644 --- a/Documentation/devicetree/bindings/arm/ux500/boards.txt +++ b/Documentation/devicetree/bindings/arm/ux500/boards.txt @@ -60,7 +60,7 @@ Example: <0xa0410100 0x100>; }; - scu@a04100000 { + scu@a0410000 { compatible = "arm,cortex-a9-scu"; reg = <0xa0410000 0x100>; }; diff --git a/Documentation/devicetree/bindings/arm/zte,sysctrl.txt b/Documentation/devicetree/bindings/arm/zte,sysctrl.txt new file mode 100644 index 000000000000..7e66b7f7ba96 --- /dev/null +++ b/Documentation/devicetree/bindings/arm/zte,sysctrl.txt @@ -0,0 +1,30 @@ +ZTE sysctrl Registers + +Registers for 'zte,zx296702' SoC: + +System management required properties: + - compatible = "zte,sysctrl" + +Low power management required properties: + - compatible = "zte,zx296702-pcu" + +Bus matrix required properties: + - compatible = "zte,zx-bus-matrix" + + +Registers for 'zte,zx296718' SoC: + +System management required properties: + - compatible = "zte,zx296718-aon-sysctrl" + - compatible = "zte,zx296718-sysctrl" + +Example: +aon_sysctrl: aon-sysctrl@116000 { + compatible = "zte,zx296718-aon-sysctrl", "syscon"; + reg = <0x116000 0x1000>; +}; + +sysctrl: sysctrl@1463000 { + compatible = "zte,zx296718-sysctrl", "syscon"; + reg = <0x1463000 0x1000>; +}; diff --git a/Documentation/devicetree/bindings/arm/zte.txt b/Documentation/devicetree/bindings/arm/zte.txt index 83369785d29c..340612794a37 100644 --- a/Documentation/devicetree/bindings/arm/zte.txt +++ b/Documentation/devicetree/bindings/arm/zte.txt @@ -1,20 +1,10 @@ ZTE platforms device tree bindings ---------------------------------------- +--------------------------------------- - ZX296702 board: Required root node properties: - compatible = "zte,zx296702-ad1", "zte,zx296702" -System management required properties: - - compatible = "zte,sysctrl" - -Low power management required properties: - - compatible = "zte,zx296702-pcu" - -Bus matrix required properties: - - compatible = "zte,zx-bus-matrix" - - --------------------------------------- - ZX296718 SoC: Required root node properties: @@ -22,18 +12,3 @@ Bus matrix required properties: ZX296718 EVB board: - "zte,zx296718-evb" - -System management required properties: - - compatible = "zte,zx296718-aon-sysctrl" - - compatible = "zte,zx296718-sysctrl" - -Example: -aon_sysctrl: aon-sysctrl@116000 { - compatible = "zte,zx296718-aon-sysctrl", "syscon"; - reg = <0x116000 0x1000>; -}; - -sysctrl: sysctrl@1463000 { - compatible = "zte,zx296718-sysctrl", "syscon"; - reg = <0x1463000 0x1000>; -}; diff --git a/Documentation/devicetree/bindings/ata/ahci-platform.txt b/Documentation/devicetree/bindings/ata/ahci-platform.txt index 5d5bd456d9d9..e30fd106df4f 100644 --- a/Documentation/devicetree/bindings/ata/ahci-platform.txt +++ b/Documentation/devicetree/bindings/ata/ahci-platform.txt @@ -10,6 +10,7 @@ PHYs. Required properties: - compatible : compatible string, one of: - "allwinner,sun4i-a10-ahci" + - "allwinner,sun8i-r40-ahci" - "brcm,iproc-ahci" - "hisilicon,hisi-ahci" - "cavium,octeon-7130-ahci" @@ -31,8 +32,10 @@ Optional properties: - clocks : a list of phandle + clock specifier pairs - resets : a list of phandle + reset specifier pairs - target-supply : regulator for SATA target power +- phy-supply : regulator for PHY power - phys : reference to the SATA PHY node - phy-names : must be "sata-phy" +- ahci-supply : regulator for AHCI controller - ports-implemented : Mask that indicates which ports that the HBA supports are available for software to use. Useful if PORTS_IMPL is not programmed by the BIOS, which is true with @@ -42,12 +45,13 @@ Required properties when using sub-nodes: - #address-cells : number of cells to encode an address - #size-cells : number of cells representing the size of an address +For allwinner,sun8i-r40-ahci, the reset propertie must be present. Sub-nodes required properties: - reg : the port number And at least one of the following properties: - phys : reference to the SATA PHY node -- target-supply : regulator for SATA target power +- target-supply : regulator for SATA target power Examples: sata@ffe08000 { diff --git a/Documentation/devicetree/bindings/ata/brcm,sata-brcm.txt b/Documentation/devicetree/bindings/ata/brcm,sata-brcm.txt index 0a5b3b47f217..7713a413c6a7 100644 --- a/Documentation/devicetree/bindings/ata/brcm,sata-brcm.txt +++ b/Documentation/devicetree/bindings/ata/brcm,sata-brcm.txt @@ -9,6 +9,7 @@ Required properties: "brcm,bcm7445-ahci" "brcm,bcm-nsp-ahci" "brcm,sata3-ahci" + "brcm,bcm63138-ahci" - reg : register mappings for AHCI and SATA_TOP_CTRL - reg-names : "ahci" and "top-ctrl" - interrupts : interrupt mapping for SATA IRQ diff --git a/Documentation/devicetree/bindings/clock/actions,owl-cmu.txt b/Documentation/devicetree/bindings/clock/actions,owl-cmu.txt index d1e60d297387..2ef86ae96df8 100644 --- a/Documentation/devicetree/bindings/clock/actions,owl-cmu.txt +++ b/Documentation/devicetree/bindings/clock/actions,owl-cmu.txt @@ -13,6 +13,7 @@ Required Properties: region. - clocks: Reference to the parent clocks ("hosc", "losc") - #clock-cells: should be 1. +- #reset-cells: should be 1. Each clock is assigned an identifier, and client nodes can use this identifier to specify the clock which they consume. @@ -36,6 +37,7 @@ Example: Clock Management Unit node: reg = <0x0 0xe0160000 0x0 0x1000>; clocks = <&hosc>, <&losc>; #clock-cells = <1>; + #reset-cells = <1>; }; Example: UART controller node that consumes clock generated by the clock diff --git a/Documentation/devicetree/bindings/clock/at91-clock.txt b/Documentation/devicetree/bindings/clock/at91-clock.txt index 8f8f95056f3d..e9f70fcdfe80 100644 --- a/Documentation/devicetree/bindings/clock/at91-clock.txt +++ b/Documentation/devicetree/bindings/clock/at91-clock.txt @@ -4,6 +4,8 @@ This binding uses the common clock binding[1]. [1] Documentation/devicetree/bindings/clock/clock-bindings.txt +Slow Clock controller: + Required properties: - compatible : shall be one of the following: "atmel,at91sam9x5-sckc" or @@ -16,84 +18,6 @@ Required properties: "atmel,at91sam9x5-clk-slow-rc-osc": at91 internal slow RC oscillator - - "atmel,-pmc": - at91 PMC (Power Management Controller) - All at91 specific clocks (clocks defined below) must be child - node of the PMC node. - can be: at91rm9200, at91sam9260, at91sam9261, - at91sam9263, at91sam9g45, at91sam9n12, at91sam9rl, at91sam9x5, - sama5d2, sama5d3 or sama5d4. - - "atmel,at91sam9x5-clk-slow" (under sckc node) - or - "atmel,at91sam9260-clk-slow" (under pmc node): - at91 slow clk - - "atmel,at91rm9200-clk-main-osc" - "atmel,at91sam9x5-clk-main-rc-osc" - at91 main clk sources - - "atmel,at91sam9x5-clk-main" - "atmel,at91rm9200-clk-main": - at91 main clock - - "atmel,at91rm9200-clk-master" or - "atmel,at91sam9x5-clk-master": - at91 master clock - - "atmel,at91sam9x5-clk-peripheral" or - "atmel,at91rm9200-clk-peripheral": - at91 peripheral clocks - - "atmel,at91rm9200-clk-pll" or - "atmel,at91sam9g45-clk-pll" or - "atmel,at91sam9g20-clk-pllb" or - "atmel,sama5d3-clk-pll": - at91 pll clocks - - "atmel,at91sam9x5-clk-plldiv": - at91 plla divisor - - "atmel,at91rm9200-clk-programmable" or - "atmel,at91sam9g45-clk-programmable" or - "atmel,at91sam9x5-clk-programmable": - at91 programmable clocks - - "atmel,at91sam9x5-clk-smd": - at91 SMD (Soft Modem) clock - - "atmel,at91rm9200-clk-system": - at91 system clocks - - "atmel,at91rm9200-clk-usb" or - "atmel,at91sam9x5-clk-usb" or - "atmel,at91sam9n12-clk-usb": - at91 usb clock - - "atmel,at91sam9x5-clk-utmi": - at91 utmi clock - - "atmel,sama5d4-clk-h32mx": - at91 h32mx clock - - "atmel,sama5d2-clk-generated": - at91 generated clock - - "atmel,sama5d2-clk-audio-pll-frac": - at91 audio fractional pll - - "atmel,sama5d2-clk-audio-pll-pad": - at91 audio pll CLK_AUDIO output pin - - "atmel,sama5d2-clk-audio-pll-pmc" - at91 audio pll output on AUDIOPLLCLK that feeds the PMC - and can be used by peripheral clock or generic clock - - "atmel,sama5d2-clk-i2s-mux" (under pmc node): - at91 I2S clock source selection - -Required properties for SCKC node: - reg : defines the IO memory reserved for the SCKC. - #size-cells : shall be 0 (reg is used to encode clk id). - #address-cells : shall be 1 (reg is used to encode clk id). @@ -109,428 +33,30 @@ For example: /* put at91 slow clocks here */ }; +Power Management Controller (PMC): -Required properties for internal slow RC oscillator: -- #clock-cells : from common clock binding; shall be set to 0. -- clock-frequency : define the internal RC oscillator frequency. - -Optional properties: -- clock-accuracy : define the internal RC oscillator accuracy. - -For example: - slow_rc_osc: slow_rc_osc { - compatible = "atmel,at91sam9x5-clk-slow-rc-osc"; - clock-frequency = <32768>; - clock-accuracy = <50000000>; - }; - -Required properties for slow oscillator: -- #clock-cells : from common clock binding; shall be set to 0. -- clocks : shall encode the main osc source clk sources (see atmel datasheet). +Required properties: +- compatible : shall be "atmel,-pmc", "syscon": + can be: at91rm9200, at91sam9260, at91sam9261, + at91sam9263, at91sam9g45, at91sam9n12, at91sam9rl, at91sam9g15, + at91sam9g25, at91sam9g35, at91sam9x25, at91sam9x35, at91sam9x5, + sama5d2, sama5d3 or sama5d4. +- #clock-cells : from common clock binding; shall be set to 2. The first entry + is the type of the clock (core, system, peripheral or generated) and the + second entry its index as provided by the datasheet +- clocks : Must contain an entry for each entry in clock-names. +- clock-names: Must include the following entries: "slow_clk", "main_xtal" Optional properties: - atmel,osc-bypass : boolean property. Set this when a clock signal is directly provided on XIN. For example: - slow_osc: slow_osc { - compatible = "atmel,at91rm9200-clk-slow-osc"; - #clock-cells = <0>; - clocks = <&slow_xtal>; - }; - -Required properties for slow clock: -- #clock-cells : from common clock binding; shall be set to 0. -- clocks : shall encode the slow clk sources (see atmel datasheet). - -For example: - clk32k: slck { - compatible = "atmel,at91sam9x5-clk-slow"; - #clock-cells = <0>; - clocks = <&slow_rc_osc &slow_osc>; - }; - -Required properties for PMC node: -- reg : defines the IO memory reserved for the PMC. -- #size-cells : shall be 0 (reg is used to encode clk id). -- #address-cells : shall be 1 (reg is used to encode clk id). -- interrupts : shall be set to PMC interrupt line. -- interrupt-controller : tell that the PMC is an interrupt controller. -- #interrupt-cells : must be set to 1. The first cell encodes the interrupt id, - and reflect the bit position in the PMC_ER/DR/SR registers. - You can use the dt macros defined in dt-bindings/clock/at91.h. - 0 (AT91_PMC_MOSCS) -> main oscillator ready - 1 (AT91_PMC_LOCKA) -> PLL A ready - 2 (AT91_PMC_LOCKB) -> PLL B ready - 3 (AT91_PMC_MCKRDY) -> master clock ready - 6 (AT91_PMC_LOCKU) -> UTMI PLL clock ready - 8 .. 15 (AT91_PMC_PCKRDY(id)) -> programmable clock ready - 16 (AT91_PMC_MOSCSELS) -> main oscillator selected - 17 (AT91_PMC_MOSCRCS) -> RC main oscillator stabilized - 18 (AT91_PMC_CFDEV) -> clock failure detected - -For example: - pmc: pmc@fffffc00 { - compatible = "atmel,sama5d3-pmc"; - interrupts = <1 4 7>; - interrupt-controller; - #interrupt-cells = <2>; - #size-cells = <0>; - #address-cells = <1>; - - /* put at91 clocks here */ - }; - -Required properties for main clock internal RC oscillator: -- interrupts : shall be set to "<0>". -- clock-frequency : define the internal RC oscillator frequency. - -Optional properties: -- clock-accuracy : define the internal RC oscillator accuracy. - -For example: - main_rc_osc: main_rc_osc { - compatible = "atmel,at91sam9x5-clk-main-rc-osc"; - interrupt-parent = <&pmc>; - interrupts = <0>; - clock-frequency = <12000000>; - clock-accuracy = <50000000>; - }; - -Required properties for main clock oscillator: -- interrupts : shall be set to "<0>". -- #clock-cells : from common clock binding; shall be set to 0. -- clocks : shall encode the main osc source clk sources (see atmel datasheet). - -Optional properties: -- atmel,osc-bypass : boolean property. Specified if a clock signal is provided - on XIN. - - clock signal is directly provided on XIN pin. - -For example: - main_osc: main_osc { - compatible = "atmel,at91rm9200-clk-main-osc"; - interrupt-parent = <&pmc>; - interrupts = <0>; - #clock-cells = <0>; - clocks = <&main_xtal>; - }; - -Required properties for main clock: -- interrupts : shall be set to "<0>". -- #clock-cells : from common clock binding; shall be set to 0. -- clocks : shall encode the main clk sources (see atmel datasheet). - -For example: - main: mainck { - compatible = "atmel,at91sam9x5-clk-main"; - interrupt-parent = <&pmc>; - interrupts = <0>; - #clock-cells = <0>; - clocks = <&main_rc_osc &main_osc>; - }; - -Required properties for master clock: -- interrupts : shall be set to "<3>". -- #clock-cells : from common clock binding; shall be set to 0. -- clocks : shall be the master clock sources (see atmel datasheet) phandles. - e.g. "<&ck32k>, <&main>, <&plla>, <&pllb>". -- atmel,clk-output-range : minimum and maximum clock frequency (two u32 - fields). - e.g. output = <0 133000000>; <=> 0 to 133MHz. -- atmel,clk-divisors : master clock divisors table (four u32 fields). - 0 <=> reserved value. - e.g. divisors = <1 2 4 6>; -- atmel,master-clk-have-div3-pres : some SoC use the reserved value 7 in the - PRES field as CLOCK_DIV3 (e.g sam9x5). - -For example: - mck: mck { - compatible = "atmel,at91rm9200-clk-master"; - interrupt-parent = <&pmc>; - interrupts = <3>; - #clock-cells = <0>; - atmel,clk-output-range = <0 133000000>; - atmel,clk-divisors = <1 2 4 0>; - }; - -Required properties for peripheral clocks: -- #size-cells : shall be 0 (reg is used to encode clk id). -- #address-cells : shall be 1 (reg is used to encode clk id). -- clocks : shall be the master clock phandle. - e.g. clocks = <&mck>; -- name: device tree node describing a specific peripheral clock. - * #clock-cells : from common clock binding; shall be set to 0. - * reg: peripheral id. See Atmel's datasheets to get a full - list of peripheral ids. - * atmel,clk-output-range : minimum and maximum clock frequency - (two u32 fields). Only valid on at91sam9x5-clk-peripheral - compatible IPs. - -For example: - periph: periphck { - compatible = "atmel,at91sam9x5-clk-peripheral"; - #size-cells = <0>; - #address-cells = <1>; - clocks = <&mck>; - - ssc0_clk { - #clock-cells = <0>; - reg = <2>; - atmel,clk-output-range = <0 133000000>; - }; - - usart0_clk { - #clock-cells = <0>; - reg = <3>; - atmel,clk-output-range = <0 66000000>; - }; - }; - - -Required properties for pll clocks: -- interrupts : shall be set to "<1>". -- #clock-cells : from common clock binding; shall be set to 0. -- clocks : shall be the main clock phandle. -- reg : pll id. - 0 -> PLL A - 1 -> PLL B -- atmel,clk-input-range : minimum and maximum source clock frequency (two u32 - fields). - e.g. input = <1 32000000>; <=> 1 to 32MHz. -- #atmel,pll-clk-output-range-cells : number of cells reserved for pll output - range description. Sould be set to 2, 3 - or 4. - * 1st and 2nd cells represent the frequency range (min-max). - * 3rd cell is optional and represents the OUT field value for the given - range. - * 4th cell is optional and represents the ICPLL field (PLLICPR - register) -- atmel,pll-clk-output-ranges : pll output frequency ranges + optional parameter - depending on #atmel,pll-output-range-cells - property value. - -For example: - plla: pllack { - compatible = "atmel,at91sam9g45-clk-pll"; - interrupt-parent = <&pmc>; - interrupts = <1>; - #clock-cells = <0>; - clocks = <&main>; - reg = <0>; - atmel,clk-input-range = <2000000 32000000>; - #atmel,pll-clk-output-range-cells = <4>; - atmel,pll-clk-output-ranges = <74500000 800000000 0 0 - 69500000 750000000 1 0 - 64500000 700000000 2 0 - 59500000 650000000 3 0 - 54500000 600000000 0 1 - 49500000 550000000 1 1 - 44500000 500000000 2 1 - 40000000 450000000 3 1>; - }; - -Required properties for plldiv clocks (plldiv = pll / 2): -- #clock-cells : from common clock binding; shall be set to 0. -- clocks : shall be the plla clock phandle. - -The pll divisor is equal to 2 and cannot be changed. - -For example: - plladiv: plladivck { - compatible = "atmel,at91sam9x5-clk-plldiv"; - #clock-cells = <0>; - clocks = <&plla>; - }; - -Required properties for programmable clocks: -- #size-cells : shall be 0 (reg is used to encode clk id). -- #address-cells : shall be 1 (reg is used to encode clk id). -- clocks : shall be the programmable clock source phandles. - e.g. clocks = <&clk32k>, <&main>, <&plla>, <&pllb>; -- name: device tree node describing a specific prog clock. - * #clock-cells : from common clock binding; shall be set to 0. - * reg : programmable clock id (register offset from PCKx - register). - * interrupts : shall be set to "<(8 + id)>". - -For example: - prog: progck { - compatible = "atmel,at91sam9g45-clk-programmable"; - #size-cells = <0>; - #address-cells = <1>; - interrupt-parent = <&pmc>; - clocks = <&clk32k>, <&main>, <&plladiv>, <&utmi>, <&mck>; - - prog0 { - #clock-cells = <0>; - reg = <0>; - interrupts = <8>; - }; - - prog1 { - #clock-cells = <0>; - reg = <1>; - interrupts = <9>; - }; - }; - - -Required properties for smd clock: -- #clock-cells : from common clock binding; shall be set to 0. -- clocks : shall be the smd clock source phandles. - e.g. clocks = <&plladiv>, <&utmi>; - -For example: - smd: smdck { - compatible = "atmel,at91sam9x5-clk-smd"; - #clock-cells = <0>; - clocks = <&plladiv>, <&utmi>; - }; - -Required properties for system clocks: -- #size-cells : shall be 0 (reg is used to encode clk id). -- #address-cells : shall be 1 (reg is used to encode clk id). -- name: device tree node describing a specific system clock. - * #clock-cells : from common clock binding; shall be set to 0. - * reg: system clock id (bit position in SCER/SCDR/SCSR registers). - See Atmel's datasheet to get a full list of system clock ids. - -For example: - system: systemck { - compatible = "atmel,at91rm9200-clk-system"; - #address-cells = <1>; - #size-cells = <0>; - - ddrck { - #clock-cells = <0>; - reg = <2>; - clocks = <&mck>; - }; - - uhpck { - #clock-cells = <0>; - reg = <6>; - clocks = <&usb>; - }; - - udpck { - #clock-cells = <0>; - reg = <7>; - clocks = <&usb>; - }; - }; - - -Required properties for usb clock: -- #clock-cells : from common clock binding; shall be set to 0. -- clocks : shall be the smd clock source phandles. - e.g. clocks = <&pllb>; -- atmel,clk-divisors (only available for "atmel,at91rm9200-clk-usb"): - usb clock divisor table. - e.g. divisors = <1 2 4 0>; - -For example: - usb: usbck { - compatible = "atmel,at91sam9x5-clk-usb"; - #clock-cells = <0>; - clocks = <&plladiv>, <&utmi>; - }; - - usb: usbck { - compatible = "atmel,at91rm9200-clk-usb"; - #clock-cells = <0>; - clocks = <&pllb>; - atmel,clk-divisors = <1 2 4 0>; - }; - - -Required properties for utmi clock: -- interrupts : shall be set to "". -- #clock-cells : from common clock binding; shall be set to 0. -- clocks : shall be the main clock source phandle. - -For example: - utmi: utmick { - compatible = "atmel,at91sam9x5-clk-utmi"; - interrupt-parent = <&pmc>; - interrupts = ; - #clock-cells = <0>; - clocks = <&main>; - }; - -Required properties for 32 bits bus Matrix clock (h32mx clock): -- #clock-cells : from common clock binding; shall be set to 0. -- clocks : shall be the master clock source phandle. - -For example: - h32ck: h32mxck { - #clock-cells = <0>; - compatible = "atmel,sama5d4-clk-h32mx"; - clocks = <&mck>; - }; - -Required properties for generated clocks: -- #size-cells : shall be 0 (reg is used to encode clk id). -- #address-cells : shall be 1 (reg is used to encode clk id). -- clocks : shall be the generated clock source phandles. - e.g. clocks = <&clk32k>, <&main>, <&plladiv>, <&utmi>, <&mck>, <&audio_pll_pmc>; -- name: device tree node describing a specific generated clock. - * #clock-cells : from common clock binding; shall be set to 0. - * reg: peripheral id. See Atmel's datasheets to get a full - list of peripheral ids. - * atmel,clk-output-range : minimum and maximum clock frequency - (two u32 fields). - -For example: - gck { - compatible = "atmel,sama5d2-clk-generated"; - #address-cells = <1>; - #size-cells = <0>; - clocks = <&clk32k>, <&main>, <&plladiv>, <&utmi>, <&mck>, <&audio_pll_pmc>; - - tcb0_gclk: tcb0_gclk { - #clock-cells = <0>; - reg = <35>; - atmel,clk-output-range = <0 83000000>; - }; - - pwm_gclk: pwm_gclk { - #clock-cells = <0>; - reg = <38>; - atmel,clk-output-range = <0 83000000>; - }; - }; - -Required properties for I2S mux clocks: -- #size-cells : shall be 0 (reg is used to encode I2S bus id). -- #address-cells : shall be 1 (reg is used to encode I2S bus id). -- name: device tree node describing a specific mux clock. - * #clock-cells : from common clock binding; shall be set to 0. - * clocks : shall be the mux clock parent phandles; shall be 2 phandles: - peripheral and generated clock; the first phandle shall belong to the - peripheral clock and the second one shall belong to the generated - clock; "clock-indices" property can be user to specify - the correct order. - * reg: I2S bus id of the corresponding mux clock. - e.g. reg = <0>; for i2s0, reg = <1>; for i2s1 - -For example: - i2s_clkmux { - compatible = "atmel,sama5d2-clk-i2s-mux"; - #address-cells = <1>; - #size-cells = <0>; - - i2s0muxck: i2s0_muxclk { - clocks = <&i2s0_clk>, <&i2s0_gclk>; - #clock-cells = <0>; - reg = <0>; - }; - - i2s1muxck: i2s1_muxclk { - clocks = <&i2s1_clk>, <&i2s1_gclk>; - #clock-cells = <0>; - reg = <1>; - }; + pmc: pmc@f0018000 { + compatible = "atmel,sama5d4-pmc", "syscon"; + reg = <0xf0018000 0x120>; + interrupts = <1 IRQ_TYPE_LEVEL_HIGH 7>; + #clock-cells = <2>; + clocks = <&clk32k>, <&main_xtal>; + clock-names = "slow_clk", "main_xtal"; }; diff --git a/Documentation/devicetree/bindings/clock/hi3670-clock.txt b/Documentation/devicetree/bindings/clock/hi3670-clock.txt new file mode 100644 index 000000000000..66f3697eca78 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/hi3670-clock.txt @@ -0,0 +1,43 @@ +* Hisilicon Hi3670 Clock Controller + +The Hi3670 clock controller generates and supplies clock to various +controllers within the Hi3670 SoC. + +Required Properties: + +- compatible: the compatible should be one of the following strings to + indicate the clock controller functionality. + + - "hisilicon,hi3670-crgctrl" + - "hisilicon,hi3670-pctrl" + - "hisilicon,hi3670-pmuctrl" + - "hisilicon,hi3670-sctrl" + - "hisilicon,hi3670-iomcu" + - "hisilicon,hi3670-media1-crg" + - "hisilicon,hi3670-media2-crg" + +- reg: physical base address of the controller and length of memory mapped + region. + +- #clock-cells: should be 1. + +Each clock is assigned an identifier and client nodes use this identifier +to specify the clock which they consume. + +All these identifier could be found in . + +Examples: + crg_ctrl: clock-controller@fff35000 { + compatible = "hisilicon,hi3670-crgctrl", "syscon"; + reg = <0x0 0xfff35000 0x0 0x1000>; + #clock-cells = <1>; + }; + + uart0: serial@fdf02000 { + compatible = "arm,pl011", "arm,primecell"; + reg = <0x0 0xfdf02000 0x0 0x1000>; + interrupts = ; + clocks = <&crg_ctrl HI3670_CLK_GATE_UART0>, + <&crg_ctrl HI3670_PCLK>; + clock-names = "uartclk", "apb_pclk"; + }; diff --git a/Documentation/devicetree/bindings/clock/imx6q-clock.txt b/Documentation/devicetree/bindings/clock/imx6q-clock.txt index a45ca67a9d5f..e1308346e00d 100644 --- a/Documentation/devicetree/bindings/clock/imx6q-clock.txt +++ b/Documentation/devicetree/bindings/clock/imx6q-clock.txt @@ -6,6 +6,14 @@ Required properties: - interrupts: Should contain CCM interrupt - #clock-cells: Should be <1> +Optional properties: +- fsl,pmic-stby-poweroff: Configure CCM to assert PMIC_STBY_REQ signal + on power off. + Use this property if the SoC should be powered off by external power + management IC (PMIC) triggered via PMIC_STBY_REQ signal. + Boards that are designed to initiate poweroff on PMIC_ON_REQ signal should + be using "syscon-poweroff" driver instead. + The clock consumer should specify the desired clock by having the clock ID in its "clocks" phandle cell. See include/dt-bindings/clock/imx6qdl-clock.h for the full list of i.MX6 Quad and DualLite clock IDs. diff --git a/Documentation/devicetree/bindings/clock/ingenic,cgu.txt b/Documentation/devicetree/bindings/clock/ingenic,cgu.txt index f8d4134ae409..ba5a442026b7 100644 --- a/Documentation/devicetree/bindings/clock/ingenic,cgu.txt +++ b/Documentation/devicetree/bindings/clock/ingenic,cgu.txt @@ -6,8 +6,11 @@ to provide many different clock signals derived from only 2 external source clocks. Required properties: -- compatible : Should be "ingenic,-cgu". - For example "ingenic,jz4740-cgu" or "ingenic,jz4780-cgu". +- compatible : Should be one of: + * ingenic,jz4740-cgu + * ingenic,jz4725b-cgu + * ingenic,jz4770-cgu + * ingenic,jz4780-cgu - reg : The address & length of the CGU registers. - clocks : List of phandle & clock specifiers for clocks external to the CGU. Two such external clocks should be specified - first the external crystal diff --git a/Documentation/devicetree/bindings/clock/qcom,camcc.txt b/Documentation/devicetree/bindings/clock/qcom,camcc.txt new file mode 100644 index 000000000000..c5eb6694fda9 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/qcom,camcc.txt @@ -0,0 +1,18 @@ +Qualcomm Camera Clock & Reset Controller Binding +------------------------------------------------ + +Required properties : +- compatible : shall contain "qcom,sdm845-camcc". +- reg : shall contain base register location and length. +- #clock-cells : from common clock binding, shall contain 1. +- #reset-cells : from common reset binding, shall contain 1. +- #power-domain-cells : from generic power domain binding, shall contain 1. + +Example: + camcc: clock-controller@ad00000 { + compatible = "qcom,sdm845-camcc"; + reg = <0xad00000 0x10000>; + #clock-cells = <1>; + #reset-cells = <1>; + #power-domain-cells = <1>; + }; diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc.txt b/Documentation/devicetree/bindings/clock/qcom,gcc.txt index 664ea1fd6c76..52d9345c9927 100644 --- a/Documentation/devicetree/bindings/clock/qcom,gcc.txt +++ b/Documentation/devicetree/bindings/clock/qcom,gcc.txt @@ -19,6 +19,9 @@ Required properties : "qcom,gcc-msm8996" "qcom,gcc-msm8998" "qcom,gcc-mdm9615" + "qcom,gcc-qcs404" + "qcom,gcc-sdm630" + "qcom,gcc-sdm660" "qcom,gcc-sdm845" - reg : shall contain base register location and length diff --git a/Documentation/devicetree/bindings/clock/qcom,hfpll.txt b/Documentation/devicetree/bindings/clock/qcom,hfpll.txt new file mode 100644 index 000000000000..ec02a024424c --- /dev/null +++ b/Documentation/devicetree/bindings/clock/qcom,hfpll.txt @@ -0,0 +1,60 @@ +High-Frequency PLL (HFPLL) + +PROPERTIES + +- compatible: + Usage: required + Value type: : + shall contain only one of the following. The generic + compatible "qcom,hfpll" should be also included. + + "qcom,hfpll-ipq8064", "qcom,hfpll" + "qcom,hfpll-apq8064", "qcom,hfpll" + "qcom,hfpll-msm8974", "qcom,hfpll" + "qcom,hfpll-msm8960", "qcom,hfpll" + +- reg: + Usage: required + Value type: + Definition: address and size of HPLL registers. An optional second + element specifies the address and size of the alias + register region. + +- clocks: + Usage: required + Value type: + Definition: reference to the xo clock. + +- clock-names: + Usage: required + Value type: + Definition: must be "xo". + +- clock-output-names: + Usage: required + Value type: + Definition: Name of the PLL. Typically hfpllX where X is a CPU number + starting at 0. Otherwise hfpll_Y where Y is more specific + such as "l2". + +Example: + +1) An HFPLL for the L2 cache. + + clock-controller@f9016000 { + compatible = "qcom,hfpll-ipq8064", "qcom,hfpll"; + reg = <0xf9016000 0x30>; + clocks = <&xo_board>; + clock-names = "xo"; + clock-output-names = "hfpll_l2"; + }; + +2) An HFPLL for CPU0. This HFPLL has the alias register region. + + clock-controller@f908a000 { + compatible = "qcom,hfpll-ipq8064", "qcom,hfpll"; + reg = <0xf908a000 0x30>, <0xf900a000 0x30>; + clocks = <&xo_board>; + clock-names = "xo"; + clock-output-names = "hfpll0"; + }; diff --git a/Documentation/devicetree/bindings/clock/qcom,krait-cc.txt b/Documentation/devicetree/bindings/clock/qcom,krait-cc.txt new file mode 100644 index 000000000000..030ba60dab08 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/qcom,krait-cc.txt @@ -0,0 +1,34 @@ +Krait Clock Controller + +PROPERTIES + +- compatible: + Usage: required + Value type: + Definition: must be one of: + "qcom,krait-cc-v1" + "qcom,krait-cc-v2" + +- #clock-cells: + Usage: required + Value type: + Definition: must be 1 + +- clocks: + Usage: required + Value type: + Definition: reference to the clock parents of hfpll, secondary muxes. + +- clock-names: + Usage: required + Value type: + Definition: must be "hfpll0", "hfpll1", "acpu0_aux", "acpu1_aux", "qsb". + +Example: + + kraitcc: clock-controller { + compatible = "qcom,krait-cc-v1"; + clocks = <&hfpll0>, <&hfpll1>, <&acpu0_aux>, <&acpu1_aux>, ; + clock-names = "hfpll0", "hfpll1", "acpu0_aux", "acpu1_aux", "qsb"; + #clock-cells = <1>; + }; diff --git a/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.txt b/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.txt index db542abadb75..916a601b76a7 100644 --- a/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.txt +++ b/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.txt @@ -13,9 +13,13 @@ They provide the following functionalities: Required Properties: - compatible: Must be one of: + - "renesas,r7s9210-cpg-mssr" for the r7s9210 SoC (RZ/A2) - "renesas,r8a7743-cpg-mssr" for the r8a7743 SoC (RZ/G1M) + - "renesas,r8a7744-cpg-mssr" for the r8a7744 SoC (RZ/G1N) - "renesas,r8a7745-cpg-mssr" for the r8a7745 SoC (RZ/G1E) - "renesas,r8a77470-cpg-mssr" for the r8a77470 SoC (RZ/G1C) + - "renesas,r8a774a1-cpg-mssr" for the r8a774a1 SoC (RZ/G2M) + - "renesas,r8a774c0-cpg-mssr" for the r8a774c0 SoC (RZ/G2E) - "renesas,r8a7790-cpg-mssr" for the r8a7790 SoC (R-Car H2) - "renesas,r8a7791-cpg-mssr" for the r8a7791 SoC (R-Car M2-W) - "renesas,r8a7792-cpg-mssr" for the r8a7792 SoC (R-Car V2H) @@ -35,12 +39,13 @@ Required Properties: - clocks: References to external parent clocks, one entry for each entry in clock-names - clock-names: List of external parent clock names. Valid names are: - - "extal" (r8a7743, r8a7745, r8a77470, r8a7790, r8a7791, r8a7792, - r8a7793, r8a7794, r8a7795, r8a7796, r8a77965, r8a77970, - r8a77980, r8a77990, r8a77995) - - "extalr" (r8a7795, r8a7796, r8a77965, r8a77970, r8a77980) - - "usb_extal" (r8a7743, r8a7745, r8a77470, r8a7790, r8a7791, r8a7793, - r8a7794) + - "extal" (r7s9210, r8a7743, r8a7744, r8a7745, r8a77470, r8a774a1, + r8a774c0, r8a7790, r8a7791, r8a7792, r8a7793, r8a7794, + r8a7795, r8a7796, r8a77965, r8a77970, r8a77980, r8a77990, + r8a77995) + - "extalr" (r8a774a1, r8a7795, r8a7796, r8a77965, r8a77970, r8a77980) + - "usb_extal" (r8a7743, r8a7744, r8a7745, r8a77470, r8a7790, r8a7791, + r8a7793, r8a7794) - #clock-cells: Must be 2 - For CPG core clocks, the two clock specifier cells must be "CPG_CORE" diff --git a/Documentation/devicetree/bindings/connector/usb-connector.txt b/Documentation/devicetree/bindings/connector/usb-connector.txt index 8855bfcfd778..d90e17e2428b 100644 --- a/Documentation/devicetree/bindings/connector/usb-connector.txt +++ b/Documentation/devicetree/bindings/connector/usb-connector.txt @@ -29,15 +29,15 @@ Required properties for usb-c-connector with power delivery support: in "Universal Serial Bus Power Delivery Specification" chapter 6.4.1.2 Source_Capabilities Message, the order of each entry(PDO) should follow the PD spec chapter 6.4.1. Required for power source and power dual role. - User can specify the source PDO array via PDO_FIXED/BATT/VAR() defined in - dt-bindings/usb/pd.h. + User can specify the source PDO array via PDO_FIXED/BATT/VAR/PPS_APDO() + defined in dt-bindings/usb/pd.h. - sink-pdos: An array of u32 with each entry providing supported power sink data object(PDO), the detailed bit definitions of PDO can be found in "Universal Serial Bus Power Delivery Specification" chapter 6.4.1.3 Sink Capabilities Message, the order of each entry(PDO) should follow the PD spec chapter 6.4.1. Required for power sink and power dual role. - User can specify the sink PDO array via PDO_FIXED/BATT/VAR() defined in - dt-bindings/usb/pd.h. + User can specify the sink PDO array via PDO_FIXED/BATT/VAR/PPS_APDO() defined + in dt-bindings/usb/pd.h. - op-sink-microwatt: Sink required operating power in microwatt, if source can't offer the power, Capability Mismatch is set. Required for power sink and power dual role. diff --git a/Documentation/devicetree/bindings/crypto/hisilicon,hip07-sec.txt b/Documentation/devicetree/bindings/crypto/hisilicon,hip07-sec.txt index 78d2db9d4de5..d28fd1af01b4 100644 --- a/Documentation/devicetree/bindings/crypto/hisilicon,hip07-sec.txt +++ b/Documentation/devicetree/bindings/crypto/hisilicon,hip07-sec.txt @@ -24,7 +24,7 @@ Optional properties: Example: -p1_sec_a: crypto@400,d2000000 { +p1_sec_a: crypto@400d2000000 { compatible = "hisilicon,hip07-sec"; reg = <0x400 0xd0000000 0x0 0x10000 0x400 0xd2000000 0x0 0x10000 diff --git a/Documentation/devicetree/bindings/csky/cpus.txt b/Documentation/devicetree/bindings/csky/cpus.txt new file mode 100644 index 000000000000..ae79412f2680 --- /dev/null +++ b/Documentation/devicetree/bindings/csky/cpus.txt @@ -0,0 +1,73 @@ +================== +C-SKY CPU Bindings +================== + +The device tree allows to describe the layout of CPUs in a system through +the "cpus" node, which in turn contains a number of subnodes (ie "cpu") +defining properties for every cpu. + +Only SMP system need to care about the cpus node and single processor +needn't define cpus node at all. + +===================================== +cpus and cpu node bindings definition +===================================== + +- cpus node + + Description: Container of cpu nodes + + The node name must be "cpus". + + A cpus node must define the following properties: + + - #address-cells + Usage: required + Value type: + Definition: must be set to 1 + - #size-cells + Usage: required + Value type: + Definition: must be set to 0 + +- cpu node + + Description: Describes one of SMP cores + + PROPERTIES + + - device_type + Usage: required + Value type: + Definition: must be "cpu" + - reg + Usage: required + Value type: + Definition: CPU index + - compatible: + Usage: required + Value type: + Definition: must contain "csky", eg: + "csky,610" + "csky,807" + "csky,810" + "csky,860" + +Example: +-------- + + cpus { + #address-cells = <1>; + #size-cells = <0>; + cpu@0 { + device_type = "cpu"; + reg = <0>; + status = "ok"; + }; + + cpu@1 { + device_type = "cpu"; + reg = <1>; + status = "ok"; + }; + }; diff --git a/Documentation/devicetree/bindings/display/atmel/hlcdc-dc.txt b/Documentation/devicetree/bindings/display/atmel/hlcdc-dc.txt index 82f2acb3d374..0398aec488ac 100644 --- a/Documentation/devicetree/bindings/display/atmel/hlcdc-dc.txt +++ b/Documentation/devicetree/bindings/display/atmel/hlcdc-dc.txt @@ -15,6 +15,13 @@ Required children nodes: to external devices using the OF graph reprensentation (see ../graph.txt). At least one port node is required. +Optional properties in grandchild nodes: + Any endpoint grandchild node may specify a desired video interface + according to ../../media/video-interfaces.txt, specifically + - bus-width: recognized values are <12>, <16>, <18> and <24>, and + override any output mode selection heuristic, forcing "rgb444", + "rgb565", "rgb666" and "rgb888" respectively. + Example: hlcdc: hlcdc@f0030000 { @@ -50,3 +57,19 @@ Example: #pwm-cells = <3>; }; }; + +Example 2: With a video interface override to force rgb565; as above +but with these changes/additions: + + &hlcdc { + hlcdc-display-controller { + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_lcd_base &pinctrl_lcd_rgb565>; + + port@0 { + hlcdc_panel_output: endpoint@0 { + bus-width = <16>; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/display/bridge/lvds-transmitter.txt b/Documentation/devicetree/bindings/display/bridge/lvds-transmitter.txt index fd39ad34c383..50220190c203 100644 --- a/Documentation/devicetree/bindings/display/bridge/lvds-transmitter.txt +++ b/Documentation/devicetree/bindings/display/bridge/lvds-transmitter.txt @@ -22,7 +22,13 @@ among others. Required properties: -- compatible: Must be "lvds-encoder" +- compatible: Must be one or more of the following + - "ti,ds90c185" for the TI DS90C185 FPD-Link Serializer + - "lvds-encoder" for a generic LVDS encoder device + + When compatible with the generic version, nodes must list the + device-specific version corresponding to the device first + followed by the generic version. Required nodes: diff --git a/Documentation/devicetree/bindings/display/bridge/renesas,lvds.txt b/Documentation/devicetree/bindings/display/bridge/renesas,lvds.txt index 4f0ab3ed3b6f..3aeb0ec06fd0 100644 --- a/Documentation/devicetree/bindings/display/bridge/renesas,lvds.txt +++ b/Documentation/devicetree/bindings/display/bridge/renesas,lvds.txt @@ -14,10 +14,22 @@ Required properties: - "renesas,r8a7795-lvds" for R8A7795 (R-Car H3) compatible LVDS encoders - "renesas,r8a7796-lvds" for R8A7796 (R-Car M3-W) compatible LVDS encoders - "renesas,r8a77970-lvds" for R8A77970 (R-Car V3M) compatible LVDS encoders + - "renesas,r8a77980-lvds" for R8A77980 (R-Car V3H) compatible LVDS encoders + - "renesas,r8a77990-lvds" for R8A77990 (R-Car E3) compatible LVDS encoders - "renesas,r8a77995-lvds" for R8A77995 (R-Car D3) compatible LVDS encoders - reg: Base address and length for the memory-mapped registers -- clocks: A phandle + clock-specifier pair for the functional clock +- clocks: A list of phandles + clock-specifier pairs, one for each entry in + the clock-names property. +- clock-names: Name of the clocks. This property is model-dependent. + - The functional clock, which mandatory for all models, shall be listed + first, and shall be named "fck". + - On R8A77990 and R8A77995, the LVDS encoder can use the EXTAL or + DU_DOTCLKINx clocks. Those clocks are optional. When supplied they must be + named "extal" and "dclkin.x" respectively, with "x" being the DU_DOTCLKIN + numerical index. + - When the clocks property only contains the functional clock, the + clock-names property may be omitted. - resets: A phandle + reset specifier for the module reset Required nodes: diff --git a/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.txt b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.txt new file mode 100644 index 000000000000..0a3fbb53a16e --- /dev/null +++ b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.txt @@ -0,0 +1,87 @@ +SN65DSI86 DSI to eDP bridge chip +-------------------------------- + +This is the binding for Texas Instruments SN65DSI86 bridge. +http://www.ti.com/general/docs/lit/getliterature.tsp?genericPartNumber=sn65dsi86&fileType=pdf + +Required properties: +- compatible: Must be "ti,sn65dsi86" +- reg: i2c address of the chip, 0x2d as per datasheet +- enable-gpios: gpio specification for bridge_en pin (active high) + +- vccio-supply: A 1.8V supply that powers up the digital IOs. +- vpll-supply: A 1.8V supply that powers up the displayport PLL. +- vcca-supply: A 1.2V supply that powers up the analog circuits. +- vcc-supply: A 1.2V supply that powers up the digital core. + +Optional properties: +- interrupts-extended: Specifier for the SN65DSI86 interrupt line. + +- gpio-controller: Marks the device has a GPIO controller. +- #gpio-cells : Should be two. The first cell is the pin number and + the second cell is used to specify flags. + See ../../gpio/gpio.txt for more information. +- #pwm-cells : Should be one. See ../../pwm/pwm.txt for description of + the cell formats. + +- clock-names: should be "refclk" +- clocks: Specification for input reference clock. The reference + clock rate must be 12 MHz, 19.2 MHz, 26 MHz, 27 MHz or 38.4 MHz. + +- data-lanes: See ../../media/video-interface.txt +- lane-polarities: See ../../media/video-interface.txt + +- suspend-gpios: specification for GPIO1 pin on bridge (active low) + +Required nodes: +This device has two video ports. Their connections are modelled using the +OF graph bindings specified in Documentation/devicetree/bindings/graph.txt. + +- Video port 0 for DSI input +- Video port 1 for eDP output + +Example +------- + +edp-bridge@2d { + compatible = "ti,sn65dsi86"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0x2d>; + + enable-gpios = <&msmgpio 33 GPIO_ACTIVE_HIGH>; + suspend-gpios = <&msmgpio 34 GPIO_ACTIVE_LOW>; + + interrupts-extended = <&gpio3 4 IRQ_TYPE_EDGE_FALLING>; + + vccio-supply = <&pm8916_l17>; + vcca-supply = <&pm8916_l6>; + vpll-supply = <&pm8916_l17>; + vcc-supply = <&pm8916_l6>; + + clock-names = "refclk"; + clocks = <&input_refclk>; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + port@0 { + reg = <0>; + + edp_bridge_in: endpoint { + remote-endpoint = <&dsi_out>; + }; + }; + + port@1 { + reg = <1>; + + edp_bridge_out: endpoint { + data-lanes = <2 1 3 0>; + lane-polarities = <0 1 0 1>; + remote-endpoint = <&edp_panel_in>; + }; + }; + }; +} diff --git a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358764.txt b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358764.txt new file mode 100644 index 000000000000..8f9abf28a8fa --- /dev/null +++ b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358764.txt @@ -0,0 +1,35 @@ +TC358764 MIPI-DSI to LVDS panel bridge + +Required properties: + - compatible: "toshiba,tc358764" + - reg: the virtual channel number of a DSI peripheral + - vddc-supply: core voltage supply, 1.2V + - vddio-supply: I/O voltage supply, 1.8V or 3.3V + - vddlvds-supply: LVDS1/2 voltage supply, 3.3V + - reset-gpios: a GPIO spec for the reset pin + +The device node can contain following 'port' child nodes, +according to the OF graph bindings defined in [1]: + 0: DSI Input, not required, if the bridge is DSI controlled + 1: LVDS Output, mandatory + +[1]: Documentation/devicetree/bindings/media/video-interfaces.txt + +Example: + + bridge@0 { + reg = <0>; + compatible = "toshiba,tc358764"; + vddc-supply = <&vcc_1v2_reg>; + vddio-supply = <&vcc_1v8_reg>; + vddlvds-supply = <&vcc_3v3_reg>; + reset-gpios = <&gpd1 6 GPIO_ACTIVE_LOW>; + #address-cells = <1>; + #size-cells = <0>; + port@1 { + reg = <1>; + lvds_ep: endpoint { + remote-endpoint = <&panel_ep>; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/display/exynos/exynos_dsim.txt b/Documentation/devicetree/bindings/display/exynos/exynos_dsim.txt index 2fff8b406f4c..be377786e8cd 100644 --- a/Documentation/devicetree/bindings/display/exynos/exynos_dsim.txt +++ b/Documentation/devicetree/bindings/display/exynos/exynos_dsim.txt @@ -21,6 +21,9 @@ Required properties: - samsung,pll-clock-frequency: specifies frequency of the oscillator clock - #address-cells, #size-cells: should be set respectively to <1> and <0> according to DSI host bindings (see MIPI DSI bindings [1]) + - samsung,burst-clock-frequency: specifies DSI frequency in high-speed burst + mode + - samsung,esc-clock-frequency: specifies DSI frequency in escape mode Optional properties: - power-domains: a phandle to DSIM power domain node @@ -29,25 +32,9 @@ Child nodes: Should contain DSI peripheral nodes (see MIPI DSI bindings [1]). Video interfaces: - Device node can contain video interface port nodes according to [2]. - The following are properties specific to those nodes: - - port node inbound: - - reg: (required) must be 0. - port node outbound: - - reg: (required) must be 1. - - endpoint node connected from mic node (reg = 0): - - remote-endpoint: specifies the endpoint in mic node. This node is required - for Exynos5433 mipi dsi. So mic can access to panel node - throughout this dsi node. - endpoint node connected to panel node (reg = 1): - - remote-endpoint: specifies the endpoint in panel node. This node is - required in all kinds of exynos mipi dsi to represent - the connection between mipi dsi and panel. - - samsung,burst-clock-frequency: specifies DSI frequency in high-speed burst - mode - - samsung,esc-clock-frequency: specifies DSI frequency in escape mode + Device node can contain following video interface port nodes according to [2]: + 0: RGB input, + 1: DSI output [1]: Documentation/devicetree/bindings/display/mipi-dsi-bus.txt [2]: Documentation/devicetree/bindings/media/video-interfaces.txt diff --git a/Documentation/devicetree/bindings/display/mipi-dsi-bus.txt b/Documentation/devicetree/bindings/display/mipi-dsi-bus.txt index 973c27273772..a336599f6c03 100644 --- a/Documentation/devicetree/bindings/display/mipi-dsi-bus.txt +++ b/Documentation/devicetree/bindings/display/mipi-dsi-bus.txt @@ -16,7 +16,7 @@ The following assumes that only a single peripheral is connected to a DSI host. Experience shows that this is true for the large majority of setups. DSI host --------- +======== In addition to the standard properties and those defined by the parent bus of a DSI host, the following properties apply to a node representing a DSI host. @@ -29,12 +29,24 @@ Required properties: - #size-cells: Should be 0. There are cases where it makes sense to use a different value here. See below. +Optional properties: +- clock-master: boolean. Should be enabled if the host is being used in + conjunction with another DSI host to drive the same peripheral. Hardware + supporting such a configuration generally requires the data on both the busses + to be driven by the same clock. Only the DSI host instance controlling this + clock should contain this property. + DSI peripheral --------------- +============== + +Peripherals with DSI as control bus, or no control bus +------------------------------------------------------ -Peripherals are represented as child nodes of the DSI host's node. Properties -described here apply to all DSI peripherals, but individual bindings may want -to define additional, device-specific properties. +Peripherals with the DSI bus as the primary control bus, or peripherals with +no control bus but use the DSI bus to transmit pixel data are represented +as child nodes of the DSI host's node. Properties described here apply to all +DSI peripherals, but individual bindings may want to define additional, +device-specific properties. Required properties: - reg: The virtual channel number of a DSI peripheral. Must be in the range @@ -49,9 +61,37 @@ case two alternative representations can be chosen: property is the number of the first virtual channel and the second cell is the number of consecutive virtual channels. -Example -------- - +Peripherals with a different control bus +---------------------------------------- + +There are peripherals that have I2C/SPI (or some other non-DSI bus) as the +primary control bus, but are also connected to a DSI bus (mostly for the data +path). Connections between such peripherals and a DSI host can be represented +using the graph bindings [1], [2]. + +Peripherals that support dual channel DSI +----------------------------------------- + +Peripherals with higher bandwidth requirements can be connected to 2 DSI +busses. Each DSI bus/channel drives some portion of the pixel data (generally +left/right half of each line of the display, or even/odd lines of the display). +The graph bindings should be used to represent the multiple DSI busses that are +connected to this peripheral. Each DSI host's output endpoint can be linked to +an input endpoint of the DSI peripheral. + +[1] Documentation/devicetree/bindings/graph.txt +[2] Documentation/devicetree/bindings/media/video-interfaces.txt + +Examples +======== +- (1), (2) and (3) are examples of a DSI host and peripheral on the DSI bus + with different virtual channel configurations. +- (4) is an example of a peripheral on a I2C control bus connected to a + DSI host using of-graph bindings. +- (5) is an example of 2 DSI hosts driving a dual-channel DSI peripheral, + which uses I2C as its primary control bus. + +1) dsi-host { ... @@ -67,6 +107,7 @@ Example ... }; +2) dsi-host { ... @@ -82,6 +123,7 @@ Example ... }; +3) dsi-host { ... @@ -96,3 +138,98 @@ Example ... }; + +4) + i2c-host { + ... + + dsi-bridge@35 { + compatible = "..."; + reg = <0x35>; + + ports { + ... + + port { + bridge_mipi_in: endpoint { + remote-endpoint = <&host_mipi_out>; + }; + }; + }; + }; + }; + + dsi-host { + ... + + ports { + ... + + port { + host_mipi_out: endpoint { + remote-endpoint = <&bridge_mipi_in>; + }; + }; + }; + }; + +5) + i2c-host { + dsi-bridge@35 { + compatible = "..."; + reg = <0x35>; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + port@0 { + reg = <0>; + dsi0_in: endpoint { + remote-endpoint = <&dsi0_out>; + }; + }; + + port@1 { + reg = <1>; + dsi1_in: endpoint { + remote-endpoint = <&dsi1_out>; + }; + }; + }; + }; + }; + + dsi0-host { + ... + + /* + * this DSI instance drives the clock for both the host + * controllers + */ + clock-master; + + ports { + ... + + port { + dsi0_out: endpoint { + remote-endpoint = <&dsi0_in>; + }; + }; + }; + }; + + dsi1-host { + ... + + ports { + ... + + port { + dsi1_out: endpoint { + remote-endpoint = <&dsi1_in>; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/display/panel/innolux,tv123wam.txt b/Documentation/devicetree/bindings/display/panel/innolux,p120zdg-bf1.txt similarity index 70% rename from Documentation/devicetree/bindings/display/panel/innolux,tv123wam.txt rename to Documentation/devicetree/bindings/display/panel/innolux,p120zdg-bf1.txt index a9b35265fa13..513f03466aba 100644 --- a/Documentation/devicetree/bindings/display/panel/innolux,tv123wam.txt +++ b/Documentation/devicetree/bindings/display/panel/innolux,p120zdg-bf1.txt @@ -1,20 +1,22 @@ -Innolux TV123WAM 12.3 inch eDP 2K display panel +Innolux P120ZDG-BF1 12.02 inch eDP 2K display panel This binding is compatible with the simple-panel binding, which is specified in simple-panel.txt in this directory. Required properties: -- compatible: should be "innolux,tv123wam" +- compatible: should be "innolux,p120zdg-bf1" - power-supply: regulator to provide the supply voltage Optional properties: - enable-gpios: GPIO pin to enable or disable the panel - backlight: phandle of the backlight device attached to the panel +- no-hpd: If HPD isn't hooked up; add this property. Example: panel_edp: panel-edp { - compatible = "innolux,tv123wam"; + compatible = "innolux,p120zdg-bf1"; enable-gpios = <&msmgpio 31 GPIO_ACTIVE_LOW>; power-supply = <&pm8916_l2>; backlight = <&backlight>; + no-hpd; }; diff --git a/Documentation/devicetree/bindings/display/panel/simple-panel.txt b/Documentation/devicetree/bindings/display/panel/simple-panel.txt index 45a457ad38f0..b2b872c710f2 100644 --- a/Documentation/devicetree/bindings/display/panel/simple-panel.txt +++ b/Documentation/devicetree/bindings/display/panel/simple-panel.txt @@ -11,6 +11,9 @@ Optional properties: - ddc-i2c-bus: phandle of an I2C controller used for DDC EDID probing - enable-gpios: GPIO pin to enable or disable the panel - backlight: phandle of the backlight device attached to the panel +- no-hpd: This panel is supposed to communicate that it's ready via HPD + (hot plug detect) signal, but the signal isn't hooked up so we should + hardcode the max delay from the panel spec when powering up the panel. Example: diff --git a/Documentation/devicetree/bindings/display/renesas,du.txt b/Documentation/devicetree/bindings/display/renesas,du.txt index ec9d34be2ff7..9de67be632d1 100644 --- a/Documentation/devicetree/bindings/display/renesas,du.txt +++ b/Documentation/devicetree/bindings/display/renesas,du.txt @@ -15,6 +15,8 @@ Required Properties: - "renesas,du-r8a7796" for R8A7796 (R-Car M3-W) compatible DU - "renesas,du-r8a77965" for R8A77965 (R-Car M3-N) compatible DU - "renesas,du-r8a77970" for R8A77970 (R-Car V3M) compatible DU + - "renesas,du-r8a77980" for R8A77980 (R-Car V3H) compatible DU + - "renesas,du-r8a77990" for R8A77990 (R-Car E3) compatible DU - "renesas,du-r8a77995" for R8A77995 (R-Car D3) compatible DU - reg: the memory-mapped I/O registers base address and length @@ -61,6 +63,8 @@ corresponding to each DU output. R8A7796 (R-Car M3-W) DPAD 0 HDMI 0 LVDS 0 - R8A77965 (R-Car M3-N) DPAD 0 HDMI 0 LVDS 0 - R8A77970 (R-Car V3M) DPAD 0 LVDS 0 - - + R8A77980 (R-Car V3H) DPAD 0 LVDS 0 - - + R8A77990 (R-Car E3) DPAD 0 LVDS 0 LVDS 1 - R8A77995 (R-Car D3) DPAD 0 LVDS 0 LVDS 1 - diff --git a/Documentation/devicetree/bindings/display/rockchip/rockchip-vop.txt b/Documentation/devicetree/bindings/display/rockchip/rockchip-vop.txt index eeda3597011e..b79e5769f0ae 100644 --- a/Documentation/devicetree/bindings/display/rockchip/rockchip-vop.txt +++ b/Documentation/devicetree/bindings/display/rockchip/rockchip-vop.txt @@ -8,6 +8,9 @@ Required properties: - compatible: value should be one of the following "rockchip,rk3036-vop"; "rockchip,rk3126-vop"; + "rockchip,px30-vop-lit"; + "rockchip,px30-vop-big"; + "rockchip,rk3188-vop"; "rockchip,rk3288-vop"; "rockchip,rk3368-vop"; "rockchip,rk3366-vop"; diff --git a/Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt b/Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt index f8773ecb7525..7854fff4fc16 100644 --- a/Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt +++ b/Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt @@ -78,6 +78,7 @@ Required properties: - compatible: value must be one of: * "allwinner,sun8i-a83t-dw-hdmi" + * "allwinner,sun50i-a64-dw-hdmi", "allwinner,sun8i-a83t-dw-hdmi" - reg: base address and size of memory-mapped region - reg-io-width: See dw_hdmi.txt. Shall be 1. - interrupts: HDMI interrupt number @@ -96,6 +97,9 @@ Required properties: first port should be the input endpoint. The second should be the output, usually to an HDMI connector. +Optional properties: + - hvcc-supply: the VCC power supply of the controller + DWC HDMI PHY ------------ @@ -103,6 +107,7 @@ Required properties: - compatible: value must be one of: * allwinner,sun8i-a83t-hdmi-phy * allwinner,sun8i-h3-hdmi-phy + * allwinner,sun8i-r40-hdmi-phy * allwinner,sun50i-a64-hdmi-phy - reg: base address and size of memory-mapped region - clocks: phandles to the clocks feeding the HDMI PHY @@ -112,9 +117,9 @@ Required properties: - resets: phandle to the reset controller driving the PHY - reset-names: must be "phy" -H3 and A64 HDMI PHY require additional clocks: +H3, A64 and R40 HDMI PHY require additional clocks: - pll-0: parent of phy clock - - pll-1: second possible phy clock parent (A64 only) + - pll-1: second possible phy clock parent (A64/R40 only) TV Encoder ---------- @@ -151,6 +156,8 @@ Required properties: * allwinner,sun8i-v3s-tcon * allwinner,sun9i-a80-tcon-lcd * allwinner,sun9i-a80-tcon-tv + * "allwinner,sun50i-a64-tcon-lcd", "allwinner,sun8i-a83t-tcon-lcd" + * "allwinner,sun50i-a64-tcon-tv", "allwinner,sun8i-a83t-tcon-tv" - reg: base address and size of memory-mapped region - interrupts: interrupt associated to this IP - clocks: phandles to the clocks feeding the TCON. @@ -369,7 +376,11 @@ Required properties: * allwinner,sun8i-a83t-de2-mixer-0 * allwinner,sun8i-a83t-de2-mixer-1 * allwinner,sun8i-h3-de2-mixer-0 + * allwinner,sun8i-r40-de2-mixer-0 + * allwinner,sun8i-r40-de2-mixer-1 * allwinner,sun8i-v3s-de2-mixer + * allwinner,sun50i-a64-de2-mixer-0 + * allwinner,sun50i-a64-de2-mixer-1 - reg: base address and size of the memory-mapped region. - clocks: phandles to the clocks feeding the mixer * bus: the mixer interface clock @@ -403,6 +414,7 @@ Required properties: * allwinner,sun8i-r40-display-engine * allwinner,sun8i-v3s-display-engine * allwinner,sun9i-a80-display-engine + * allwinner,sun50i-a64-display-engine - allwinner,pipelines: list of phandle to the display engine frontends (DE 1.0) or mixers (DE 2.0) available. diff --git a/Documentation/devicetree/bindings/dma/jz4780-dma.txt b/Documentation/devicetree/bindings/dma/jz4780-dma.txt index 03e9cf7b42e0..636fcb26b164 100644 --- a/Documentation/devicetree/bindings/dma/jz4780-dma.txt +++ b/Documentation/devicetree/bindings/dma/jz4780-dma.txt @@ -2,8 +2,13 @@ Required properties: -- compatible: Should be "ingenic,jz4780-dma" -- reg: Should contain the DMA controller registers location and length. +- compatible: Should be one of: + * ingenic,jz4740-dma + * ingenic,jz4725b-dma + * ingenic,jz4770-dma + * ingenic,jz4780-dma +- reg: Should contain the DMA channel registers location and length, followed + by the DMA controller registers location and length. - interrupts: Should contain the interrupt specifier of the DMA controller. - clocks: Should contain a clock specifier for the JZ4780 PDMA clock. - #dma-cells: Must be <2>. Number of integer cells in the dmas property of @@ -19,9 +24,10 @@ Optional properties: Example: -dma: dma@13420000 { +dma: dma-controller@13420000 { compatible = "ingenic,jz4780-dma"; - reg = <0x13420000 0x10000>; + reg = <0x13420000 0x400 + 0x13421000 0x40>; interrupt-parent = <&intc>; interrupts = <10>; diff --git a/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt b/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt index 946229c48657..a5a7c3f5a1e3 100644 --- a/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt +++ b/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt @@ -17,6 +17,7 @@ Required Properties: - compatible: "renesas,dmac-", "renesas,rcar-dmac" as fallback. Examples with soctypes are: - "renesas,dmac-r8a7743" (RZ/G1M) + - "renesas,dmac-r8a7744" (RZ/G1N) - "renesas,dmac-r8a7745" (RZ/G1E) - "renesas,dmac-r8a77470" (RZ/G1C) - "renesas,dmac-r8a7790" (R-Car H2) diff --git a/Documentation/devicetree/bindings/dma/renesas,usb-dmac.txt b/Documentation/devicetree/bindings/dma/renesas,usb-dmac.txt index 482e54362d3e..1743017bd948 100644 --- a/Documentation/devicetree/bindings/dma/renesas,usb-dmac.txt +++ b/Documentation/devicetree/bindings/dma/renesas,usb-dmac.txt @@ -4,6 +4,7 @@ Required Properties: -compatible: "renesas,-usb-dmac", "renesas,usb-dmac" as fallback. Examples with soctypes are: - "renesas,r8a7743-usb-dmac" (RZ/G1M) + - "renesas,r8a7744-usb-dmac" (RZ/G1N) - "renesas,r8a7745-usb-dmac" (RZ/G1E) - "renesas,r8a7790-usb-dmac" (R-Car H2) - "renesas,r8a7791-usb-dmac" (R-Car M2-W) diff --git a/Documentation/devicetree/bindings/firmware/qcom,scm.txt b/Documentation/devicetree/bindings/firmware/qcom,scm.txt index fcf6979c0b6d..41f133a4e2fa 100644 --- a/Documentation/devicetree/bindings/firmware/qcom,scm.txt +++ b/Documentation/devicetree/bindings/firmware/qcom,scm.txt @@ -7,16 +7,23 @@ assorted actions. Required properties: - compatible: must contain one of the following: - * "qcom,scm-apq8064" for APQ8064 platforms - * "qcom,scm-msm8660" for MSM8660 platforms - * "qcom,scm-msm8690" for MSM8690 platforms - * "qcom,scm-msm8996" for MSM8996 platforms - * "qcom,scm-ipq4019" for IPQ4019 platforms - * "qcom,scm" for later processors (MSM8916, APQ8084, MSM8974, etc) -- clocks: One to three clocks may be required based on compatible. - * No clock required for "qcom,scm-msm8996", "qcom,scm-ipq4019" - * Only core clock required for "qcom,scm-apq8064", "qcom,scm-msm8660", and "qcom,scm-msm8960" - * Core, iface, and bus clocks required for "qcom,scm" + * "qcom,scm-apq8064" + * "qcom,scm-apq8084" + * "qcom,scm-msm8660" + * "qcom,scm-msm8916" + * "qcom,scm-msm8960" + * "qcom,scm-msm8974" + * "qcom,scm-msm8996" + * "qcom,scm-msm8998" + * "qcom,scm-ipq4019" + * "qcom,scm-sdm845" + and: + * "qcom,scm" +- clocks: Specifies clocks needed by the SCM interface, if any: + * core clock required for "qcom,scm-apq8064", "qcom,scm-msm8660" and + "qcom,scm-msm8960" + * core, iface and bus clocks required for "qcom,scm-apq8084", + "qcom,scm-msm8916" and "qcom,scm-msm8974" - clock-names: Must contain "core" for the core clock, "iface" for the interface clock and "bus" for the bus clock per the requirements of the compatible. - qcom,dload-mode: phandle to the TCSR hardware block and offset of the @@ -26,8 +33,10 @@ Example for MSM8916: firmware { scm { - compatible = "qcom,scm"; - clocks = <&gcc GCC_CRYPTO_CLK> , <&gcc GCC_CRYPTO_AXI_CLK>, <&gcc GCC_CRYPTO_AHB_CLK>; + compatible = "qcom,msm8916", "qcom,scm"; + clocks = <&gcc GCC_CRYPTO_CLK> , + <&gcc GCC_CRYPTO_AXI_CLK>, + <&gcc GCC_CRYPTO_AHB_CLK>; clock-names = "core", "bus", "iface"; }; }; diff --git a/Documentation/devicetree/bindings/firmware/xilinx/xlnx,zynqmp-firmware.txt b/Documentation/devicetree/bindings/firmware/xilinx/xlnx,zynqmp-firmware.txt new file mode 100644 index 000000000000..614bac55df86 --- /dev/null +++ b/Documentation/devicetree/bindings/firmware/xilinx/xlnx,zynqmp-firmware.txt @@ -0,0 +1,82 @@ +----------------------------------------------------------------- +Device Tree Bindings for the Xilinx Zynq MPSoC Firmware Interface +----------------------------------------------------------------- + +The zynqmp-firmware node describes the interface to platform firmware. +ZynqMP has an interface to communicate with secure firmware. Firmware +driver provides an interface to firmware APIs. Interface APIs can be +used by any driver to communicate to PMUFW(Platform Management Unit). +These requests include clock management, pin control, device control, +power management service, FPGA service and other platform management +services. + +Required properties: + - compatible: Must contain: "xlnx,zynqmp-firmware" + - method: The method of calling the PM-API firmware layer. + Permitted values are: + - "smc" : SMC #0, following the SMCCC + - "hvc" : HVC #0, following the SMCCC + +-------------------------------------------------------------------------- +Device Tree Clock bindings for the Zynq Ultrascale+ MPSoC controlled using +Zynq MPSoC firmware interface +-------------------------------------------------------------------------- +The clock controller is a h/w block of Zynq Ultrascale+ MPSoC clock +tree. It reads required input clock frequencies from the devicetree and acts +as clock provider for all clock consumers of PS clocks. + +See clock_bindings.txt for more information on the generic clock bindings. + +Required properties: + - #clock-cells: Must be 1 + - compatible: Must contain: "xlnx,zynqmp-clk" + - clocks: List of clock specifiers which are external input + clocks to the given clock controller. Please refer + the next section to find the input clocks for a + given controller. + - clock-names: List of clock names which are exteral input clocks + to the given clock controller. Please refer to the + clock bindings for more details. + +Input clocks for zynqmp Ultrascale+ clock controller: + +The Zynq UltraScale+ MPSoC has one primary and four alternative reference clock +inputs. These required clock inputs are: + - pss_ref_clk (PS reference clock) + - video_clk (reference clock for video system ) + - pss_alt_ref_clk (alternative PS reference clock) + - aux_ref_clk + - gt_crx_ref_clk (transceiver reference clock) + +The following strings are optional parameters to the 'clock-names' property in +order to provide an optional (E)MIO clock source: + - swdt0_ext_clk + - swdt1_ext_clk + - gem0_emio_clk + - gem1_emio_clk + - gem2_emio_clk + - gem3_emio_clk + - mio_clk_XX # with XX = 00..77 + - mio_clk_50_or_51 #for the mux clock to gem tsu from 50 or 51 + + +Output clocks are registered based on clock information received +from firmware. Output clocks indexes are mentioned in +include/dt-bindings/clock/xlnx,zynqmp-clk.h. + +------- +Example +------- + +firmware { + zynqmp_firmware: zynqmp-firmware { + compatible = "xlnx,zynqmp-firmware"; + method = "smc"; + zynqmp_clk: clock-controller { + #clock-cells = <1>; + compatible = "xlnx,zynqmp-clk"; + clocks = <&pss_ref_clk>, <&video_clk>, <&pss_alt_ref_clk>, <&aux_ref_clk>, <>_crx_ref_clk>; + clock-names = "pss_ref_clk", "video_clk", "pss_alt_ref_clk","aux_ref_clk", "gt_crx_ref_clk"; + }; + }; +}; diff --git a/Documentation/devicetree/bindings/fpga/fpga-region.txt b/Documentation/devicetree/bindings/fpga/fpga-region.txt index 6db8aeda461a..90c44694a30b 100644 --- a/Documentation/devicetree/bindings/fpga/fpga-region.txt +++ b/Documentation/devicetree/bindings/fpga/fpga-region.txt @@ -415,7 +415,7 @@ DT Overlay contains: firmware-name = "base.rbf"; fpga-bridge@4400 { - compatible = "altr,freeze-bridge"; + compatible = "altr,freeze-bridge-controller"; reg = <0x4400 0x10>; fpga_region1: fpga-region1 { @@ -427,7 +427,7 @@ DT Overlay contains: }; fpga-bridge@4420 { - compatible = "altr,freeze-bridge"; + compatible = "altr,freeze-bridge-controller"; reg = <0x4420 0x10>; fpga_region2: fpga-region2 { diff --git a/Documentation/devicetree/bindings/gpio/gpio.txt b/Documentation/devicetree/bindings/gpio/gpio.txt index a7c31de29362..f0ba154b5723 100644 --- a/Documentation/devicetree/bindings/gpio/gpio.txt +++ b/Documentation/devicetree/bindings/gpio/gpio.txt @@ -1,18 +1,9 @@ Specifying GPIO information for devices -============================================ +======================================= 1) gpios property ----------------- -Nodes that makes use of GPIOs should specify them using one or more -properties, each containing a 'gpio-list': - - gpio-list ::= [gpio-list] - single-gpio ::= - gpio-phandle : phandle to gpio controller node - gpio-specifier : Array of #gpio-cells specifying specific gpio - (controller specific) - GPIO properties should be named "[-]gpios", with being the purpose of this GPIO for the device. While a non-existent is considered valid for compatibility reasons (resolving to the "gpios" property), it is not allowed @@ -33,33 +24,27 @@ The following example could be used to describe GPIO pins used as device enable and bit-banged data signals: gpio1: gpio1 { - gpio-controller - #gpio-cells = <2>; - }; - gpio2: gpio2 { - gpio-controller - #gpio-cells = <1>; + gpio-controller; + #gpio-cells = <2>; }; [...] - enable-gpios = <&gpio2 2>; data-gpios = <&gpio1 12 0>, <&gpio1 13 0>, <&gpio1 14 0>, <&gpio1 15 0>; -Note that gpio-specifier length is controller dependent. In the -above example, &gpio1 uses 2 cells to specify a gpio, while &gpio2 -only uses one. +In the above example, &gpio1 uses 2 cells to specify a gpio. The first cell is +a local offset to the GPIO line and the second cell represent consumer flags, +such as if the consumer desire the line to be active low (inverted) or open +drain. This is the recommended practice. -gpio-specifier may encode: bank, pin position inside the bank, -whether pin is open-drain and whether pin is logically inverted. +The exact meaning of each specifier cell is controller specific, and must be +documented in the device tree binding for the device, but it is strongly +recommended to use the two-cell approach. -Exact meaning of each specifier cell is controller specific, and must -be documented in the device tree binding for the device. - -Most controllers are however specifying a generic flag bitfield -in the last cell, so for these, use the macros defined in +Most controllers are specifying a generic flag bitfield in the last cell, so +for these, use the macros defined in include/dt-bindings/gpio/gpio.h whenever possible: Example of a node using GPIOs: @@ -236,46 +221,40 @@ Example of two SOC GPIO banks defined as gpio-controller nodes: Some or all of the GPIOs provided by a GPIO controller may be routed to pins on the package via a pin controller. This allows muxing those pins between -GPIO and other functions. +GPIO and other functions. It is a fairly common practice among silicon +engineers. + +2.2) Ordinary (numerical) GPIO ranges +------------------------------------- It is useful to represent which GPIOs correspond to which pins on which pin -controllers. The gpio-ranges property described below represents this, and -contains information structures as follows: - - gpio-range-list ::= [gpio-range-list] - single-gpio-range ::= | - numeric-gpio-range ::= - - named-gpio-range ::= '<0 0>' - pinctrl-phandle : phandle to pin controller node - gpio-base : Base GPIO ID in the GPIO controller - pinctrl-base : Base pinctrl pin ID in the pin controller - count : The number of GPIOs/pins in this range - -The "pin controller node" mentioned above must conform to the bindings -described in ../pinctrl/pinctrl-bindings.txt. - -In case named gpio ranges are used (ranges with both and - set to 0), the property gpio-ranges-group-names contains one string -for every single-gpio-range in gpio-ranges: - gpiorange-names-list ::= [gpiorange-names-list] - gpiorange-name : Name of the pingroup associated to the GPIO range in - the respective pin controller. - -Elements of gpiorange-names-list corresponding to numeric ranges contain -the empty string. Elements of gpiorange-names-list corresponding to named -ranges contain the name of a pin group defined in the respective pin -controller. The number of pins/GPIOs in the range is the number of pins in -that pin group. +controllers. The gpio-ranges property described below represents this with +a discrete set of ranges mapping pins from the pin controller local number space +to pins in the GPIO controller local number space. -Previous versions of this binding required all pin controller nodes that -were referenced by any gpio-ranges property to contain a property named -#gpio-range-cells with value <3>. This requirement is now deprecated. -However, that property may still exist in older device trees for -compatibility reasons, and would still be required even in new device -trees that need to be compatible with older software. +The format is: <[pin controller phandle], [GPIO controller offset], + [pin controller offset], [number of pins]>; + +The GPIO controller offset pertains to the GPIO controller node containing the +range definition. + +The pin controller node referenced by the phandle must conform to the bindings +described in pinctrl/pinctrl-bindings.txt. + +Each offset runs from 0 to N. It is perfectly fine to pile any number of +ranges with just one pin-to-GPIO line mapping if the ranges are concocted, but +in practice these ranges are often lumped in discrete sets. + +Example: + + gpio-ranges = <&foo 0 20 10>, <&bar 10 50 20>; -Example 1: +This means: +- pins 20..29 on pin controller "foo" is mapped to GPIO line 0..9 and +- pins 50..69 on pin controller "bar" is mapped to GPIO line 10..29 + + +Verbose example: qe_pio_e: gpio-controller@1460 { #gpio-cells = <2>; @@ -289,7 +268,28 @@ Here, a single GPIO controller has GPIOs 0..9 routed to pin controller pinctrl1's pins 20..29, and GPIOs 10..29 routed to pin controller pinctrl2's pins 50..69. -Example 2: + +2.3) GPIO ranges from named pin groups +-------------------------------------- + +It is also possible to use pin groups for gpio ranges when pin groups are the +easiest and most convenient mapping. + +Both both and must set to 0 when using named pin groups +names. + +The property gpio-ranges-group-names must contain exactly one string for each +range. + +Elements of gpio-ranges-group-names must contain the name of a pin group +defined in the respective pin controller. The number of pins/GPIO lines in the +range is the number of pins in that pin group. The number of pins of that +group is defined int the implementation and not in the device tree. + +If numerical and named pin groups are mixed, the string corresponding to a +numerical pin range in gpio-ranges-group-names must be empty. + +Example: gpio_pio_i: gpio-controller@14b0 { #gpio-cells = <2>; @@ -306,6 +306,14 @@ Example 2: "bar"; }; -Here, three GPIO ranges are defined wrt. two pin controllers. pinctrl1 GPIO -ranges are defined using pin numbers whereas the GPIO ranges wrt. pinctrl2 -are named "foo" and "bar". +Here, three GPIO ranges are defined referring to two pin controllers. + +pinctrl1 GPIO ranges are defined using pin numbers whereas the GPIO ranges +in pinctrl2 are defined using the pin groups named "foo" and "bar". + +Previous versions of this binding required all pin controller nodes that +were referenced by any gpio-ranges property to contain a property named +#gpio-range-cells with value <3>. This requirement is now deprecated. +However, that property may still exist in older device trees for +compatibility reasons, and would still be required even in new device +trees that need to be compatible with older software. diff --git a/Documentation/devicetree/bindings/gpio/ingenic,gpio.txt b/Documentation/devicetree/bindings/gpio/ingenic,gpio.txt deleted file mode 100644 index 7988aeb725f4..000000000000 --- a/Documentation/devicetree/bindings/gpio/ingenic,gpio.txt +++ /dev/null @@ -1,46 +0,0 @@ -Ingenic jz47xx GPIO controller - -That the Ingenic GPIO driver node must be a sub-node of the Ingenic pinctrl -driver node. - -Required properties: --------------------- - - - compatible: Must contain one of: - - "ingenic,jz4740-gpio" - - "ingenic,jz4770-gpio" - - "ingenic,jz4780-gpio" - - reg: The GPIO bank number. - - interrupt-controller: Marks the device node as an interrupt controller. - - interrupts: Interrupt specifier for the controllers interrupt. - - #interrupt-cells: Should be 2. Refer to - ../interrupt-controller/interrupts.txt for more details. - - gpio-controller: Marks the device node as a GPIO controller. - - #gpio-cells: Should be 2. The first cell is the GPIO number and the second - cell specifies GPIO flags, as defined in . Only the - GPIO_ACTIVE_HIGH and GPIO_ACTIVE_LOW flags are supported. - - gpio-ranges: Range of pins managed by the GPIO controller. Refer to - 'gpio.txt' in this directory for more details. - -Example: --------- - -&pinctrl { - #address-cells = <1>; - #size-cells = <0>; - - gpa: gpio@0 { - compatible = "ingenic,jz4740-gpio"; - reg = <0>; - - gpio-controller; - gpio-ranges = <&pinctrl 0 0 32>; - #gpio-cells = <2>; - - interrupt-controller; - #interrupt-cells = <2>; - - interrupt-parent = <&intc>; - interrupts = <28>; - }; -}; diff --git a/Documentation/devicetree/bindings/gpio/renesas,gpio-rcar.txt b/Documentation/devicetree/bindings/gpio/renesas,gpio-rcar.txt index 4018ee57a6af..2889bbcd7416 100644 --- a/Documentation/devicetree/bindings/gpio/renesas,gpio-rcar.txt +++ b/Documentation/devicetree/bindings/gpio/renesas,gpio-rcar.txt @@ -4,8 +4,10 @@ Required Properties: - compatible: should contain one or more of the following: - "renesas,gpio-r8a7743": for R8A7743 (RZ/G1M) compatible GPIO controller. + - "renesas,gpio-r8a7744": for R8A7744 (RZ/G1N) compatible GPIO controller. - "renesas,gpio-r8a7745": for R8A7745 (RZ/G1E) compatible GPIO controller. - "renesas,gpio-r8a77470": for R8A77470 (RZ/G1C) compatible GPIO controller. + - "renesas,gpio-r8a774a1": for R8A774A1 (RZ/G2M) compatible GPIO controller. - "renesas,gpio-r8a7778": for R8A7778 (R-Car M1) compatible GPIO controller. - "renesas,gpio-r8a7779": for R8A7779 (R-Car H1) compatible GPIO controller. - "renesas,gpio-r8a7790": for R8A7790 (R-Car H2) compatible GPIO controller. @@ -22,7 +24,7 @@ Required Properties: - "renesas,gpio-r8a77995": for R8A77995 (R-Car D3) compatible GPIO controller. - "renesas,rcar-gen1-gpio": for a generic R-Car Gen1 GPIO controller. - "renesas,rcar-gen2-gpio": for a generic R-Car Gen2 or RZ/G1 GPIO controller. - - "renesas,rcar-gen3-gpio": for a generic R-Car Gen3 GPIO controller. + - "renesas,rcar-gen3-gpio": for a generic R-Car Gen3 or RZ/G2 GPIO controller. - "renesas,gpio-rcar": deprecated. When compatible with the generic version nodes must list the @@ -38,7 +40,7 @@ Required Properties: - #gpio-cells: Should be 2. The first cell is the GPIO number and the second cell specifies GPIO flags, as defined in . Only the GPIO_ACTIVE_HIGH and GPIO_ACTIVE_LOW flags are supported. - - gpio-ranges: Range of pins managed by the GPIO controller. + - gpio-ranges: See gpio.txt. Optional properties: @@ -46,35 +48,44 @@ Optional properties: mandatory if the hardware implements a controllable functional clock for the GPIO instance. -Please refer to gpio.txt in this directory for details of gpio-ranges property -and the common GPIO bindings used by client devices. + - gpio-reserved-ranges: See gpio.txt. + +Please refer to gpio.txt in this directory for the common GPIO bindings used by +client devices. The GPIO controller also acts as an interrupt controller. It uses the default two cells specifier as described in Documentation/devicetree/bindings/ interrupt-controller/interrupts.txt. -Example: R8A7779 (R-Car H1) GPIO controller nodes +Example: R8A77470 (RZ/G1C) GPIO controller nodes - gpio0: gpio@ffc40000 { - compatible = "renesas,gpio-r8a7779", "renesas,rcar-gen1-gpio"; - reg = <0xffc40000 0x2c>; - interrupt-parent = <&gic>; - interrupts = <0 141 0x4>; - #gpio-cells = <2>; - gpio-controller; - gpio-ranges = <&pfc 0 0 32>; - interrupt-controller; - #interrupt-cells = <2>; - }; + gpio0: gpio@e6050000 { + compatible = "renesas,gpio-r8a77470", + "renesas,rcar-gen2-gpio"; + reg = <0 0xe6050000 0 0x50>; + interrupts = ; + #gpio-cells = <2>; + gpio-controller; + gpio-ranges = <&pfc 0 0 23>; + #interrupt-cells = <2>; + interrupt-controller; + clocks = <&cpg CPG_MOD 912>; + power-domains = <&sysc R8A77470_PD_ALWAYS_ON>; + resets = <&cpg 912>; + }; ... - gpio6: gpio@ffc46000 { - compatible = "renesas,gpio-r8a7779", "renesas,rcar-gen1-gpio"; - reg = <0xffc46000 0x2c>; - interrupt-parent = <&gic>; - interrupts = <0 147 0x4>; - #gpio-cells = <2>; - gpio-controller; - gpio-ranges = <&pfc 0 192 9>; - interrupt-controller; - #interrupt-cells = <2>; - }; + gpio3: gpio@e6053000 { + compatible = "renesas,gpio-r8a77470", + "renesas,rcar-gen2-gpio"; + reg = <0 0xe6053000 0 0x50>; + interrupts = ; + #gpio-cells = <2>; + gpio-controller; + gpio-ranges = <&pfc 0 96 30>; + gpio-reserved-ranges = <17 10>; + #interrupt-cells = <2>; + interrupt-controller; + clocks = <&cpg CPG_MOD 909>; + power-domains = <&sysc R8A77470_PD_ALWAYS_ON>; + resets = <&cpg 909>; + }; diff --git a/Documentation/devicetree/bindings/gpio/snps,creg-gpio.txt b/Documentation/devicetree/bindings/gpio/snps,creg-gpio.txt new file mode 100644 index 000000000000..1b30812b015b --- /dev/null +++ b/Documentation/devicetree/bindings/gpio/snps,creg-gpio.txt @@ -0,0 +1,21 @@ +Synopsys GPIO via CREG (Control REGisters) driver + +Required properties: +- compatible : "snps,creg-gpio-hsdk" or "snps,creg-gpio-axs10x". +- reg : Exactly one register range with length 0x4. +- #gpio-cells : Since the generic GPIO binding is used, the + amount of cells must be specified as 2. The first cell is the + pin number, the second cell is used to specify optional parameters: + See "gpio-specifier" in .../devicetree/bindings/gpio/gpio.txt. +- gpio-controller : Marks the device node as a GPIO controller. +- ngpios: Number of GPIO pins. + +Example: + +gpio: gpio@f00014b0 { + compatible = "snps,creg-gpio-hsdk"; + reg = <0xf00014b0 0x4>; + gpio-controller; + #gpio-cells = <2>; + ngpios = <2>; +}; diff --git a/Documentation/devicetree/bindings/hwmon/ina3221.txt b/Documentation/devicetree/bindings/hwmon/ina3221.txt new file mode 100644 index 000000000000..a7b25caa2b8e --- /dev/null +++ b/Documentation/devicetree/bindings/hwmon/ina3221.txt @@ -0,0 +1,44 @@ +Texas Instruments INA3221 Device Tree Bindings + +1) ina3221 node + Required properties: + - compatible: Must be "ti,ina3221" + - reg: I2C address + + Optional properties: + = The node contains optional child nodes for three channels = + = Each child node describes the information of input source = + + - #address-cells: Required only if a child node is present. Must be 1. + - #size-cells: Required only if a child node is present. Must be 0. + +2) child nodes + Required properties: + - reg: Must be 0, 1 or 2, corresponding to IN1, IN2 or IN3 port of INA3221 + + Optional properties: + - label: Name of the input source + - shunt-resistor-micro-ohms: Shunt resistor value in micro-Ohm + +Example: + +ina3221@40 { + compatible = "ti,ina3221"; + reg = <0x40>; + #address-cells = <1>; + #size-cells = <0>; + + input@0 { + reg = <0x0>; + status = "disabled"; + }; + input@1 { + reg = <0x1>; + shunt-resistor-micro-ohms = <5000>; + }; + input@2 { + reg = <0x2>; + label = "VDD_5V"; + shunt-resistor-micro-ohms = <5000>; + }; +}; diff --git a/Documentation/devicetree/bindings/hwmon/ltc2978.txt b/Documentation/devicetree/bindings/hwmon/ltc2978.txt index bf2a47bbdc58..b428a70a7cc0 100644 --- a/Documentation/devicetree/bindings/hwmon/ltc2978.txt +++ b/Documentation/devicetree/bindings/hwmon/ltc2978.txt @@ -15,6 +15,7 @@ Required properties: * "lltc,ltm2987" * "lltc,ltm4675" * "lltc,ltm4676" + * "lltc,ltm4686" - reg: I2C slave address Optional properties: @@ -30,6 +31,7 @@ Valid names of regulators depend on number of supplies supported per device: * ltc3880, ltc3882, ltc3886 : vout0 - vout1 * ltc3883 : vout0 * ltm4676 : vout0 - vout1 + * ltm4686 : vout0 - vout1 Example: ltc2978@5e { diff --git a/Documentation/devicetree/bindings/i2c/i2c-designware.txt b/Documentation/devicetree/bindings/i2c/i2c-designware.txt index fbb0a6d8b964..3e4bcc2fb6f7 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-designware.txt +++ b/Documentation/devicetree/bindings/i2c/i2c-designware.txt @@ -3,6 +3,7 @@ Required properties : - compatible : should be "snps,designware-i2c" + or "mscc,ocelot-i2c" with "snps,designware-i2c" for fallback - reg : Offset and length of the register set for the device - interrupts : where IRQ is the interrupt number. @@ -11,8 +12,12 @@ Recommended properties : - clock-frequency : desired I2C bus clock frequency in Hz. Optional properties : + - reg : for "mscc,ocelot-i2c", a second register set to configure the SDA hold + time, named ICPU_CFG:TWI_DELAY in the datasheet. + - i2c-sda-hold-time-ns : should contain the SDA hold time in nanoseconds. - This option is only supported in hardware blocks version 1.11a or newer. + This option is only supported in hardware blocks version 1.11a or newer and + on Microsemi SoCs ("mscc,ocelot-i2c" compatible). - i2c-scl-falling-time-ns : should contain the SCL falling time in nanoseconds. This value which is by default 300ns is used to compute the tLOW period. diff --git a/Documentation/devicetree/bindings/i2c/i2c-imx-lpi2c.txt b/Documentation/devicetree/bindings/i2c/i2c-imx-lpi2c.txt index 091c8dfd3229..b245363d6d60 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-imx-lpi2c.txt +++ b/Documentation/devicetree/bindings/i2c/i2c-imx-lpi2c.txt @@ -3,6 +3,7 @@ Required properties: - compatible : - "fsl,imx7ulp-lpi2c" for LPI2C compatible with the one integrated on i.MX7ULP soc + - "fsl,imx8qxp-lpi2c" for LPI2C compatible with the one integrated on i.MX8QXP soc - reg : address and length of the lpi2c master registers - interrupts : lpi2c interrupt - clocks : lpi2c clock specifier diff --git a/Documentation/devicetree/bindings/i2c/i2c-omap.txt b/Documentation/devicetree/bindings/i2c/i2c-omap.txt index 7e49839d4124..4b90ba9f31b7 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-omap.txt +++ b/Documentation/devicetree/bindings/i2c/i2c-omap.txt @@ -1,8 +1,12 @@ I2C for OMAP platforms Required properties : -- compatible : Must be "ti,omap2420-i2c", "ti,omap2430-i2c", "ti,omap3-i2c" - or "ti,omap4-i2c" +- compatible : Must be + "ti,omap2420-i2c" for OMAP2420 SoCs + "ti,omap2430-i2c" for OMAP2430 SoCs + "ti,omap3-i2c" for OMAP3 SoCs + "ti,omap4-i2c" for OMAP4+ SoCs + "ti,am654-i2c", "ti,omap4-i2c" for AM654 SoCs - ti,hwmods : Must be "i2c", n being the instance number (1-based) - #address-cells = <1>; - #size-cells = <0>; diff --git a/Documentation/devicetree/bindings/i2c/i2c-rcar.txt b/Documentation/devicetree/bindings/i2c/i2c-rcar.txt index 39cd21d95810..30c0485b167b 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-rcar.txt +++ b/Documentation/devicetree/bindings/i2c/i2c-rcar.txt @@ -3,7 +3,9 @@ I2C for R-Car platforms Required properties: - compatible: "renesas,i2c-r8a7743" if the device is a part of a R8A7743 SoC. + "renesas,i2c-r8a7744" if the device is a part of a R8A7744 SoC. "renesas,i2c-r8a7745" if the device is a part of a R8A7745 SoC. + "renesas,i2c-r8a77470" if the device is a part of a R8A77470 SoC. "renesas,i2c-r8a774a1" if the device is a part of a R8A774A1 SoC. "renesas,i2c-r8a7778" if the device is a part of a R8A7778 SoC. "renesas,i2c-r8a7779" if the device is a part of a R8A7779 SoC. diff --git a/Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt b/Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt index 872673adff5a..d81b62643655 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt +++ b/Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt @@ -5,6 +5,7 @@ Required properties: - "renesas,iic-r8a73a4" (R-Mobile APE6) - "renesas,iic-r8a7740" (R-Mobile A1) - "renesas,iic-r8a7743" (RZ/G1M) + - "renesas,iic-r8a7744" (RZ/G1N) - "renesas,iic-r8a7745" (RZ/G1E) - "renesas,iic-r8a774a1" (RZ/G2M) - "renesas,iic-r8a7790" (R-Car H2) diff --git a/Documentation/devicetree/bindings/i2c/i2c.txt b/Documentation/devicetree/bindings/i2c/i2c.txt index 11263982470e..44efafdfd7f5 100644 --- a/Documentation/devicetree/bindings/i2c/i2c.txt +++ b/Documentation/devicetree/bindings/i2c/i2c.txt @@ -84,7 +84,7 @@ Binding may contain optional "interrupts" property, describing interrupts used by the device. I2C core will assign "irq" interrupt (or the very first interrupt if not using interrupt names) as primary interrupt for the slave. -Alternatively, devices supporting SMbus Host Notify, and connected to +Alternatively, devices supporting SMBus Host Notify, and connected to adapters that support this feature, may use "host-notify" property. I2C core will create a virtual interrupt for Host Notify and assign it as primary interrupt for the slave. diff --git a/Documentation/devicetree/bindings/iio/accel/adxl372.txt b/Documentation/devicetree/bindings/iio/accel/adxl372.txt new file mode 100644 index 000000000000..a289964756a7 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/accel/adxl372.txt @@ -0,0 +1,33 @@ +Analog Devices ADXL372 3-Axis, +/-(200g) Digital Accelerometer + +http://www.analog.com/media/en/technical-documentation/data-sheets/adxl372.pdf + +Required properties: + - compatible : should be "adi,adxl372" + - reg: the I2C address or SPI chip select number for the device + +Required properties for SPI bus usage: + - spi-max-frequency: Max SPI frequency to use + +Optional properties: + - interrupts: interrupt mapping for IRQ as documented in + Documentation/devicetree/bindings/interrupt-controller/interrupts.txt + +Example for a I2C device node: + + accelerometer@53 { + compatible = "adi,adxl372"; + reg = <0x53>; + interrupt-parent = <&gpio>; + interrupts = <25 IRQ_TYPE_EDGE_FALLING>; + }; + +Example for a SPI device node: + + accelerometer@0 { + compatible = "adi,adxl372"; + reg = <0>; + spi-max-frequency = <1000000>; + interrupt-parent = <&gpio>; + interrupts = <25 IRQ_TYPE_EDGE_FALLING>; + }; diff --git a/Documentation/devicetree/bindings/iio/adc/mcp3911.txt b/Documentation/devicetree/bindings/iio/adc/mcp3911.txt new file mode 100644 index 000000000000..3071f48fb30b --- /dev/null +++ b/Documentation/devicetree/bindings/iio/adc/mcp3911.txt @@ -0,0 +1,30 @@ +* Microchip MCP3911 Dual channel analog front end (ADC) + +Required properties: + - compatible: Should be "microchip,mcp3911" + - reg: SPI chip select number for the device + +Recommended properties: + - spi-max-frequency: Definition as per + Documentation/devicetree/bindings/spi/spi-bus.txt. + Max frequency for this chip is 20MHz. + +Optional properties: + - clocks: Phandle and clock identifier for sampling clock + - interrupt-parent: Phandle to the parent interrupt controller + - interrupts: IRQ line for the ADC + - microchip,device-addr: Device address when multiple MCP3911 chips are present on the + same SPI bus. Valid values are 0-3. Defaults to 0. + - vref-supply: Phandle to the external reference voltage supply. + +Example: +adc@0 { + compatible = "microchip,mcp3911"; + reg = <0>; + interrupt-parent = <&gpio5>; + interrupts = <15 IRQ_TYPE_EDGE_RISING>; + spi-max-frequency = <20000000>; + microchip,device-addr = <0>; + vref-supply = <&vref_reg>; + clocks = <&xtal>; +}; diff --git a/Documentation/devicetree/bindings/iio/adc/qcom,spmi-vadc.txt b/Documentation/devicetree/bindings/iio/adc/qcom,spmi-vadc.txt index 0fb46137f936..b3c86f4ac7cd 100644 --- a/Documentation/devicetree/bindings/iio/adc/qcom,spmi-vadc.txt +++ b/Documentation/devicetree/bindings/iio/adc/qcom,spmi-vadc.txt @@ -1,7 +1,9 @@ -Qualcomm's SPMI PMIC voltage ADC +Qualcomm's SPMI PMIC ADC -SPMI PMIC voltage ADC (VADC) provides interface to clients to read -voltage. The VADC is a 15-bit sigma-delta ADC. +- SPMI PMIC voltage ADC (VADC) provides interface to clients to read + voltage. The VADC is a 15-bit sigma-delta ADC. +- SPMI PMIC5 voltage ADC (ADC) provides interface to clients to read + voltage. The VADC is a 16-bit sigma-delta ADC. VADC node: @@ -9,11 +11,13 @@ VADC node: Usage: required Value type: Definition: Should contain "qcom,spmi-vadc". + Should contain "qcom,spmi-adc5" for PMIC5 ADC driver. + Should contain "qcom,spmi-adc-rev2" for PMIC rev2 ADC driver. - reg: Usage: required Value type: - Definition: VADC base address and length in the SPMI PMIC register map. + Definition: VADC base address in the SPMI PMIC register map. - #address-cells: Usage: required @@ -45,13 +49,26 @@ Channel node properties: Definition: ADC channel number. See include/dt-bindings/iio/qcom,spmi-vadc.h +- label: + Usage: required for "qcom,spmi-adc5" and "qcom,spmi-adc-rev2" + Value type: + Definition: ADC input of the platform as seen in the schematics. + For thermistor inputs connected to generic AMUX or GPIO inputs + these can vary across platform for the same pins. Hence select + the platform schematics name for this channel. + - qcom,decimation: Usage: optional Value type: Definition: This parameter is used to decrease ADC sampling rate. Quicker measurements can be made by reducing decimation ratio. - Valid values are 512, 1024, 2048, 4096. - If property is not found, default value of 512 will be used. + - For compatible property "qcom,spmi-vadc", valid values are + 512, 1024, 2048, 4096. If property is not found, default value + of 512 will be used. + - For compatible property "qcom,spmi-adc5", valid values are 250, 420 + and 840. If property is not found, default value of 840 is used. + - For compatible property "qcom,spmi-adc-rev2", valid values are 256, + 512 and 1024. If property is not present, default value is 1024. - qcom,pre-scaling: Usage: optional @@ -66,21 +83,38 @@ Channel node properties: - qcom,ratiometric: Usage: optional Value type: - Definition: Channel calibration type. If this property is specified - VADC will use the VDD reference (1.8V) and GND for channel - calibration. If property is not found, channel will be - calibrated with 0.625V and 1.25V reference channels, also - known as absolute calibration. + Definition: Channel calibration type. + - For compatible property "qcom,spmi-vadc", if this property is + specified VADC will use the VDD reference (1.8V) and GND for + channel calibration. If property is not found, channel will be + calibrated with 0.625V and 1.25V reference channels, also + known as absolute calibration. + - For compatible property "qcom,spmi-adc5" and "qcom,spmi-adc-rev2", + if this property is specified VADC will use the VDD reference + (1.875V) and GND for channel calibration. If property is not found, + channel will be calibrated with 0V and 1.25V reference channels, + also known as absolute calibration. - qcom,hw-settle-time: Usage: optional Value type: Definition: Time between AMUX getting configured and the ADC starting - conversion. Delay = 100us * (value) for value < 11, and - 2ms * (value - 10) otherwise. - Valid values are: 0, 100, 200, 300, 400, 500, 600, 700, 800, - 900 us and 1, 2, 4, 6, 8, 10 ms - If property is not found, channel will use 0us. + conversion. The 'hw_settle_time' is an index used from valid values + and programmed in hardware to achieve the hardware settling delay. + - For compatible property "qcom,spmi-vadc" and "qcom,spmi-adc-rev2", + Delay = 100us * (hw_settle_time) for hw_settle_time < 11, + and 2ms * (hw_settle_time - 10) otherwise. + Valid values are: 0, 100, 200, 300, 400, 500, 600, 700, 800, + 900 us and 1, 2, 4, 6, 8, 10 ms. + If property is not found, channel will use 0us. + - For compatible property "qcom,spmi-adc5", delay = 15us for + value 0, 100us * (value) for values < 11, + and 2ms * (value - 10) otherwise. + Valid values are: 15, 100, 200, 300, 400, 500, 600, 700, 800, + 900 us and 1, 2, 4, 6, 8, 10 ms + Certain controller digital versions have valid values of + 15, 100, 200, 300, 400, 500, 600, 700, 1, 2, 4, 8, 16, 32, 64, 128 ms + If property is not found, channel will use 15us. - qcom,avg-samples: Usage: optional @@ -89,13 +123,18 @@ Channel node properties: Averaging provides the option to obtain a single measurement from the ADC that is an average of multiple samples. The value selected is 2^(value). - Valid values are: 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 - If property is not found, 1 sample will be used. + - For compatible property "qcom,spmi-vadc", valid values + are: 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 + If property is not found, 1 sample will be used. + - For compatible property "qcom,spmi-adc5" and "qcom,spmi-adc-rev2", + valid values are: 1, 2, 4, 8, 16 + If property is not found, 1 sample will be used. NOTE: -Following channels, also known as reference point channels, are used for -result calibration and their channel configuration nodes should be defined: +For compatible property "qcom,spmi-vadc" following channels, also known as +reference point channels, are used for result calibration and their channel +configuration nodes should be defined: VADC_REF_625MV and/or VADC_SPARE1(based on PMIC version) VADC_REF_1250MV, VADC_GND_REF and VADC_VDD_VADC. @@ -104,7 +143,7 @@ Example: /* VADC node */ pmic_vadc: vadc@3100 { compatible = "qcom,spmi-vadc"; - reg = <0x3100 0x100>; + reg = <0x3100>; interrupts = <0x0 0x31 0x0 IRQ_TYPE_EDGE_RISING>; #address-cells = <1>; #size-cells = <0>; diff --git a/Documentation/devicetree/bindings/iio/adc/sprd,sc27xx-adc.txt b/Documentation/devicetree/bindings/iio/adc/sprd,sc27xx-adc.txt index 8aad960de50b..b4daa15dcf15 100644 --- a/Documentation/devicetree/bindings/iio/adc/sprd,sc27xx-adc.txt +++ b/Documentation/devicetree/bindings/iio/adc/sprd,sc27xx-adc.txt @@ -12,6 +12,8 @@ Required properties: - interrupts: The interrupt number for the ADC device. - #io-channel-cells: Number of cells in an IIO specifier. - hwlocks: Reference to a phandle of a hwlock provider node. +- nvmem-cells: A phandle to the calibration cells provided by eFuse device. +- nvmem-cell-names: Should be "big_scale_calib", "small_scale_calib". Example: @@ -32,5 +34,7 @@ Example: interrupts = <0 IRQ_TYPE_LEVEL_HIGH>; #io-channel-cells = <1>; hwlocks = <&hwlock 4>; + nvmem-cells = <&adc_big_scale>, <&adc_small_scale>; + nvmem-cell-names = "big_scale_calib", "small_scale_calib"; }; }; diff --git a/Documentation/devicetree/bindings/iio/dac/ad5758.txt b/Documentation/devicetree/bindings/iio/dac/ad5758.txt index bba01a5cab1b..2f607f41f9d3 100644 --- a/Documentation/devicetree/bindings/iio/dac/ad5758.txt +++ b/Documentation/devicetree/bindings/iio/dac/ad5758.txt @@ -50,6 +50,9 @@ Required properties: Optional properties: + - reset-gpios : GPIO spec for the RESET pin. If specified, it will be + asserted during driver probe. + - adi,dc-dc-ilim-microamp: The dc-to-dc converter current limit The following values are currently supported [uA]: * 150000 @@ -71,6 +74,8 @@ AD5758 Example: spi-max-frequency = <1000000>; spi-cpha; + reset-gpios = <&gpio 22 0>; + adi,dc-dc-mode = <2>; adi,range-microvolt = <0 10000000>; adi,dc-dc-ilim-microamp = <200000>; diff --git a/Documentation/devicetree/bindings/iio/dac/ltc1660.txt b/Documentation/devicetree/bindings/iio/dac/ltc1660.txt new file mode 100644 index 000000000000..c5b5f22d6c64 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/dac/ltc1660.txt @@ -0,0 +1,21 @@ +* Linear Technology Micropower octal 8-Bit and 10-Bit DACs + +Required properties: + - compatible: Must be one of the following: + "lltc,ltc1660" + "lltc,ltc1665" + - reg: SPI chip select number for the device + - vref-supply: Phandle to the voltage reference supply + +Recommended properties: + - spi-max-frequency: Definition as per + Documentation/devicetree/bindings/spi/spi-bus.txt. + Max frequency for this chip is 5 MHz. + +Example: +dac@0 { + compatible = "lltc,ltc1660"; + reg = <0>; + spi-max-frequency = <5000000>; + vref-supply = <&vref_reg>; +}; diff --git a/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt b/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt index b2f27da847b8..6ab9a9d196b0 100644 --- a/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt +++ b/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt @@ -20,6 +20,7 @@ Required properties: bindings. Optional properties: + - vddio-supply: regulator phandle for VDDIO supply - mount-matrix: an optional 3x3 mounting rotation matrix - i2c-gate node. These devices also support an auxiliary i2c bus. This is simple enough to be described using the i2c-gate binding. See diff --git a/Documentation/devicetree/bindings/iio/imu/st_lsm6dsx.txt b/Documentation/devicetree/bindings/iio/imu/st_lsm6dsx.txt index ea2d6e0ae4c5..879322ad50fd 100644 --- a/Documentation/devicetree/bindings/iio/imu/st_lsm6dsx.txt +++ b/Documentation/devicetree/bindings/iio/imu/st_lsm6dsx.txt @@ -7,6 +7,7 @@ Required properties: "st,lsm6dsl" "st,lsm6dsm" "st,ism330dlc" + "st,lsm6dso" - reg: i2c address of the sensor / spi cs line Optional properties: diff --git a/Documentation/devicetree/bindings/iio/light/bh1750.txt b/Documentation/devicetree/bindings/iio/light/bh1750.txt new file mode 100644 index 000000000000..1e7685797d7a --- /dev/null +++ b/Documentation/devicetree/bindings/iio/light/bh1750.txt @@ -0,0 +1,18 @@ +ROHM BH1750 - ALS, Ambient light sensor + +Required properties: + +- compatible: Must be one of: + "rohm,bh1710" + "rohm,bh1715" + "rohm,bh1721" + "rohm,bh1750" + "rohm,bh1751" +- reg: the I2C address of the sensor + +Example: + +light-sensor@23 { + compatible = "rohm,bh1750"; + reg = <0x23>; +}; diff --git a/Documentation/devicetree/bindings/iio/light/tsl2772.txt b/Documentation/devicetree/bindings/iio/light/tsl2772.txt new file mode 100644 index 000000000000..1c5e6f17a1df --- /dev/null +++ b/Documentation/devicetree/bindings/iio/light/tsl2772.txt @@ -0,0 +1,42 @@ +* AMS/TAOS ALS and proximity sensor + +Required properties: + + - compatible: Should be one of + "amstaos,tsl2571" + "amstaos,tsl2671" + "amstaos,tmd2671" + "amstaos,tsl2771" + "amstaos,tmd2771" + "amstaos,tsl2572" + "amstaos,tsl2672" + "amstaos,tmd2672" + "amstaos,tsl2772" + "amstaos,tmd2772" + "avago,apds9930" + - reg: the I2C address of the device + +Optional properties: + + - amstaos,proximity-diodes - proximity diodes to enable. <0>, <1>, or <0 1> + are the only valid values. + - led-max-microamp - current for the proximity LED. Must be 100000, 50000, + 25000, or 13000. + - vdd-supply: phandle to the regulator that provides power to the sensor. + - vddio-supply: phandle to the regulator that provides power to the bus. + - interrupts: the sole interrupt generated by the device + + Refer to interrupt-controller/interrupts.txt for generic interrupt client + node bindings. + +Example: + +tsl2772@39 { + compatible = "amstaos,tsl2772"; + reg = <0x39>; + interrupts-extended = <&msmgpio 61 IRQ_TYPE_EDGE_FALLING>; + vdd-supply = <&pm8941_l17>; + vddio-supply = <&pm8941_lvs1>; + amstaos,proximity-diodes = <0>; + led-max-microamp = <100000>; +}; diff --git a/Documentation/devicetree/bindings/iio/proximity/vl53l0x.txt b/Documentation/devicetree/bindings/iio/proximity/vl53l0x.txt new file mode 100644 index 000000000000..aac5f621f8dc --- /dev/null +++ b/Documentation/devicetree/bindings/iio/proximity/vl53l0x.txt @@ -0,0 +1,12 @@ +ST VL53L0X ToF ranging sensor + +Required properties: + - compatible: must be "st,vl53l0x" + - reg: i2c address where to find the device + +Example: + +vl53l0x@29 { + compatible = "st,vl53l0x"; + reg = <0x29>; +}; diff --git a/Documentation/devicetree/bindings/input/pwm-vibrator.txt b/Documentation/devicetree/bindings/input/pwm-vibrator.txt index 09145d18491d..88c775a3fe21 100644 --- a/Documentation/devicetree/bindings/input/pwm-vibrator.txt +++ b/Documentation/devicetree/bindings/input/pwm-vibrator.txt @@ -58,8 +58,8 @@ Example from Motorola Droid 4: vibrator { compatible = "pwm-vibrator"; - pwms = <&pwm8 0 1000000000 0>, - <&pwm9 0 1000000000 0>; + pwms = <&pwm9 0 1000000000 0>, + <&pwm8 0 1000000000 0>; pwm-names = "enable", "direction"; direction-duty-cycle-ns = <1000000000>; }; diff --git a/Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt b/Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt index d092d5d033a0..8641a2d70851 100644 --- a/Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt +++ b/Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt @@ -1,10 +1,12 @@ General Touchscreen Properties: Optional properties for Touchscreens: + - touchscreen-min-x : minimum x coordinate reported (0 if not set) + - touchscreen-min-y : minimum y coordinate reported (0 if not set) - touchscreen-size-x : horizontal resolution of touchscreen - (in pixels) + (maximum x coordinate reported + 1) - touchscreen-size-y : vertical resolution of touchscreen - (in pixels) + (maximum y coordinate reported + 1) - touchscreen-max-pressure : maximum reported pressure (arbitrary range dependent on the controller) - touchscreen-min-pressure : minimum pressure on the touchscreen to be diff --git a/Documentation/devicetree/bindings/interrupt-controller/csky,apb-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/csky,apb-intc.txt new file mode 100644 index 000000000000..44286dcbac62 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/csky,apb-intc.txt @@ -0,0 +1,62 @@ +============================== +C-SKY APB Interrupt Controller +============================== + +C-SKY APB Interrupt Controller is a simple soc interrupt controller +on the apb bus and we only use it as root irq controller. + + - csky,apb-intc is used in a lot of csky fpgas and socs, it support 64 irq nums. + - csky,dual-apb-intc consists of 2 apb-intc and 128 irq nums supported. + - csky,gx6605s-intc is gx6605s soc internal irq interrupt controller, 64 irq nums. + +============================= +intc node bindings definition +============================= + + Description: Describes APB interrupt controller + + PROPERTIES + + - compatible + Usage: required + Value type: + Definition: must be "csky,apb-intc" + "csky,dual-apb-intc" + "csky,gx6605s-intc" + - #interrupt-cells + Usage: required + Value type: + Definition: must be <1> + - reg + Usage: required + Value type: + Definition: in soc from cpu view + - interrupt-controller: + Usage: required + - csky,support-pulse-signal: + Usage: select + Description: to support pulse signal flag + +Examples: +--------- + + intc: interrupt-controller@500000 { + compatible = "csky,apb-intc"; + #interrupt-cells = <1>; + reg = <0x00500000 0x400>; + interrupt-controller; + }; + + intc: interrupt-controller@500000 { + compatible = "csky,dual-apb-intc"; + #interrupt-cells = <1>; + reg = <0x00500000 0x400>; + interrupt-controller; + }; + + intc: interrupt-controller@500000 { + compatible = "csky,gx6605s-intc"; + #interrupt-cells = <1>; + reg = <0x00500000 0x400>; + interrupt-controller; + }; diff --git a/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt b/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt new file mode 100644 index 000000000000..ab921f1698fb --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt @@ -0,0 +1,40 @@ +=========================================== +C-SKY Multi-processors Interrupt Controller +=========================================== + +C-SKY Multi-processors Interrupt Controller is designed for ck807/ck810/ck860 +SMP soc, and it also could be used in non-SMP system. + +Interrupt number definition: + + 0-15 : software irq, and we use 15 as our IPI_IRQ. + 16-31 : private irq, and we use 16 as the co-processor timer. + 31-1024: common irq for soc ip. + +============================= +intc node bindings definition +============================= + + Description: Describes SMP interrupt controller + + PROPERTIES + + - compatible + Usage: required + Value type: + Definition: must be "csky,mpintc" + - #interrupt-cells + Usage: required + Value type: + Definition: must be <1> + - interrupt-controller: + Usage: required + +Examples: +--------- + + intc: interrupt-controller { + compatible = "csky,mpintc"; + #interrupt-cells = <1>; + interrupt-controller; + }; diff --git a/Documentation/devicetree/bindings/interrupt-controller/marvell,icu.txt b/Documentation/devicetree/bindings/interrupt-controller/marvell,icu.txt index aa8bf2ec8905..1c94a57a661e 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/marvell,icu.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/marvell,icu.txt @@ -5,6 +5,8 @@ The Marvell ICU (Interrupt Consolidation Unit) controller is responsible for collecting all wired-interrupt sources in the CP and communicating them to the GIC in the AP, the unit translates interrupt requests on input wires to MSG memory mapped transactions to the GIC. +These messages will access a different GIC memory area depending on +their type (NSR, SR, SEI, REI, etc). Required properties: @@ -12,20 +14,23 @@ Required properties: - reg: Should contain ICU registers location and length. -- #interrupt-cells: Specifies the number of cells needed to encode an - interrupt source. The value shall be 3. +Subnodes: Each group of interrupt is declared as a subnode of the ICU, +with their own compatible. + +Required properties for the icu_nsr/icu_sei subnodes: - The 1st cell is the group type of the ICU interrupt. Possible group - types are: +- compatible: Should be one of: + * "marvell,cp110-icu-nsr" + * "marvell,cp110-icu-sr" + * "marvell,cp110-icu-sei" + * "marvell,cp110-icu-rei" - ICU_GRP_NSR (0x0) : Shared peripheral interrupt, non-secure - ICU_GRP_SR (0x1) : Shared peripheral interrupt, secure - ICU_GRP_SEI (0x4) : System error interrupt - ICU_GRP_REI (0x5) : RAM error interrupt +- #interrupt-cells: Specifies the number of cells needed to encode an + interrupt source. The value shall be 2. - The 2nd cell is the index of the interrupt in the ICU unit. + The 1st cell is the index of the interrupt in the ICU unit. - The 3rd cell is the type of the interrupt. See arm,gic.txt for + The 2nd cell is the type of the interrupt. See arm,gic.txt for details. - interrupt-controller: Identifies the node as an interrupt @@ -35,17 +40,73 @@ Required properties: that allows to trigger interrupts using MSG memory mapped transactions. +Note: each 'interrupts' property referring to any 'icu_xxx' node shall + have a different number within [0:206]. + Example: icu: interrupt-controller@1e0000 { compatible = "marvell,cp110-icu"; - reg = <0x1e0000 0x10>; + reg = <0x1e0000 0x440>; + + CP110_LABEL(icu_nsr): interrupt-controller@10 { + compatible = "marvell,cp110-icu-nsr"; + reg = <0x10 0x20>; + #interrupt-cells = <2>; + interrupt-controller; + msi-parent = <&gicp>; + }; + + CP110_LABEL(icu_sei): interrupt-controller@50 { + compatible = "marvell,cp110-icu-sei"; + reg = <0x50 0x10>; + #interrupt-cells = <2>; + interrupt-controller; + msi-parent = <&sei>; + }; +}; + +node1 { + interrupt-parent = <&icu_nsr>; + interrupts = <106 IRQ_TYPE_LEVEL_HIGH>; +}; + +node2 { + interrupt-parent = <&icu_sei>; + interrupts = <107 IRQ_TYPE_LEVEL_HIGH>; +}; + +/* Would not work with the above nodes */ +node3 { + interrupt-parent = <&icu_nsr>; + interrupts = <107 IRQ_TYPE_LEVEL_HIGH>; +}; + +The legacy bindings were different in this way: + +- #interrupt-cells: The value was 3. + The 1st cell was the group type of the ICU interrupt. Possible + group types were: + ICU_GRP_NSR (0x0) : Shared peripheral interrupt, non-secure + ICU_GRP_SR (0x1) : Shared peripheral interrupt, secure + ICU_GRP_SEI (0x4) : System error interrupt + ICU_GRP_REI (0x5) : RAM error interrupt + The 2nd cell was the index of the interrupt in the ICU unit. + The 3rd cell was the type of the interrupt. See arm,gic.txt for + details. + +Example: + +icu: interrupt-controller@1e0000 { + compatible = "marvell,cp110-icu"; + reg = <0x1e0000 0x440>; + #interrupt-cells = <3>; interrupt-controller; msi-parent = <&gicp>; }; -usb3h0: usb3@500000 { +node1 { interrupt-parent = <&icu>; interrupts = ; }; diff --git a/Documentation/devicetree/bindings/interrupt-controller/marvell,sei.txt b/Documentation/devicetree/bindings/interrupt-controller/marvell,sei.txt new file mode 100644 index 000000000000..0beafed502f5 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/marvell,sei.txt @@ -0,0 +1,36 @@ +Marvell SEI (System Error Interrupt) Controller +----------------------------------------------- + +Marvell SEI (System Error Interrupt) controller is an interrupt +aggregator. It receives interrupts from several sources and aggregates +them to a single interrupt line (an SPI) on the parent interrupt +controller. + +This interrupt controller can handle up to 64 SEIs, a set comes from the +AP and is wired while a second set comes from the CPs by the mean of +MSIs. + +Required properties: + +- compatible: should be one of: + * "marvell,ap806-sei" +- reg: SEI registers location and length. +- interrupts: identifies the parent IRQ that will be triggered. +- #interrupt-cells: number of cells to define an SEI wired interrupt + coming from the AP, should be 1. The cell is the IRQ + number. +- interrupt-controller: identifies the node as an interrupt controller + for AP interrupts. +- msi-controller: identifies the node as an MSI controller for the CPs + interrupts. + +Example: + + sei: interrupt-controller@3f0200 { + compatible = "marvell,ap806-sei"; + reg = <0x3f0200 0x40>; + interrupts = ; + #interrupt-cells = <1>; + interrupt-controller; + msi-controller; + }; diff --git a/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt b/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt index a046ed374d80..8de96a4fb2d5 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt @@ -2,10 +2,12 @@ DT bindings for the R-Mobile/R-Car/RZ/G interrupt controller Required properties: -- compatible: has to be "renesas,irqc-", "renesas,irqc" as fallback. +- compatible: must be "renesas,irqc-" or "renesas,intc-ex-", + and "renesas,irqc" as fallback. Examples with soctypes are: - "renesas,irqc-r8a73a4" (R-Mobile APE6) - "renesas,irqc-r8a7743" (RZ/G1M) + - "renesas,irqc-r8a7744" (RZ/G1N) - "renesas,irqc-r8a7745" (RZ/G1E) - "renesas,irqc-r8a77470" (RZ/G1C) - "renesas,irqc-r8a7790" (R-Car H2) @@ -19,6 +21,7 @@ Required properties: - "renesas,intc-ex-r8a77965" (R-Car M3-N) - "renesas,intc-ex-r8a77970" (R-Car V3M) - "renesas,intc-ex-r8a77980" (R-Car V3H) + - "renesas,intc-ex-r8a77990" (R-Car E3) - "renesas,intc-ex-r8a77995" (R-Car D3) - #interrupt-cells: has to be <2>: an interrupt index and flags, as defined in interrupts.txt in this directory diff --git a/Documentation/devicetree/bindings/iommu/mediatek,iommu.txt b/Documentation/devicetree/bindings/iommu/mediatek,iommu.txt index df5db732138d..6922db598def 100644 --- a/Documentation/devicetree/bindings/iommu/mediatek,iommu.txt +++ b/Documentation/devicetree/bindings/iommu/mediatek,iommu.txt @@ -41,6 +41,8 @@ Required properties: - compatible : must be one of the following string: "mediatek,mt2701-m4u" for mt2701 which uses generation one m4u HW. "mediatek,mt2712-m4u" for mt2712 which uses generation two m4u HW. + "mediatek,mt7623-m4u", "mediatek,mt2701-m4u" for mt7623 which uses + generation one m4u HW. "mediatek,mt8173-m4u" for mt8173 which uses generation two m4u HW. - reg : m4u register base and size. - interrupts : the interrupt of m4u. @@ -51,7 +53,7 @@ Required properties: according to the local arbiter index, like larb0, larb1, larb2... - iommu-cells : must be 1. This is the mtk_m4u_id according to the HW. Specifies the mtk_m4u_id as defined in - dt-binding/memory/mt2701-larb-port.h for mt2701, + dt-binding/memory/mt2701-larb-port.h for mt2701, mt7623 dt-binding/memory/mt2712-larb-port.h for mt2712, and dt-binding/memory/mt8173-larb-port.h for mt8173. diff --git a/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt b/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt index c6e2d855fe13..377ee639d103 100644 --- a/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt +++ b/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt @@ -12,6 +12,7 @@ Required Properties: - "renesas,ipmmu-r8a73a4" for the R8A73A4 (R-Mobile APE6) IPMMU. - "renesas,ipmmu-r8a7743" for the R8A7743 (RZ/G1M) IPMMU. + - "renesas,ipmmu-r8a7744" for the R8A7744 (RZ/G1N) IPMMU. - "renesas,ipmmu-r8a7745" for the R8A7745 (RZ/G1E) IPMMU. - "renesas,ipmmu-r8a7790" for the R8A7790 (R-Car H2) IPMMU. - "renesas,ipmmu-r8a7791" for the R8A7791 (R-Car M2-W) IPMMU. diff --git a/Documentation/devicetree/bindings/leds/leds-an30259a.txt b/Documentation/devicetree/bindings/leds/leds-an30259a.txt new file mode 100644 index 000000000000..6ffb861083c0 --- /dev/null +++ b/Documentation/devicetree/bindings/leds/leds-an30259a.txt @@ -0,0 +1,43 @@ +* Panasonic AN30259A 3-channel LED driver + +The AN30259A is a LED controller capable of driving three LEDs independently. It supports +constant current output and sloping current output modes. The chip is connected over I2C. + +Required properties: + - compatible: Must be "panasonic,an30259a". + - reg: I2C slave address. + - #address-cells: Must be 1. + - #size-cells: Must be 0. + +Each LED is represented as a sub-node of the panasonic,an30259a node. + +Required sub-node properties: + - reg: Pin that the LED is connected to. Must be 1, 2, or 3. + +Optional sub-node properties: + - label: see Documentation/devicetree/bindings/leds/common.txt + - linux,default-trigger: see Documentation/devicetree/bindings/leds/common.txt + +Example: +led-controller@30 { + compatible = "panasonic,an30259a"; + reg = <0x30>; + #address-cells = <1>; + #size-cells = <0>; + + led@1 { + reg = <1>; + linux,default-trigger = "heartbeat"; + label = "red:indicator"; + }; + + led@2 { + reg = <2>; + label = "green:indicator"; + }; + + led@3 { + reg = <3>; + label = "blue:indicator"; + }; +}; diff --git a/Documentation/devicetree/bindings/mailbox/qcom,apcs-kpss-global.txt b/Documentation/devicetree/bindings/mailbox/qcom,apcs-kpss-global.txt index 6e8a9ab0fdae..1232fc9fc709 100644 --- a/Documentation/devicetree/bindings/mailbox/qcom,apcs-kpss-global.txt +++ b/Documentation/devicetree/bindings/mailbox/qcom,apcs-kpss-global.txt @@ -11,6 +11,7 @@ platforms. "qcom,msm8916-apcs-kpss-global", "qcom,msm8996-apcs-hmss-global" "qcom,msm8998-apcs-hmss-global" + "qcom,qcs404-apcs-apps-global" "qcom,sdm845-apss-shared" - reg: diff --git a/Documentation/devicetree/bindings/media/cedrus.txt b/Documentation/devicetree/bindings/media/cedrus.txt new file mode 100644 index 000000000000..a089a0c1ff05 --- /dev/null +++ b/Documentation/devicetree/bindings/media/cedrus.txt @@ -0,0 +1,54 @@ +Device-tree bindings for the VPU found in Allwinner SoCs, referred to as the +Video Engine (VE) in Allwinner literature. + +The VPU can only access the first 256 MiB of DRAM, that are DMA-mapped starting +from the DRAM base. This requires specific memory allocation and handling. + +Required properties: +- compatible : must be one of the following compatibles: + - "allwinner,sun4i-a10-video-engine" + - "allwinner,sun5i-a13-video-engine" + - "allwinner,sun7i-a20-video-engine" + - "allwinner,sun8i-a33-video-engine" + - "allwinner,sun8i-h3-video-engine" +- reg : register base and length of VE; +- clocks : list of clock specifiers, corresponding to entries in + the clock-names property; +- clock-names : should contain "ahb", "mod" and "ram" entries; +- resets : phandle for reset; +- interrupts : VE interrupt number; +- allwinner,sram : SRAM region to use with the VE. + +Optional properties: +- memory-region : CMA pool to use for buffers allocation instead of the + default CMA pool. + +Example: + +reserved-memory { + #address-cells = <1>; + #size-cells = <1>; + ranges; + + /* Address must be kept in the lower 256 MiBs of DRAM for VE. */ + cma_pool: cma@4a000000 { + compatible = "shared-dma-pool"; + size = <0x6000000>; + alloc-ranges = <0x4a000000 0x6000000>; + reusable; + linux,cma-default; + }; +}; + +video-codec@1c0e000 { + compatible = "allwinner,sun7i-a20-video-engine"; + reg = <0x01c0e000 0x1000>; + + clocks = <&ccu CLK_AHB_VE>, <&ccu CLK_VE>, + <&ccu CLK_DRAM_VE>; + clock-names = "ahb", "mod", "ram"; + + resets = <&ccu RST_VE>; + interrupts = ; + allwinner,sram = <&ve_sram 1>; +}; diff --git a/Documentation/devicetree/bindings/media/fsl-pxp.txt b/Documentation/devicetree/bindings/media/fsl-pxp.txt new file mode 100644 index 000000000000..2477e7f87381 --- /dev/null +++ b/Documentation/devicetree/bindings/media/fsl-pxp.txt @@ -0,0 +1,26 @@ +Freescale Pixel Pipeline +======================== + +The Pixel Pipeline (PXP) is a memory-to-memory graphics processing engine +that supports scaling, colorspace conversion, alpha blending, rotation, and +pixel conversion via lookup table. Different versions are present on various +i.MX SoCs from i.MX23 to i.MX7. + +Required properties: +- compatible: should be "fsl,-pxp", where SoC can be one of imx23, imx28, + imx6dl, imx6sl, imx6ul, imx6sx, imx6ull, or imx7d. +- reg: the register base and size for the device registers +- interrupts: the PXP interrupt, two interrupts for imx6ull and imx7d. +- clock-names: should be "axi" +- clocks: the PXP AXI clock + +Example: + +pxp@21cc000 { + compatible = "fsl,imx6ull-pxp"; + reg = <0x021cc000 0x4000>; + interrupts = , + ; + clock-names = "axi"; + clocks = <&clks IMX6UL_CLK_PXP>; +}; diff --git a/Documentation/devicetree/bindings/media/i2c/adv748x.txt b/Documentation/devicetree/bindings/media/i2c/adv748x.txt index 21ffb5ed8183..5dddc95f9cc4 100644 --- a/Documentation/devicetree/bindings/media/i2c/adv748x.txt +++ b/Documentation/devicetree/bindings/media/i2c/adv748x.txt @@ -10,7 +10,11 @@ Required Properties: - "adi,adv7481" for the ADV7481 - "adi,adv7482" for the ADV7482 - - reg: I2C slave address + - reg: I2C slave addresses + The ADV748x has up to twelve 256-byte maps that can be accessed via the + main I2C ports. Each map has it own I2C address and acts as a standard + slave device on the I2C bus. The main address is mandatory, others are + optional and remain at default values if not specified. Optional Properties: @@ -18,6 +22,11 @@ Optional Properties: "intrq3". All interrupts are optional. The "intrq3" interrupt is only available on the adv7481 - interrupts: Specify the interrupt lines for the ADV748x + - reg-names : Names of maps with programmable addresses. + It shall contain all maps needing a non-default address. + Possible map names are: + "main", "dpll", "cp", "hdmi", "edid", "repeater", + "infoframe", "cbus", "cec", "sdp", "txa", "txb" The device node must contain one 'port' child node per device input and output port, in accordance with the video interface bindings defined in @@ -47,7 +56,10 @@ Example: video-receiver@70 { compatible = "adi,adv7482"; - reg = <0x70>; + reg = <0x70 0x71 0x72 0x73 0x74 0x75 + 0x60 0x61 0x62 0x63 0x64 0x65>; + reg-names = "main", "dpll", "cp", "hdmi", "edid", "repeater", + "infoframe", "cbus", "cec", "sdp", "txa", "txb"; #address-cells = <1>; #size-cells = <0>; @@ -73,7 +85,7 @@ Example: }; }; - port@10 { + port@a { reg = <10>; adv7482_txa: endpoint { @@ -83,7 +95,7 @@ Example: }; }; - port@11 { + port@b { reg = <11>; adv7482_txb: endpoint { diff --git a/Documentation/devicetree/bindings/media/i2c/adv7604.txt b/Documentation/devicetree/bindings/media/i2c/adv7604.txt index dcf57e7c60eb..b3e688b77a38 100644 --- a/Documentation/devicetree/bindings/media/i2c/adv7604.txt +++ b/Documentation/devicetree/bindings/media/i2c/adv7604.txt @@ -66,7 +66,7 @@ Example: * other maps will retain their default addresses. */ reg = <0x4c>, <0x66>; - reg-names "main", "edid"; + reg-names = "main", "edid"; reset-gpios = <&ioexp 0 GPIO_ACTIVE_LOW>; hpd-gpios = <&ioexp 2 GPIO_ACTIVE_HIGH>; diff --git a/Documentation/devicetree/bindings/media/i2c/dongwoon,dw9807.txt b/Documentation/devicetree/bindings/media/i2c/dongwoon,dw9807-vcm.txt similarity index 100% rename from Documentation/devicetree/bindings/media/i2c/dongwoon,dw9807.txt rename to Documentation/devicetree/bindings/media/i2c/dongwoon,dw9807-vcm.txt diff --git a/Documentation/devicetree/bindings/media/mediatek-jpeg-decoder.txt b/Documentation/devicetree/bindings/media/mediatek-jpeg-decoder.txt index 3813947b4d4f..044b11913c49 100644 --- a/Documentation/devicetree/bindings/media/mediatek-jpeg-decoder.txt +++ b/Documentation/devicetree/bindings/media/mediatek-jpeg-decoder.txt @@ -5,6 +5,7 @@ Mediatek JPEG Decoder is the JPEG decode hardware present in Mediatek SoCs Required properties: - compatible : must be one of the following string: "mediatek,mt8173-jpgdec" + "mediatek,mt7623-jpgdec", "mediatek,mt2701-jpgdec" "mediatek,mt2701-jpgdec" - reg : physical base address of the jpeg decoder registers and length of memory mapped region. diff --git a/Documentation/devicetree/bindings/media/rcar_vin.txt b/Documentation/devicetree/bindings/media/rcar_vin.txt index 2f420050d57f..d329a4e8ac58 100644 --- a/Documentation/devicetree/bindings/media/rcar_vin.txt +++ b/Documentation/devicetree/bindings/media/rcar_vin.txt @@ -11,6 +11,7 @@ on Gen3 platforms to a CSI-2 receiver. - compatible: Must be one or more of the following - "renesas,vin-r8a7743" for the R8A7743 device + - "renesas,vin-r8a7744" for the R8A7744 device - "renesas,vin-r8a7745" for the R8A7745 device - "renesas,vin-r8a7778" for the R8A7778 device - "renesas,vin-r8a7779" for the R8A7779 device diff --git a/Documentation/devicetree/bindings/media/renesas,ceu.txt b/Documentation/devicetree/bindings/media/renesas,ceu.txt index 8a7a616e9019..3e2a2652eb19 100644 --- a/Documentation/devicetree/bindings/media/renesas,ceu.txt +++ b/Documentation/devicetree/bindings/media/renesas,ceu.txt @@ -17,15 +17,19 @@ Required properties: The CEU supports a single parallel input and should contain a single 'port' subnode with a single 'endpoint'. Connection to input devices are modeled according to the video interfaces OF bindings specified in: -Documentation/devicetree/bindings/media/video-interfaces.txt +[1] Documentation/devicetree/bindings/media/video-interfaces.txt Optional endpoint properties applicable to parallel input bus described in the above mentioned "video-interfaces.txt" file are supported. -- hsync-active: Active state of the HSYNC signal, 0/1 for LOW/HIGH respectively. - If property is not present, default is active high. -- vsync-active: Active state of the VSYNC signal, 0/1 for LOW/HIGH respectively. - If property is not present, default is active high. +- hsync-active: See [1] for description. If property is not present, + default is active high. +- vsync-active: See [1] for description. If property is not present, + default is active high. +- bus-width: See [1] for description. Accepted values are '8' and '16'. + If property is not present, default is '8'. +- field-even-active: See [1] for description. If property is not present, + an even field is identified by a logic 0 (active-low signal). Example: diff --git a/Documentation/devicetree/bindings/media/rockchip-vpu.txt b/Documentation/devicetree/bindings/media/rockchip-vpu.txt new file mode 100644 index 000000000000..35dc464ad7c8 --- /dev/null +++ b/Documentation/devicetree/bindings/media/rockchip-vpu.txt @@ -0,0 +1,29 @@ +device-tree bindings for rockchip VPU codec + +Rockchip (Video Processing Unit) present in various Rockchip platforms, +such as RK3288 and RK3399. + +Required properties: +- compatible: value should be one of the following + "rockchip,rk3288-vpu"; + "rockchip,rk3399-vpu"; +- interrupts: encoding and decoding interrupt specifiers +- interrupt-names: should be "vepu" and "vdpu" +- clocks: phandle to VPU aclk, hclk clocks +- clock-names: should be "aclk" and "hclk" +- power-domains: phandle to power domain node +- iommus: phandle to a iommu node + +Example: +SoC-specific DT entry: + vpu: video-codec@ff9a0000 { + compatible = "rockchip,rk3288-vpu"; + reg = <0x0 0xff9a0000 0x0 0x800>; + interrupts = , + ; + interrupt-names = "vepu", "vdpu"; + clocks = <&cru ACLK_VCODEC>, <&cru HCLK_VCODEC>; + clock-names = "aclk", "hclk"; + power-domains = <&power RK3288_PD_VIDEO>; + iommus = <&vpu_mmu>; + }; diff --git a/Documentation/devicetree/bindings/media/video-interfaces.txt b/Documentation/devicetree/bindings/media/video-interfaces.txt index baf9d9756b3c..f884ada0bffc 100644 --- a/Documentation/devicetree/bindings/media/video-interfaces.txt +++ b/Documentation/devicetree/bindings/media/video-interfaces.txt @@ -100,10 +100,12 @@ Optional endpoint properties slave device (data source) by the master device (data sink). In the master mode the data source device is also the source of the synchronization signals. - bus-type: data bus type. Possible values are: - 0 - autodetect based on other properties (MIPI CSI-2 D-PHY, parallel or Bt656) 1 - MIPI CSI-2 C-PHY 2 - MIPI CSI1 3 - CCP2 + 4 - MIPI CSI-2 D-PHY + 5 - Parallel + 6 - Bt.656 - bus-width: number of data lines actively used, valid for the parallel busses. - data-shift: on the parallel data busses, if bus-width is used to specify the number of data lines, data-shift can be used to specify which data lines are diff --git a/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.txt b/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.txt index 615abdd0eb0d..e937ddd871a6 100644 --- a/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.txt +++ b/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.txt @@ -17,6 +17,7 @@ Required properties: - compatible : must be one of : "mediatek,mt2701-smi-common" "mediatek,mt2712-smi-common" + "mediatek,mt7623-smi-common", "mediatek,mt2701-smi-common" "mediatek,mt8173-smi-common" - reg : the register and size of the SMI block. - power-domains : a phandle to the power domain of this local arbiter. diff --git a/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-larb.txt b/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-larb.txt index 083155cdc2a0..94eddcae77ab 100644 --- a/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-larb.txt +++ b/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-larb.txt @@ -6,6 +6,7 @@ Required properties: - compatible : must be one of : "mediatek,mt2701-smi-larb" "mediatek,mt2712-smi-larb" + "mediatek,mt7623-smi-larb", "mediatek,mt2701-smi-larb" "mediatek,mt8173-smi-larb" - reg : the register and size of this local arbiter. - mediatek,smi : a phandle to the smi_common node. @@ -16,7 +17,7 @@ Required properties: the register. - "smi" : It's the clock for transfer data and command. -Required property for mt2701 and mt2712: +Required property for mt2701, mt2712 and mt7623: - mediatek,larb-id :the hardware id of this larb. Example: diff --git a/Documentation/devicetree/bindings/mfd/arizona.txt b/Documentation/devicetree/bindings/mfd/arizona.txt index 9b62831fdf3e..148ef621a5e5 100644 --- a/Documentation/devicetree/bindings/mfd/arizona.txt +++ b/Documentation/devicetree/bindings/mfd/arizona.txt @@ -76,7 +76,7 @@ Deprecated properties: Also see child specific device properties: Regulator - ../regulator/arizona-regulator.txt Extcon - ../extcon/extcon-arizona.txt - Sound - ../sound/arizona.txt + Sound - ../sound/wlf,arizona.txt Example: diff --git a/Documentation/devicetree/bindings/serial/atmel-usart.txt b/Documentation/devicetree/bindings/mfd/atmel-usart.txt similarity index 76% rename from Documentation/devicetree/bindings/serial/atmel-usart.txt rename to Documentation/devicetree/bindings/mfd/atmel-usart.txt index 7c0d6b2f53e4..7f0cd72f47d2 100644 --- a/Documentation/devicetree/bindings/serial/atmel-usart.txt +++ b/Documentation/devicetree/bindings/mfd/atmel-usart.txt @@ -1,6 +1,6 @@ * Atmel Universal Synchronous Asynchronous Receiver/Transmitter (USART) -Required properties: +Required properties for USART: - compatible: Should be "atmel,-usart" or "atmel,-dbgu" The compatible indicated will be the first SoC to support an additional mode or an USART new feature. @@ -11,7 +11,13 @@ Required properties: Required elements: "usart" - clocks: phandles to input clocks. -Optional properties: +Required properties for USART in SPI mode: +- #size-cells : Must be <0> +- #address-cells : Must be <1> +- cs-gpios: chipselects (internal cs not supported) +- atmel,usart-mode : Must be (found in dt-bindings/mfd/at91-usart.h) + +Optional properties in serial mode: - atmel,use-dma-rx: use of PDC or DMA for receiving data - atmel,use-dma-tx: use of PDC or DMA for transmitting data - {rts,cts,dtr,dsr,rng,dcd}-gpios: specify a GPIO for RTS/CTS/DTR/DSR/RI/DCD line respectively. @@ -62,3 +68,18 @@ Example: dma-names = "tx", "rx"; atmel,fifo-size = <32>; }; + +- SPI mode: + #include + + spi0: spi@f001c000 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "atmel,at91rm9200-usart", "atmel,at91sam9260-usart"; + atmel,usart-mode = ; + reg = <0xf001c000 0x100>; + interrupts = <12 IRQ_TYPE_LEVEL_HIGH 5>; + clocks = <&usart0_clk>; + clock-names = "usart"; + cs-gpios = <&pioB 3 0>; + }; diff --git a/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt b/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt index 3ca56fdb5ffe..a4b056761eaa 100644 --- a/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt +++ b/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt @@ -1,16 +1,17 @@ -* ROHM BD71837 Power Management Integrated Circuit bindings +* ROHM BD71837 and BD71847 Power Management Integrated Circuit bindings -BD71837MWV is a programmable Power Management IC for powering single-core, -dual-core, and quad-core SoCs such as NXP-i.MX 8M. It is optimized for -low BOM cost and compact solution footprint. It integrates 8 Buck -egulators and 7 LDOs to provide all the power rails required by the SoC and -the commonly used peripherals. +BD71837MWV and BD71847MWV are programmable Power Management ICs for powering +single-core, dual-core, and quad-core SoCs such as NXP-i.MX 8M. They are +optimized for low BOM cost and compact solution footprint. BD71837MWV +integrates 8 Buck regulators and 7 LDOs. BD71847MWV contains 6 Buck regulators +and 6 LDOs. -Datasheet for PMIC is available at: +Datasheet for BD71837 is available at: https://www.rohm.com/datasheet/BD71837MWV/bd71837mwv-e Required properties: - - compatible : Should be "rohm,bd71837". + - compatible : Should be "rohm,bd71837" for bd71837 + "rohm,bd71847" for bd71847. - reg : I2C slave address. - interrupt-parent : Phandle to the parent interrupt controller. - interrupts : The interrupt line the device is connected to. diff --git a/Documentation/devicetree/bindings/mips/mscc.txt b/Documentation/devicetree/bindings/mips/mscc.txt index ae15ec333542..bc817e984628 100644 --- a/Documentation/devicetree/bindings/mips/mscc.txt +++ b/Documentation/devicetree/bindings/mips/mscc.txt @@ -41,3 +41,19 @@ Example: compatible = "mscc,ocelot-cpu-syscon", "syscon"; reg = <0x70000000 0x2c>; }; + +o HSIO regs: + +The SoC has a few registers (HSIO) handling miscellaneous functionalities: +configuration and status of PLL5, RCOMP, SyncE, SerDes configurations and +status, SerDes muxing and a thermal sensor. + +Required properties: +- compatible: Should be "mscc,ocelot-hsio", "syscon", "simple-mfd" +- reg : Should contain registers location and length + +Example: + syscon@10d0000 { + compatible = "mscc,ocelot-hsio", "syscon", "simple-mfd"; + reg = <0x10d0000 0x10000>; + }; diff --git a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt index 6611a7c2053a..01fdc33a41d0 100644 --- a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt +++ b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt @@ -9,6 +9,25 @@ blocks that can be used to create functional hardware objects/devices such as network interfaces, crypto accelerator instances, L2 switches, etc. +For an overview of the DPAA2 architecture and fsl-mc bus see: +Documentation/networking/dpaa2/overview.rst + +As described in the above overview, all DPAA2 objects in a DPRC share the +same hardware "isolation context" and a 10-bit value called an ICID +(isolation context id) is expressed by the hardware to identify +the requester. + +The generic 'iommus' property is insufficient to describe the relationship +between ICIDs and IOMMUs, so an iommu-map property is used to define +the set of possible ICIDs under a root DPRC and how they map to +an IOMMU. + +For generic IOMMU bindings, see +Documentation/devicetree/bindings/iommu/iommu.txt. + +For arm-smmu binding, see: +Documentation/devicetree/bindings/iommu/arm,smmu.txt. + Required properties: - compatible @@ -88,14 +107,34 @@ Sub-nodes: Value type: Definition: Specifies the phandle to the PHY device node associated with the this dpmac. +Optional properties: + +- iommu-map: Maps an ICID to an IOMMU and associated iommu-specifier + data. + + The property is an arbitrary number of tuples of + (icid-base,iommu,iommu-base,length). + + Any ICID i in the interval [icid-base, icid-base + length) is + associated with the listed IOMMU, with the iommu-specifier + (i - icid-base + iommu-base). Example: + smmu: iommu@5000000 { + compatible = "arm,mmu-500"; + #iommu-cells = <1>; + stream-match-mask = <0x7C00>; + ... + }; + fsl_mc: fsl-mc@80c000000 { compatible = "fsl,qoriq-mc"; reg = <0x00000008 0x0c000000 0 0x40>, /* MC portal base */ <0x00000000 0x08340000 0 0x40000>; /* MC control reg */ msi-parent = <&its>; + /* define map for ICIDs 23-64 */ + iommu-map = <23 &smmu 23 41>; #address-cells = <3>; #size-cells = <1>; diff --git a/Documentation/devicetree/bindings/misc/lwn-bk4.txt b/Documentation/devicetree/bindings/misc/lwn-bk4.txt new file mode 100644 index 000000000000..d6a8c188c087 --- /dev/null +++ b/Documentation/devicetree/bindings/misc/lwn-bk4.txt @@ -0,0 +1,26 @@ +* Liebherr's BK4 controller external SPI + +A device which handles data acquisition from compatible industrial +peripherals. +The SPI is used for data and management purposes in both master and +slave modes. + +Required properties: + +- compatible : Should be "lwn,bk4" + +Required SPI properties: + +- reg : Should be address of the device chip select within + the controller. + +- spi-max-frequency : Maximum SPI clocking speed of device in Hz, should be + 30MHz at most for the Liebherr's BK4 external bus. + +Example: + +spidev0: spi@0 { + compatible = "lwn,bk4"; + spi-max-frequency = <30000000>; + reg = <0>; +}; diff --git a/Documentation/devicetree/bindings/mmc/arasan,sdhci.txt b/Documentation/devicetree/bindings/mmc/arasan,sdhci.txt index f6ddba31cb73..e2effe17f05e 100644 --- a/Documentation/devicetree/bindings/mmc/arasan,sdhci.txt +++ b/Documentation/devicetree/bindings/mmc/arasan,sdhci.txt @@ -15,6 +15,7 @@ Required Properties: - "arasan,sdhci-5.1": generic Arasan SDHCI 5.1 PHY - "rockchip,rk3399-sdhci-5.1", "arasan,sdhci-5.1": rk3399 eMMC PHY For this device it is strongly suggested to include arasan,soc-ctl-syscon. + - "ti,am654-sdhci-5.1", "arasan,sdhci-5.1": TI AM654 MMC PHY - reg: From mmc bindings: Register location and length. - clocks: From clock bindings: Handles to clock inputs. - clock-names: From clock bindings: Tuple including "clk_xin" and "clk_ahb" diff --git a/Documentation/devicetree/bindings/mmc/jz4740.txt b/Documentation/devicetree/bindings/mmc/jz4740.txt index 7cd8c432d7c8..8a6f87f13114 100644 --- a/Documentation/devicetree/bindings/mmc/jz4740.txt +++ b/Documentation/devicetree/bindings/mmc/jz4740.txt @@ -7,6 +7,7 @@ described in mmc.txt. Required properties: - compatible: Should be one of the following: - "ingenic,jz4740-mmc" for the JZ4740 + - "ingenic,jz4725b-mmc" for the JZ4725B - "ingenic,jz4780-mmc" for the JZ4780 - reg: Should contain the MMC controller registers location and length. - interrupts: Should contain the interrupt specifier of the MMC controller. diff --git a/Documentation/devicetree/bindings/mmc/mmci.txt b/Documentation/devicetree/bindings/mmc/mmci.txt index 03796cf2d3e7..6d3c626e017d 100644 --- a/Documentation/devicetree/bindings/mmc/mmci.txt +++ b/Documentation/devicetree/bindings/mmc/mmci.txt @@ -15,8 +15,11 @@ Required properties: Optional properties: - arm,primecell-periphid : contains the PrimeCell Peripheral ID, it overrides the ID provided by the HW +- resets : phandle to internal reset line. + Should be defined for sdmmc variant. - vqmmc-supply : phandle to the regulator device tree node, mentioned as the VCCQ/VDD_IO supply in the eMMC/SD specs. +specific for ux500 variant: - st,sig-dir-dat0 : bus signal direction pin used for DAT[0]. - st,sig-dir-dat2 : bus signal direction pin used for DAT[2]. - st,sig-dir-dat31 : bus signal direction pin used for DAT[3] and DAT[1]. @@ -24,6 +27,14 @@ Optional properties: - st,sig-dir-cmd : cmd signal direction pin used for CMD. - st,sig-pin-fbclk : feedback clock signal pin used. +specific for sdmmc variant: +- st,sig-dir : signal direction polarity used for cmd, dat0 dat123. +- st,neg-edge : data & command phase relation, generated on + sd clock falling edge. +- st,use-ckin : use ckin pin from an external driver to sample + the receive data (example: with voltage + switch transceiver). + Deprecated properties: - mmc-cap-mmc-highspeed : indicates whether MMC is high speed capable. - mmc-cap-sd-highspeed : indicates whether SD is high speed capable. diff --git a/Documentation/devicetree/bindings/mmc/mtk-sd.txt b/Documentation/devicetree/bindings/mmc/mtk-sd.txt index f33467a54a05..f5bcda3980cc 100644 --- a/Documentation/devicetree/bindings/mmc/mtk-sd.txt +++ b/Documentation/devicetree/bindings/mmc/mtk-sd.txt @@ -10,6 +10,7 @@ Required properties: - compatible: value should be either of the following. "mediatek,mt8135-mmc": for mmc host ip compatible with mt8135 "mediatek,mt8173-mmc": for mmc host ip compatible with mt8173 + "mediatek,mt8183-mmc": for mmc host ip compatible with mt8183 "mediatek,mt2701-mmc": for mmc host ip compatible with mt2701 "mediatek,mt2712-mmc": for mmc host ip compatible with mt2712 "mediatek,mt7622-mmc": for MT7622 SoC @@ -22,6 +23,7 @@ Required properties: "source" - source clock (required) "hclk" - HCLK which used for host (required) "source_cg" - independent source clock gate (required for MT2712) + "bus_clk" - bus clock used for internal register access (required for MT2712 MSDC0/3) - pinctrl-names: should be "default", "state_uhs" - pinctrl-0: should contain default/high speed pin ctrl - pinctrl-1: should contain uhs mode pin ctrl diff --git a/Documentation/devicetree/bindings/mmc/nvidia,tegra20-sdhci.txt b/Documentation/devicetree/bindings/mmc/nvidia,tegra20-sdhci.txt index 9bce57862ed6..32b4b4e41923 100644 --- a/Documentation/devicetree/bindings/mmc/nvidia,tegra20-sdhci.txt +++ b/Documentation/devicetree/bindings/mmc/nvidia,tegra20-sdhci.txt @@ -38,3 +38,75 @@ sdhci@c8000200 { power-gpios = <&gpio 155 0>; /* gpio PT3 */ bus-width = <8>; }; + +Optional properties for Tegra210 and Tegra186: +- pinctrl-names, pinctrl-0, pinctrl-1 : Specify pad voltage + configurations. Valid pinctrl-names are "sdmmc-3v3" and "sdmmc-1v8" + for controllers supporting multiple voltage levels. The order of names + should correspond to the pin configuration states in pinctrl-0 and + pinctrl-1. +- nvidia,only-1-8-v : The presence of this property indicates that the + controller operates at a 1.8 V fixed I/O voltage. +- nvidia,pad-autocal-pull-up-offset-3v3, + nvidia,pad-autocal-pull-down-offset-3v3 : Specify drive strength + calibration offsets for 3.3 V signaling modes. +- nvidia,pad-autocal-pull-up-offset-1v8, + nvidia,pad-autocal-pull-down-offset-1v8 : Specify drive strength + calibration offsets for 1.8 V signaling modes. +- nvidia,pad-autocal-pull-up-offset-3v3-timeout, + nvidia,pad-autocal-pull-down-offset-3v3-timeout : Specify drive + strength used as a fallback in case the automatic calibration times + out on a 3.3 V signaling mode. +- nvidia,pad-autocal-pull-up-offset-1v8-timeout, + nvidia,pad-autocal-pull-down-offset-1v8-timeout : Specify drive + strength used as a fallback in case the automatic calibration times + out on a 1.8 V signaling mode. +- nvidia,pad-autocal-pull-up-offset-sdr104, + nvidia,pad-autocal-pull-down-offset-sdr104 : Specify drive strength + calibration offsets for SDR104 mode. +- nvidia,pad-autocal-pull-up-offset-hs400, + nvidia,pad-autocal-pull-down-offset-hs400 : Specify drive strength + calibration offsets for HS400 mode. +- nvidia,default-tap : Specify the default inbound sampling clock + trimmer value for non-tunable modes. +- nvidia,default-trim : Specify the default outbound clock trimmer + value. +- nvidia,dqs-trim : Specify DQS trim value for HS400 timing + + Notes on the pad calibration pull up and pulldown offset values: + - The property values are drive codes which are programmed into the + PD_OFFSET and PU_OFFSET sections of the + SDHCI_TEGRA_AUTO_CAL_CONFIG register. + - A higher value corresponds to higher drive strength. Please refer + to the reference manual of the SoC for correct values. + - The SDR104 and HS400 timing specific values are used in + corresponding modes if specified. + + Notes on tap and trim values: + - The values are used for compensating trace length differences + by adjusting the sampling point. + - The values are programmed to the Vendor Clock Control Register. + Please refer to the reference manual of the SoC for correct + values. + - The DQS trim values are only used on controllers which support + HS400 timing. Only SDMMC4 on Tegra210 and Tegra 186 supports + HS400. + +Example: +sdhci@700b0000 { + compatible = "nvidia,tegra210-sdhci", "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0000 0x0 0x200>; + interrupts = ; + clocks = <&tegra_car TEGRA210_CLK_SDMMC1>; + clock-names = "sdhci"; + resets = <&tegra_car 14>; + reset-names = "sdhci"; + pinctrl-names = "sdmmc-3v3", "sdmmc-1v8"; + pinctrl-0 = <&sdmmc1_3v3>; + pinctrl-1 = <&sdmmc1_1v8>; + nvidia,pad-autocal-pull-up-offset-3v3 = <0x00>; + nvidia,pad-autocal-pull-down-offset-3v3 = <0x7d>; + nvidia,pad-autocal-pull-up-offset-1v8 = <0x7b>; + nvidia,pad-autocal-pull-down-offset-1v8 = <0x7b>; + status = "disabled"; +}; diff --git a/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt b/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt index 5ff1e12c655a..c064af5838aa 100644 --- a/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt +++ b/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt @@ -12,6 +12,7 @@ Required properties: - "renesas,mmcif-r8a73a4" for the MMCIF found in r8a73a4 SoCs - "renesas,mmcif-r8a7740" for the MMCIF found in r8a7740 SoCs - "renesas,mmcif-r8a7743" for the MMCIF found in r8a7743 SoCs + - "renesas,mmcif-r8a7744" for the MMCIF found in r8a7744 SoCs - "renesas,mmcif-r8a7745" for the MMCIF found in r8a7745 SoCs - "renesas,mmcif-r8a7778" for the MMCIF found in r8a7778 SoCs - "renesas,mmcif-r8a7790" for the MMCIF found in r8a7790 SoCs @@ -23,7 +24,8 @@ Required properties: - interrupts: Some SoCs have only 1 shared interrupt, while others have either 2 or 3 individual interrupts (error, int, card detect). Below is the number of interrupts for each SoC: - 1: r8a73a4, r8a7743, r8a7745, r8a7778, r8a7790, r8a7791, r8a7793, r8a7794 + 1: r8a73a4, r8a7743, r8a7744, r8a7745, r8a7778, r8a7790, r8a7791, r8a7793, + r8a7794 2: r8a7740, sh73a0 3: r7s72100 diff --git a/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt b/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt new file mode 100644 index 000000000000..45c9978aad7b --- /dev/null +++ b/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt @@ -0,0 +1,41 @@ +* Spreadtrum SDHCI controller (sdhci-sprd) + +The Secure Digital (SD) Host controller on Spreadtrum SoCs provides an interface +for MMC, SD and SDIO types of cards. + +This file documents differences between the core properties in mmc.txt +and the properties used by the sdhci-sprd driver. + +Required properties: +- compatible: Should contain "sprd,sdhci-r11". +- reg: physical base address of the controller and length. +- interrupts: Interrupts used by the SDHCI controller. +- clocks: Should contain phandle for the clock feeding the SDHCI controller +- clock-names: Should contain the following: + "sdio" - SDIO source clock (required) + "enable" - gate clock which used for enabling/disabling the device (required) + +Optional properties: +- assigned-clocks: the same with "sdio" clock +- assigned-clock-parents: the default parent of "sdio" clock + +Examples: + +sdio0: sdio@20600000 { + compatible = "sprd,sdhci-r11"; + reg = <0 0x20600000 0 0x1000>; + interrupts = ; + + clock-names = "sdio", "enable"; + clocks = <&ap_clk CLK_EMMC_2X>, + <&apahb_gate CLK_EMMC_EB>; + assigned-clocks = <&ap_clk CLK_EMMC_2X>; + assigned-clock-parents = <&rpll CLK_RPLL_390M>; + + bus-width = <8>; + non-removable; + no-sdio; + no-sd; + cap-mmc-hw-reset; + status = "okay"; +}; diff --git a/Documentation/devicetree/bindings/mmc/tmio_mmc.txt b/Documentation/devicetree/bindings/mmc/tmio_mmc.txt index c434200d19d5..27f2eab2981d 100644 --- a/Documentation/devicetree/bindings/mmc/tmio_mmc.txt +++ b/Documentation/devicetree/bindings/mmc/tmio_mmc.txt @@ -16,7 +16,11 @@ Required properties: "renesas,sdhi-r8a73a4" - SDHI IP on R8A73A4 SoC "renesas,sdhi-r8a7740" - SDHI IP on R8A7740 SoC "renesas,sdhi-r8a7743" - SDHI IP on R8A7743 SoC + "renesas,sdhi-r8a7744" - SDHI IP on R8A7744 SoC "renesas,sdhi-r8a7745" - SDHI IP on R8A7745 SoC + "renesas,sdhi-r8a774a1" - SDHI IP on R8A774A1 SoC + "renesas,sdhi-r8a77470" - SDHI IP on R8A77470 SoC + "renesas,sdhi-mmc-r8a77470" - SDHI/MMC IP on R8A77470 SoC "renesas,sdhi-r8a7778" - SDHI IP on R8A7778 SoC "renesas,sdhi-r8a7779" - SDHI IP on R8A7779 SoC "renesas,sdhi-r8a7790" - SDHI IP on R8A7790 SoC @@ -27,14 +31,16 @@ Required properties: "renesas,sdhi-r8a7795" - SDHI IP on R8A7795 SoC "renesas,sdhi-r8a7796" - SDHI IP on R8A7796 SoC "renesas,sdhi-r8a77965" - SDHI IP on R8A77965 SoC + "renesas,sdhi-r8a77970" - SDHI IP on R8A77970 SoC "renesas,sdhi-r8a77980" - SDHI IP on R8A77980 SoC "renesas,sdhi-r8a77990" - SDHI IP on R8A77990 SoC "renesas,sdhi-r8a77995" - SDHI IP on R8A77995 SoC "renesas,sdhi-shmobile" - a generic sh-mobile SDHI controller "renesas,rcar-gen1-sdhi" - a generic R-Car Gen1 SDHI controller - "renesas,rcar-gen2-sdhi" - a generic R-Car Gen2 or RZ/G1 + "renesas,rcar-gen2-sdhi" - a generic R-Car Gen2 and RZ/G1 SDHI + (not SDHI/MMC) controller + "renesas,rcar-gen3-sdhi" - a generic R-Car Gen3 or RZ/G2 SDHI controller - "renesas,rcar-gen3-sdhi" - a generic R-Car Gen3 SDHI controller When compatible with the generic version, nodes must list diff --git a/Documentation/devicetree/bindings/mmc/uniphier-sd.txt b/Documentation/devicetree/bindings/mmc/uniphier-sd.txt new file mode 100644 index 000000000000..e1d658755722 --- /dev/null +++ b/Documentation/devicetree/bindings/mmc/uniphier-sd.txt @@ -0,0 +1,55 @@ +UniPhier SD/eMMC controller + +Required properties: +- compatible: should be one of the following: + "socionext,uniphier-sd-v2.91" - IP version 2.91 + "socionext,uniphier-sd-v3.1" - IP version 3.1 + "socionext,uniphier-sd-v3.1.1" - IP version 3.1.1 +- reg: offset and length of the register set for the device. +- interrupts: a single interrupt specifier. +- clocks: a single clock specifier of the controller clock. +- reset-names: should contain the following: + "host" - mandatory for all versions + "bridge" - should exist only for "socionext,uniphier-sd-v2.91" + "hw" - should exist if eMMC hw reset line is available +- resets: a list of reset specifiers, corresponding to the reset-names + +Optional properties: +- pinctrl-names: if present, should contain the following: + "default" - should exist for all instances + "uhs" - should exist for SD instance with UHS support +- pinctrl-0: pin control state for the default mode +- pinctrl-1: pin control state for the UHS mode +- dma-names: should be "rx-tx" if present. + This property can exist only for "socionext,uniphier-sd-v2.91". +- dmas: a single DMA channel specifier + This property can exist only for "socionext,uniphier-sd-v2.91". +- bus-width: see mmc.txt +- cap-sd-highspeed: see mmc.txt +- cap-mmc-highspeed: see mmc.txt +- sd-uhs-sdr12: see mmc.txt +- sd-uhs-sdr25: see mmc.txt +- sd-uhs-sdr50: see mmc.txt +- cap-mmc-hw-reset: should exist if reset-names contains "hw". see mmc.txt +- non-removable: see mmc.txt + +Example: + + sd: sdhc@5a400000 { + compatible = "socionext,uniphier-sd-v2.91"; + reg = <0x5a400000 0x200>; + interrupts = <0 76 4>; + pinctrl-names = "default", "uhs"; + pinctrl-0 = <&pinctrl_sd>; + pinctrl-1 = <&pinctrl_sd_uhs>; + clocks = <&mio_clk 0>; + reset-names = "host", "bridge"; + resets = <&mio_rst 0>, <&mio_rst 3>; + dma-names = "rx-tx"; + dmas = <&dmac 4>; + bus-width = <4>; + cap-sd-highspeed; + sd-uhs-sdr12; + sd-uhs-sdr25; + sd-uhs-sdr50; + }; diff --git a/Documentation/devicetree/bindings/net/brcm,unimac-mdio.txt b/Documentation/devicetree/bindings/net/brcm,unimac-mdio.txt index 4648948f7c3b..e15589f47787 100644 --- a/Documentation/devicetree/bindings/net/brcm,unimac-mdio.txt +++ b/Documentation/devicetree/bindings/net/brcm,unimac-mdio.txt @@ -19,6 +19,9 @@ Optional properties: - interrupt-names: must be "mdio_done_error" when there is a share interrupt fed to this hardware block, or must be "mdio_done" for the first interrupt and "mdio_error" for the second when there are separate interrupts +- clocks: A reference to the clock supplying the MDIO bus controller +- clock-frequency: the MDIO bus clock that must be output by the MDIO bus + hardware, if absent, the default hardware values are used Child nodes of this MDIO bus controller node are standard Ethernet PHY device nodes as described in Documentation/devicetree/bindings/net/phy.txt diff --git a/Documentation/devicetree/bindings/net/can/rcar_can.txt b/Documentation/devicetree/bindings/net/can/rcar_can.txt index 94a7f33ac5e9..cc4372842bf3 100644 --- a/Documentation/devicetree/bindings/net/can/rcar_can.txt +++ b/Documentation/devicetree/bindings/net/can/rcar_can.txt @@ -3,6 +3,7 @@ Renesas R-Car CAN controller Device Tree Bindings Required properties: - compatible: "renesas,can-r8a7743" if CAN controller is a part of R8A7743 SoC. + "renesas,can-r8a7744" if CAN controller is a part of R8A7744 SoC. "renesas,can-r8a7745" if CAN controller is a part of R8A7745 SoC. "renesas,can-r8a7778" if CAN controller is a part of R8A7778 SoC. "renesas,can-r8a7779" if CAN controller is a part of R8A7779 SoC. diff --git a/Documentation/devicetree/bindings/net/dsa/b53.txt b/Documentation/devicetree/bindings/net/dsa/b53.txt index 1811e1972a7a..5201bc15fdd6 100644 --- a/Documentation/devicetree/bindings/net/dsa/b53.txt +++ b/Documentation/devicetree/bindings/net/dsa/b53.txt @@ -46,6 +46,42 @@ Required properties: "brcm,bcm6328-switch" "brcm,bcm6368-switch" and the mandatory "brcm,bcm63xx-switch" +Required properties for BCM585xx/586xx/88312 SoCs: + + - reg: a total of 3 register base addresses, the first one must be the + Switch Register Access block base, the second is the port 5/4 mux + configuration register and the third one is the SGMII configuration + and status register base address. + + - interrupts: a total of 13 interrupts must be specified, in the following + order: port 0-5, 7-8 link status change, then the integrated PHY interrupt, + then the timestamping interrupt and the sleep timer interrupts for ports + 5,7,8. + +Optional properties for BCM585xx/586xx/88312 SoCs: + + - reg-names: a total of 3 names matching the 3 base register address, must + be in the following order: + "srab" + "mux_config" + "sgmii_config" + + - interrupt-names: a total of 13 names matching the 13 interrupts specified + must be in the following order: + "link_state_p0" + "link_state_p1" + "link_state_p2" + "link_state_p3" + "link_state_p4" + "link_state_p5" + "link_state_p7" + "link_state_p8" + "phy" + "ts" + "imp_sleep_timer_p5" + "imp_sleep_timer_p7" + "imp_sleep_timer_p8" + See Documentation/devicetree/bindings/net/dsa/dsa.txt for a list of additional required and optional properties. diff --git a/Documentation/devicetree/bindings/net/dsa/lantiq-gswip.txt b/Documentation/devicetree/bindings/net/dsa/lantiq-gswip.txt new file mode 100644 index 000000000000..886cbe8ffb38 --- /dev/null +++ b/Documentation/devicetree/bindings/net/dsa/lantiq-gswip.txt @@ -0,0 +1,143 @@ +Lantiq GSWIP Ethernet switches +================================== + +Required properties for GSWIP core: + +- compatible : "lantiq,xrx200-gswip" for the embedded GSWIP in the + xRX200 SoC +- reg : memory range of the GSWIP core registers + : memory range of the GSWIP MDIO registers + : memory range of the GSWIP MII registers + +See Documentation/devicetree/bindings/net/dsa/dsa.txt for a list of +additional required and optional properties. + + +Required properties for MDIO bus: +- compatible : "lantiq,xrx200-mdio" for the MDIO bus inside the GSWIP + core of the xRX200 SoC and the PHYs connected to it. + +See Documentation/devicetree/bindings/net/mdio.txt for a list of additional +required and optional properties. + + +Required properties for GPHY firmware loading: +- compatible : "lantiq,xrx200-gphy-fw", "lantiq,gphy-fw" + "lantiq,xrx300-gphy-fw", "lantiq,gphy-fw" + "lantiq,xrx330-gphy-fw", "lantiq,gphy-fw" + for the loading of the firmware into the embedded + GPHY core of the SoC. +- lantiq,rcu : reference to the rcu syscon + +The GPHY firmware loader has a list of GPHY entries, one for each +embedded GPHY + +- reg : Offset of the GPHY firmware register in the RCU + register range +- resets : list of resets of the embedded GPHY +- reset-names : list of names of the resets + +Example: + +Ethernet switch on the VRX200 SoC: + +switch@e108000 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "lantiq,xrx200-gswip"; + reg = < 0xe108000 0x3100 /* switch */ + 0xe10b100 0xd8 /* mdio */ + 0xe10b1d8 0x130 /* mii */ + >; + dsa,member = <0 0>; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + port@0 { + reg = <0>; + label = "lan3"; + phy-mode = "rgmii"; + phy-handle = <&phy0>; + }; + + port@1 { + reg = <1>; + label = "lan4"; + phy-mode = "rgmii"; + phy-handle = <&phy1>; + }; + + port@2 { + reg = <2>; + label = "lan2"; + phy-mode = "internal"; + phy-handle = <&phy11>; + }; + + port@4 { + reg = <4>; + label = "lan1"; + phy-mode = "internal"; + phy-handle = <&phy13>; + }; + + port@5 { + reg = <5>; + label = "wan"; + phy-mode = "rgmii"; + phy-handle = <&phy5>; + }; + + port@6 { + reg = <0x6>; + label = "cpu"; + ethernet = <ð0>; + }; + }; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + compatible = "lantiq,xrx200-mdio"; + reg = <0>; + + phy0: ethernet-phy@0 { + reg = <0x0>; + }; + phy1: ethernet-phy@1 { + reg = <0x1>; + }; + phy5: ethernet-phy@5 { + reg = <0x5>; + }; + phy11: ethernet-phy@11 { + reg = <0x11>; + }; + phy13: ethernet-phy@13 { + reg = <0x13>; + }; + }; + + gphy-fw { + compatible = "lantiq,xrx200-gphy-fw", "lantiq,gphy-fw"; + lantiq,rcu = <&rcu0>; + #address-cells = <1>; + #size-cells = <0>; + + gphy@20 { + reg = <0x20>; + + resets = <&reset0 31 30>; + reset-names = "gphy"; + }; + + gphy@68 { + reg = <0x68>; + + resets = <&reset0 29 28>; + reset-names = "gphy"; + }; + }; +}; diff --git a/Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt b/Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt new file mode 100644 index 000000000000..5ff5e68bbbb6 --- /dev/null +++ b/Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt @@ -0,0 +1,21 @@ +Lantiq xRX200 GSWIP PMAC Ethernet driver +================================== + +Required properties: + +- compatible : "lantiq,xrx200-net" for the PMAC of the embedded + : GSWIP in the xXR200 +- reg : memory range of the PMAC core inside of the GSWIP core +- interrupts : TX and RX DMA interrupts. Use interrupt-names "tx" for + : the TX interrupt and "rx" for the RX interrupt. + +Example: + +ethernet@e10b308 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "lantiq,xrx200-net"; + reg = <0xe10b308 0xcf8>; + interrupts = <73>, <72>; + interrupt-names = "tx", "rx"; +}; diff --git a/Documentation/devicetree/bindings/net/marvell,prestera.txt b/Documentation/devicetree/bindings/net/marvell,prestera.txt index c329608fa887..83370ebf5b89 100644 --- a/Documentation/devicetree/bindings/net/marvell,prestera.txt +++ b/Documentation/devicetree/bindings/net/marvell,prestera.txt @@ -2,7 +2,7 @@ Marvell Prestera Switch Chip bindings ------------------------------------- Required properties: -- compatible: one of the following +- compatible: must be "marvell,prestera" and one of the following "marvell,prestera-98dx3236", "marvell,prestera-98dx3336", "marvell,prestera-98dx4251", @@ -21,7 +21,7 @@ switch { ranges = <0 MBUS_ID(0x03, 0x00) 0 0x100000>; packet-processor@0 { - compatible = "marvell,prestera-98dx3236"; + compatible = "marvell,prestera-98dx3236", "marvell,prestera"; reg = <0 0x4000000>; interrupts = <33>, <34>, <35>; dfx = <&dfx>; diff --git a/Documentation/devicetree/bindings/net/marvell-pp2.txt b/Documentation/devicetree/bindings/net/marvell-pp2.txt index fc019df0d863..b78397669320 100644 --- a/Documentation/devicetree/bindings/net/marvell-pp2.txt +++ b/Documentation/devicetree/bindings/net/marvell-pp2.txt @@ -31,7 +31,7 @@ required. Required properties (port): -- interrupts: interrupt for the port +- interrupts: interrupt(s) for the port - port-id: ID of the port from the MAC point of view - gop-port-id: only for marvell,armada-7k-pp2, ID of the port from the GOP (Group Of Ports) point of view. This ID is used to index the @@ -43,10 +43,12 @@ Optional properties (port): - marvell,loopback: port is loopback mode - phy: a phandle to a phy node defining the PHY address (as the reg property, a single integer). -- interrupt-names: if more than a single interrupt for rx is given, must - be the name associated to the interrupts listed. Valid - names are: "tx-cpu0", "tx-cpu1", "tx-cpu2", "tx-cpu3", - "rx-shared", "link". +- interrupt-names: if more than a single interrupt for is given, must be the + name associated to the interrupts listed. Valid names are: + "hifX", with X in [0..8], and "link". The names "tx-cpu0", + "tx-cpu1", "tx-cpu2", "tx-cpu3" and "rx-shared" are supported + for backward compatibility but shouldn't be used for new + additions. - marvell,system-controller: a phandle to the system controller. Example for marvell,armada-375-pp2: @@ -89,9 +91,14 @@ cpm_ethernet: ethernet@0 { , , , - ; - interrupt-names = "tx-cpu0", "tx-cpu1", "tx-cpu2", - "tx-cpu3", "rx-shared"; + , + , + , + , + , + ; + interrupt-names = "hif0", "hif1", "hif2", "hif3", "hif4", + "hif5", "hif6", "hif7", "hif8", "link"; port-id = <0>; gop-port-id = <0>; }; @@ -101,9 +108,14 @@ cpm_ethernet: ethernet@0 { , , , - ; - interrupt-names = "tx-cpu0", "tx-cpu1", "tx-cpu2", - "tx-cpu3", "rx-shared"; + , + , + , + , + , + ; + interrupt-names = "hif0", "hif1", "hif2", "hif3", "hif4", + "hif5", "hif6", "hif7", "hif8", "link"; port-id = <1>; gop-port-id = <2>; }; @@ -113,9 +125,14 @@ cpm_ethernet: ethernet@0 { , , , - ; - interrupt-names = "tx-cpu0", "tx-cpu1", "tx-cpu2", - "tx-cpu3", "rx-shared"; + , + , + , + , + , + ; + interrupt-names = "hif0", "hif1", "hif2", "hif3", "hif4", + "hif5", "hif6", "hif7", "hif8", "link"; port-id = <2>; gop-port-id = <3>; }; diff --git a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt index e22d8cfea687..5100358177c9 100644 --- a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt +++ b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt @@ -1,4 +1,4 @@ -Micrel KSZ9021/KSZ9031 Gigabit Ethernet PHY +Micrel KSZ9021/KSZ9031/KSZ9131 Gigabit Ethernet PHY Some boards require special tuning values, particularly when it comes to clock delays. You can specify clock delay values in the PHY OF @@ -64,6 +64,32 @@ KSZ9031: Attention: The link partner must be configurable as slave otherwise no link will be established. +KSZ9131: + + All skew control options are specified in picoseconds. The increment + step is 100ps. Unlike KSZ9031, the values represent picoseccond delays. + A negative value can be assigned as rxc-skew-psec = <(-100)>;. + + Optional properties: + + Range of the value -700 to 2400, default value 0: + + - rxc-skew-psec : Skew control of RX clock pad + - txc-skew-psec : Skew control of TX clock pad + + Range of the value -700 to 800, default value 0: + + - rxdv-skew-psec : Skew control of RX CTL pad + - txen-skew-psec : Skew control of TX CTL pad + - rxd0-skew-psec : Skew control of RX data 0 pad + - rxd1-skew-psec : Skew control of RX data 1 pad + - rxd2-skew-psec : Skew control of RX data 2 pad + - rxd3-skew-psec : Skew control of RX data 3 pad + - txd0-skew-psec : Skew control of TX data 0 pad + - txd1-skew-psec : Skew control of TX data 1 pad + - txd2-skew-psec : Skew control of TX data 2 pad + - txd3-skew-psec : Skew control of TX data 3 pad + Examples: mdio { diff --git a/Documentation/devicetree/bindings/net/mscc-ocelot.txt b/Documentation/devicetree/bindings/net/mscc-ocelot.txt index 0a84711abece..9e5c17d426ce 100644 --- a/Documentation/devicetree/bindings/net/mscc-ocelot.txt +++ b/Documentation/devicetree/bindings/net/mscc-ocelot.txt @@ -12,7 +12,6 @@ Required properties: - "sys" - "rew" - "qs" - - "hsio" - "qsys" - "ana" - "portX" with X from 0 to the number of last port index available on that @@ -45,7 +44,6 @@ Example: reg = <0x1010000 0x10000>, <0x1030000 0x10000>, <0x1080000 0x100>, - <0x10d0000 0x10000>, <0x11e0000 0x100>, <0x11f0000 0x100>, <0x1200000 0x100>, @@ -59,10 +57,9 @@ Example: <0x1280000 0x100>, <0x1800000 0x80000>, <0x1880000 0x10000>; - reg-names = "sys", "rew", "qs", "hsio", "port0", - "port1", "port2", "port3", "port4", "port5", - "port6", "port7", "port8", "port9", "port10", - "qsys", "ana"; + reg-names = "sys", "rew", "qs", "port0", "port1", "port2", + "port3", "port4", "port5", "port6", "port7", + "port8", "port9", "port10", "qsys", "ana"; interrupts = <21 22>; interrupt-names = "xtr", "inj"; diff --git a/Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt b/Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt index 0eedabe22cc3..5ff37c68c941 100644 --- a/Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt +++ b/Documentation/devicetree/bindings/net/mscc-phy-vsc8531.txt @@ -1,10 +1,5 @@ * Microsemi - vsc8531 Giga bit ethernet phy -Required properties: -- compatible : Should contain phy id as "ethernet-phy-idAAAA.BBBB" - The PHY device uses the binding described in - Documentation/devicetree/bindings/net/phy.txt - Optional properties: - vsc8531,vddmac : The vddmac in mV. Allowed values is listed in the first row of Table 1 (below). @@ -27,14 +22,16 @@ Optional properties: 'vddmac'. Default value is 0%. Ref: Table:1 - Edge rate change (below). -- vsc8531,led-0-mode : LED mode. Specify how the LED[0] should behave. - Allowed values are define in - "include/dt-bindings/net/mscc-phy-vsc8531.h". - Default value is VSC8531_LINK_1000_ACTIVITY (1). -- vsc8531,led-1-mode : LED mode. Specify how the LED[1] should behave. - Allowed values are define in +- vsc8531,led-[N]-mode : LED mode. Specify how the LED[N] should behave. + N depends on the number of LEDs supported by a + PHY. + Allowed values are defined in "include/dt-bindings/net/mscc-phy-vsc8531.h". - Default value is VSC8531_LINK_100_ACTIVITY (2). + Default values are VSC8531_LINK_1000_ACTIVITY (1), + VSC8531_LINK_100_ACTIVITY (2), + VSC8531_LINK_ACTIVITY (0) and + VSC8531_DUPLEX_COLLISION (8). + Table: 1 - Edge rate change ----------------------------------------------------------------| diff --git a/Documentation/devicetree/bindings/net/renesas,ravb.txt b/Documentation/devicetree/bindings/net/renesas,ravb.txt index da249b7c406c..3530256a879c 100644 --- a/Documentation/devicetree/bindings/net/renesas,ravb.txt +++ b/Documentation/devicetree/bindings/net/renesas,ravb.txt @@ -6,6 +6,7 @@ interface contains. Required properties: - compatible: Must contain one or more of the following: - "renesas,etheravb-r8a7743" for the R8A7743 SoC. + - "renesas,etheravb-r8a7744" for the R8A7744 SoC. - "renesas,etheravb-r8a7745" for the R8A7745 SoC. - "renesas,etheravb-r8a77470" for the R8A77470 SoC. - "renesas,etheravb-r8a7790" for the R8A7790 SoC. diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.txt b/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.txt index 7fd4e8ce4149..2196d1ab3c8c 100644 --- a/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.txt +++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.txt @@ -56,6 +56,11 @@ Optional properties: the length can vary between hw versions. - -supply: handle to the regulator device tree node optional "supply-name" is "vdd-0.8-cx-mx". +- memory-region: + Usage: optional + Value type: + Definition: reference to the reserved-memory for the msa region + used by the wifi firmware running in Q6. Example (to supply the calibration data alone): @@ -149,4 +154,5 @@ wifi@18000000 { <0 140 0 /* CE10 */ >, <0 141 0 /* CE11 */ >; vdd-0.8-cx-mx-supply = <&pm8998_l5>; + memory-region = <&wifi_msa_mem>; }; diff --git a/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt b/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt index e319fe5e205a..99c4ba6a3f61 100644 --- a/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt +++ b/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt @@ -7,6 +7,7 @@ Required properties: "allwinner,sun8i-a83t-sid" "allwinner,sun8i-h3-sid" "allwinner,sun50i-a64-sid" + "allwinner,sun50i-h5-sid" - reg: Should contain registers location and length diff --git a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt index cb33421184a0..f37494d5a7be 100644 --- a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt +++ b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt @@ -50,6 +50,7 @@ Additional required properties for imx7d-pcie: - reset-names: Must contain the following entires: - "pciephy" - "apps" + - "turnoff" Example: diff --git a/Documentation/devicetree/bindings/pci/pci-keystone.txt b/Documentation/devicetree/bindings/pci/pci-keystone.txt index 4dd17de549a7..2030ee0dc4f9 100644 --- a/Documentation/devicetree/bindings/pci/pci-keystone.txt +++ b/Documentation/devicetree/bindings/pci/pci-keystone.txt @@ -19,6 +19,9 @@ pcie_msi_intc : Interrupt controller device node for MSI IRQ chip interrupt-cells: should be set to 1 interrupts: GIC interrupt lines connected to PCI MSI interrupt lines +ti,syscon-pcie-id : phandle to the device control module required to set device + id and vendor id. + Example: pcie_msi_intc: msi-interrupt-controller { interrupt-controller; diff --git a/Documentation/devicetree/bindings/pci/pci-rcar-gen2.txt b/Documentation/devicetree/bindings/pci/pci-rcar-gen2.txt index 9fe7e12a7bf3..b94078f58d8e 100644 --- a/Documentation/devicetree/bindings/pci/pci-rcar-gen2.txt +++ b/Documentation/devicetree/bindings/pci/pci-rcar-gen2.txt @@ -7,6 +7,7 @@ OHCI and EHCI controllers. Required properties: - compatible: "renesas,pci-r8a7743" for the R8A7743 SoC; + "renesas,pci-r8a7744" for the R8A7744 SoC; "renesas,pci-r8a7745" for the R8A7745 SoC; "renesas,pci-r8a7790" for the R8A7790 SoC; "renesas,pci-r8a7791" for the R8A7791 SoC; diff --git a/Documentation/devicetree/bindings/pci/rcar-pci.txt b/Documentation/devicetree/bindings/pci/rcar-pci.txt index a5f7fc62d10e..976ef7bfff93 100644 --- a/Documentation/devicetree/bindings/pci/rcar-pci.txt +++ b/Documentation/devicetree/bindings/pci/rcar-pci.txt @@ -2,6 +2,7 @@ Required properties: compatible: "renesas,pcie-r8a7743" for the R8A7743 SoC; + "renesas,pcie-r8a7744" for the R8A7744 SoC; "renesas,pcie-r8a7779" for the R8A7779 SoC; "renesas,pcie-r8a7790" for the R8A7790 SoC; "renesas,pcie-r8a7791" for the R8A7791 SoC; @@ -9,6 +10,7 @@ compatible: "renesas,pcie-r8a7743" for the R8A7743 SoC; "renesas,pcie-r8a7795" for the R8A7795 SoC; "renesas,pcie-r8a7796" for the R8A7796 SoC; "renesas,pcie-r8a77980" for the R8A77980 SoC; + "renesas,pcie-r8a77990" for the R8A77990 SoC; "renesas,pcie-rcar-gen2" for a generic R-Car Gen2 or RZ/G1 compatible device. "renesas,pcie-rcar-gen3" for a generic R-Car Gen3 compatible device. diff --git a/Documentation/devicetree/bindings/pci/ti-pci.txt b/Documentation/devicetree/bindings/pci/ti-pci.txt index 7f7af3044016..452fe48c4fdd 100644 --- a/Documentation/devicetree/bindings/pci/ti-pci.txt +++ b/Documentation/devicetree/bindings/pci/ti-pci.txt @@ -26,6 +26,11 @@ HOST MODE ranges, interrupt-map-mask, interrupt-map : as specified in ../designware-pcie.txt + - ti,syscon-unaligned-access: phandle to the syscon DT node. The 1st argument + should contain the register offset within syscon + and the 2nd argument should contain the bit field + for setting the bit to enable unaligned + access. DEVICE MODE =========== diff --git a/Documentation/devicetree/bindings/phy/brcm-sata-phy.txt b/Documentation/devicetree/bindings/phy/brcm-sata-phy.txt index 0aced97d8092..b640845fec67 100644 --- a/Documentation/devicetree/bindings/phy/brcm-sata-phy.txt +++ b/Documentation/devicetree/bindings/phy/brcm-sata-phy.txt @@ -8,6 +8,7 @@ Required properties: "brcm,iproc-nsp-sata-phy" "brcm,phy-sata3" "brcm,iproc-sr-sata-phy" + "brcm,bcm63138-sata-phy" - address-cells: should be 1 - size-cells: should be 0 - reg: register ranges for the PHY PCB interface diff --git a/Documentation/devicetree/bindings/phy/phy-cadence-dp.txt b/Documentation/devicetree/bindings/phy/phy-cadence-dp.txt new file mode 100644 index 000000000000..7f49fd54ebc1 --- /dev/null +++ b/Documentation/devicetree/bindings/phy/phy-cadence-dp.txt @@ -0,0 +1,30 @@ +Cadence MHDP DisplayPort SD0801 PHY binding +=========================================== + +This binding describes the Cadence SD0801 PHY hardware included with +the Cadence MHDP DisplayPort controller. + +------------------------------------------------------------------------------- +Required properties (controller (parent) node): +- compatible : Should be "cdns,dp-phy" +- reg : Defines the following sets of registers in the parent + mhdp device: + - Offset of the DPTX PHY configuration registers + - Offset of the SD0801 PHY configuration registers +- #phy-cells : from the generic PHY bindings, must be 0. + +Optional properties: +- num_lanes : Number of DisplayPort lanes to use (1, 2 or 4) +- max_bit_rate : Maximum DisplayPort link bit rate to use, in Mbps (2160, + 2430, 2700, 3240, 4320, 5400 or 8100) +------------------------------------------------------------------------------- + +Example: + dp_phy: phy@f0fb030a00 { + compatible = "cdns,dp-phy"; + reg = <0xf0 0xfb030a00 0x0 0x00000040>, + <0xf0 0xfb500000 0x0 0x00100000>; + num_lanes = <4>; + max_bit_rate = <8100>; + #phy-cells = <0>; + }; diff --git a/Documentation/devicetree/bindings/phy/phy-ocelot-serdes.txt b/Documentation/devicetree/bindings/phy/phy-ocelot-serdes.txt new file mode 100644 index 000000000000..332219860187 --- /dev/null +++ b/Documentation/devicetree/bindings/phy/phy-ocelot-serdes.txt @@ -0,0 +1,43 @@ +Microsemi Ocelot SerDes muxing driver +------------------------------------- + +On Microsemi Ocelot, there is a handful of registers in HSIO address +space for setting up the SerDes to switch port muxing. + +A SerDes X can be "muxed" to work with switch port Y or Z for example. +One specific SerDes can also be used as a PCIe interface. + +Hence, a SerDes represents an interface, be it an Ethernet or a PCIe one. + +There are two kinds of SerDes: SERDES1G supports 10/100Mbps in +half/full-duplex and 1000Mbps in full-duplex mode while SERDES6G supports +10/100Mbps in half/full-duplex and 1000/2500Mbps in full-duplex mode. + +Also, SERDES6G number (aka "macro") 0 is the only interface supporting +QSGMII. + +This is a child of the HSIO syscon ("mscc,ocelot-hsio", see +Documentation/devicetree/bindings/mips/mscc.txt) on the Microsemi Ocelot. + +Required properties: + +- compatible: should be "mscc,vsc7514-serdes" +- #phy-cells : from the generic phy bindings, must be 2. + The first number defines the input port to use for a given + SerDes macro. The second defines the macro to use. They are + defined in dt-bindings/phy/phy-ocelot-serdes.h + +Example: + + serdes: serdes { + compatible = "mscc,vsc7514-serdes"; + #phy-cells = <2>; + }; + + ethernet { + port1 { + phy-handle = <&phy_foo>; + /* Link SERDES1G_5 to port1 */ + phys = <&serdes 1 SERDES1G_5>; + }; + }; diff --git a/Documentation/devicetree/bindings/phy/phy-rockchip-inno-hdmi.txt b/Documentation/devicetree/bindings/phy/phy-rockchip-inno-hdmi.txt new file mode 100644 index 000000000000..710cccd5ee56 --- /dev/null +++ b/Documentation/devicetree/bindings/phy/phy-rockchip-inno-hdmi.txt @@ -0,0 +1,43 @@ +ROCKCHIP HDMI PHY WITH INNO IP BLOCK + +Required properties: + - compatible : should be one of the listed compatibles: + * "rockchip,rk3228-hdmi-phy", + * "rockchip,rk3328-hdmi-phy"; + - reg : Address and length of the hdmi phy control register set + - clocks : phandle + clock specifier for the phy clocks + - clock-names : string, clock name, must contain "sysclk" for system + control and register configuration, "refoclk" for crystal- + oscillator reference PLL clock input and "refpclk" for pclk- + based refeference PLL clock input. + - #clock-cells: should be 0. + - clock-output-names : shall be the name for the output clock. + - interrupts : phandle + interrupt specified for the hdmiphy interrupt + - #phy-cells : must be 0. See ./phy-bindings.txt for details. + +Optional properties for rk3328-hdmi-phy: + - nvmem-cells = phandle + nvmem specifier for the cpu-version efuse + - nvmem-cell-names : "cpu-version" to read the chip version, required + for adjustment to some frequency settings + +Example: + hdmi_phy: hdmi-phy@12030000 { + compatible = "rockchip,rk3228-hdmi-phy"; + reg = <0x12030000 0x10000>; + #phy-cells = <0>; + clocks = <&cru PCLK_HDMI_PHY>, <&xin24m>, <&cru DCLK_HDMIPHY>; + clock-names = "sysclk", "refoclk", "refpclk"; + #clock-cells = <0>; + clock-output-names = "hdmi_phy"; + status = "disabled"; + }; + +Then the PHY can be used in other nodes such as: + + hdmi: hdmi@200a0000 { + compatible = "rockchip,rk3228-dw-hdmi"; + ... + phys = <&hdmi_phy>; + phy-names = "hdmi"; + ... + }; diff --git a/Documentation/devicetree/bindings/phy/qcom-qmp-phy.txt b/Documentation/devicetree/bindings/phy/qcom-qmp-phy.txt index 0c7629e88bf3..adf20b2bdf71 100644 --- a/Documentation/devicetree/bindings/phy/qcom-qmp-phy.txt +++ b/Documentation/devicetree/bindings/phy/qcom-qmp-phy.txt @@ -10,16 +10,20 @@ Required properties: "qcom,msm8996-qmp-pcie-phy" for 14nm PCIe phy on msm8996, "qcom,msm8996-qmp-usb3-phy" for 14nm USB3 phy on msm8996, "qcom,sdm845-qmp-usb3-phy" for USB3 QMP V3 phy on sdm845, - "qcom,sdm845-qmp-usb3-uni-phy" for USB3 QMP V3 UNI phy on sdm845. + "qcom,sdm845-qmp-usb3-uni-phy" for USB3 QMP V3 UNI phy on sdm845, + "qcom,sdm845-qmp-ufs-phy" for UFS QMP phy on sdm845. - - reg: - - For "qcom,sdm845-qmp-usb3-phy": - - index 0: address and length of register set for PHY's common serdes - block. - - named register "dp_com" (using reg-names): address and length of the - DP_COM control block. - - For all others: - - offset and length of register set for PHY's common serdes block. +- reg: + - index 0: address and length of register set for PHY's common + serdes block. + - index 1: address and length of the DP_COM control block (for + "qcom,sdm845-qmp-usb3-phy" only). + +- reg-names: + - For "qcom,sdm845-qmp-usb3-phy": + - Should be: "reg-base", "dp_com" + - For all others: + - The reg-names property shouldn't be defined. - #clock-cells: must be 1 - Phy pll outputs a bunch of clocks for Tx, Rx and Pipe @@ -35,6 +39,7 @@ Required properties: "aux" for phy aux clock, "ref" for 19.2 MHz ref clk, "com_aux" for phy common block aux clock, + "ref_aux" for phy reference aux clock, For "qcom,msm8996-qmp-pcie-phy" must contain: "aux", "cfg_ahb", "ref". For "qcom,msm8996-qmp-usb3-phy" must contain: diff --git a/Documentation/devicetree/bindings/phy/rcar-gen2-phy.txt b/Documentation/devicetree/bindings/phy/rcar-gen2-phy.txt index eeb9e1874ea6..4f0879a0ca12 100644 --- a/Documentation/devicetree/bindings/phy/rcar-gen2-phy.txt +++ b/Documentation/devicetree/bindings/phy/rcar-gen2-phy.txt @@ -5,6 +5,7 @@ This file provides information on what the device node for the R-Car generation Required properties: - compatible: "renesas,usb-phy-r8a7743" if the device is a part of R8A7743 SoC. + "renesas,usb-phy-r8a7744" if the device is a part of R8A7744 SoC. "renesas,usb-phy-r8a7745" if the device is a part of R8A7745 SoC. "renesas,usb-phy-r8a7790" if the device is a part of R8A7790 SoC. "renesas,usb-phy-r8a7791" if the device is a part of R8A7791 SoC. diff --git a/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt b/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt index fb4a204da2bf..de7b5393c163 100644 --- a/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt +++ b/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt @@ -1,10 +1,12 @@ * Renesas R-Car generation 3 USB 2.0 PHY This file provides information on what the device node for the R-Car generation -3 USB 2.0 PHY contains. +3 and RZ/G2 USB 2.0 PHY contain. Required properties: -- compatible: "renesas,usb2-phy-r8a7795" if the device is a part of an R8A7795 +- compatible: "renesas,usb2-phy-r8a774a1" if the device is a part of an R8A774A1 + SoC. + "renesas,usb2-phy-r8a7795" if the device is a part of an R8A7795 SoC. "renesas,usb2-phy-r8a7796" if the device is a part of an R8A7796 SoC. @@ -14,7 +16,8 @@ Required properties: R8A77990 SoC. "renesas,usb2-phy-r8a77995" if the device is a part of an R8A77995 SoC. - "renesas,rcar-gen3-usb2-phy" for a generic R-Car Gen3 compatible device. + "renesas,rcar-gen3-usb2-phy" for a generic R-Car Gen3 or RZ/G2 + compatible device. When compatible with the generic version, nodes must list the SoC-specific version corresponding to the platform first @@ -31,6 +34,8 @@ channel as USB OTG: - interrupts: interrupt specifier for the PHY. - vbus-supply: Phandle to a regulator that provides power to the VBUS. This regulator will be managed during the PHY power on/off sequence. +- renesas,no-otg-pins: boolean, specify when a board does not provide proper + otg pins. Example (R-Car H3): diff --git a/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb3.txt b/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb3.txt index 47dd296ecead..9d9826609c2f 100644 --- a/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb3.txt +++ b/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb3.txt @@ -1,20 +1,22 @@ * Renesas R-Car generation 3 USB 3.0 PHY This file provides information on what the device node for the R-Car generation -3 USB 3.0 PHY contains. +3 and RZ/G2 USB 3.0 PHY contain. If you want to enable spread spectrum clock (ssc), you should use USB_EXTAL instead of USB3_CLK. However, if you don't want to these features, you don't need this driver. Required properties: -- compatible: "renesas,r8a7795-usb3-phy" if the device is a part of an R8A7795 +- compatible: "renesas,r8a774a1-usb3-phy" if the device is a part of an R8A774A1 + SoC. + "renesas,r8a7795-usb3-phy" if the device is a part of an R8A7795 SoC. "renesas,r8a7796-usb3-phy" if the device is a part of an R8A7796 SoC. "renesas,r8a77965-usb3-phy" if the device is a part of an R8A77965 SoC. - "renesas,rcar-gen3-usb3-phy" for a generic R-Car Gen3 compatible - device. + "renesas,rcar-gen3-usb3-phy" for a generic R-Car Gen3 or RZ/G2 + compatible device. When compatible with the generic version, nodes must list the SoC-specific version corresponding to the platform first diff --git a/Documentation/devicetree/bindings/phy/uniphier-pcie-phy.txt b/Documentation/devicetree/bindings/phy/uniphier-pcie-phy.txt new file mode 100644 index 000000000000..1889d3b89d68 --- /dev/null +++ b/Documentation/devicetree/bindings/phy/uniphier-pcie-phy.txt @@ -0,0 +1,31 @@ +Socionext UniPhier PCIe PHY bindings + +This describes the devicetree bindings for PHY interface built into +PCIe controller implemented on Socionext UniPhier SoCs. + +Required properties: +- compatible: Should contain one of the following: + "socionext,uniphier-ld20-pcie-phy" - for LD20 PHY + "socionext,uniphier-pxs3-pcie-phy" - for PXs3 PHY +- reg: Specifies offset and length of the register set for the device. +- #phy-cells: Must be zero. +- clocks: A phandle to the clock gate for PCIe glue layer including + this phy. +- resets: A phandle to the reset line for PCIe glue layer including + this phy. + +Optional properties: +- socionext,syscon: A phandle to system control to set configurations + for phy. + +Refer to phy/phy-bindings.txt for the generic PHY binding properties. + +Example: + pcie_phy: phy@66038000 { + compatible = "socionext,uniphier-ld20-pcie-phy"; + reg = <0x66038000 0x4000>; + #phy-cells = <0>; + clocks = <&sys_clk 24>; + resets = <&sys_rst 24>; + socionext,syscon = <&soc_glue>; + }; diff --git a/Documentation/devicetree/bindings/phy/uniphier-usb2-phy.txt b/Documentation/devicetree/bindings/phy/uniphier-usb2-phy.txt new file mode 100644 index 000000000000..b43b28250cc0 --- /dev/null +++ b/Documentation/devicetree/bindings/phy/uniphier-usb2-phy.txt @@ -0,0 +1,45 @@ +Socionext UniPhier USB2 PHY + +This describes the devicetree bindings for PHY interface built into +USB2 controller implemented on Socionext UniPhier SoCs. + +Pro4 SoC has both USB2 and USB3 host controllers, however, this USB3 +controller doesn't include its own High-Speed PHY. This needs to specify +USB2 PHY instead of USB3 HS-PHY. + +Required properties: +- compatible: Should contain one of the following: + "socionext,uniphier-pro4-usb2-phy" - for Pro4 SoC + "socionext,uniphier-ld11-usb2-phy" - for LD11 SoC + +Sub-nodes: +Each PHY should be represented as a sub-node. + +Sub-nodes required properties: +- #phy-cells: Should be 0. +- reg: The number of the PHY. + +Sub-nodes optional properties: +- vbus-supply: A phandle to the regulator for USB VBUS. + +Refer to phy/phy-bindings.txt for the generic PHY binding properties. + +Example: + soc-glue@5f800000 { + ... + usb-phy { + compatible = "socionext,uniphier-ld11-usb2-phy"; + usb_phy0: phy@0 { + reg = <0>; + #phy-cells = <0>; + }; + ... + }; + }; + + usb@5a800100 { + compatible = "socionext,uniphier-ehci", "generic-ehci"; + ... + phy-names = "usb"; + phys = <&usb_phy0>; + }; diff --git a/Documentation/devicetree/bindings/phy/uniphier-usb3-hsphy.txt b/Documentation/devicetree/bindings/phy/uniphier-usb3-hsphy.txt new file mode 100644 index 000000000000..e8d8086a7ae9 --- /dev/null +++ b/Documentation/devicetree/bindings/phy/uniphier-usb3-hsphy.txt @@ -0,0 +1,69 @@ +Socionext UniPhier USB3 High-Speed (HS) PHY + +This describes the devicetree bindings for PHY interfaces built into +USB3 controller implemented on Socionext UniPhier SoCs. +Although the controller includes High-Speed PHY and Super-Speed PHY, +this describes about High-Speed PHY. + +Required properties: +- compatible: Should contain one of the following: + "socionext,uniphier-pro4-usb3-hsphy" - for Pro4 SoC + "socionext,uniphier-pxs2-usb3-hsphy" - for PXs2 SoC + "socionext,uniphier-ld20-usb3-hsphy" - for LD20 SoC + "socionext,uniphier-pxs3-usb3-hsphy" - for PXs3 SoC +- reg: Specifies offset and length of the register set for the device. +- #phy-cells: Should be 0. +- clocks: A list of phandles to the clock gate for USB3 glue layer. + According to the clock-names, appropriate clocks are required. +- clock-names: Should contain the following: + "gio", "link" - for Pro4 SoC + "phy", "phy-ext", "link" - for PXs3 SoC, "phy-ext" is optional. + "phy", "link" - for others +- resets: A list of phandles to the reset control for USB3 glue layer. + According to the reset-names, appropriate resets are required. +- reset-names: Should contain the following: + "gio", "link" - for Pro4 SoC + "phy", "link" - for others + +Optional properties: +- vbus-supply: A phandle to the regulator for USB VBUS. +- nvmem-cells: Phandles to nvmem cell that contains the trimming data. + Available only for HS-PHY implemented on LD20 and PXs3, and + if unspecified, default value is used. +- nvmem-cell-names: Should be the following names, which correspond to + each nvmem-cells. + All of the 3 parameters associated with the following names are + required for each port, if any one is omitted, the trimming data + of the port will not be set at all. + "rterm", "sel_t", "hs_i" - Each cell name for phy parameters + +Refer to phy/phy-bindings.txt for the generic PHY binding properties. + +Example: + + usb-glue@65b00000 { + compatible = "socionext,uniphier-ld20-dwc3-glue", + "simple-mfd"; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0 0x65b00000 0x400>; + + usb_vbus0: regulator { + ... + }; + + usb_hsphy0: hs-phy@200 { + compatible = "socionext,uniphier-ld20-usb3-hsphy"; + reg = <0x200 0x10>; + #phy-cells = <0>; + clock-names = "link", "phy"; + clocks = <&sys_clk 14>, <&sys_clk 16>; + reset-names = "link", "phy"; + resets = <&sys_rst 14>, <&sys_rst 16>; + vbus-supply = <&usb_vbus0>; + nvmem-cell-names = "rterm", "sel_t", "hs_i"; + nvmem-cells = <&usb_rterm0>, <&usb_sel_t0>, + <&usb_hs_i0>; + }; + ... + }; diff --git a/Documentation/devicetree/bindings/phy/uniphier-usb3-ssphy.txt b/Documentation/devicetree/bindings/phy/uniphier-usb3-ssphy.txt new file mode 100644 index 000000000000..490b815445e8 --- /dev/null +++ b/Documentation/devicetree/bindings/phy/uniphier-usb3-ssphy.txt @@ -0,0 +1,57 @@ +Socionext UniPhier USB3 Super-Speed (SS) PHY + +This describes the devicetree bindings for PHY interfaces built into +USB3 controller implemented on Socionext UniPhier SoCs. +Although the controller includes High-Speed PHY and Super-Speed PHY, +this describes about Super-Speed PHY. + +Required properties: +- compatible: Should contain one of the following: + "socionext,uniphier-pro4-usb3-ssphy" - for Pro4 SoC + "socionext,uniphier-pxs2-usb3-ssphy" - for PXs2 SoC + "socionext,uniphier-ld20-usb3-ssphy" - for LD20 SoC + "socionext,uniphier-pxs3-usb3-ssphy" - for PXs3 SoC +- reg: Specifies offset and length of the register set for the device. +- #phy-cells: Should be 0. +- clocks: A list of phandles to the clock gate for USB3 glue layer. + According to the clock-names, appropriate clocks are required. +- clock-names: + "gio", "link" - for Pro4 SoC + "phy", "phy-ext", "link" - for PXs3 SoC, "phy-ext" is optional. + "phy", "link" - for others +- resets: A list of phandles to the reset control for USB3 glue layer. + According to the reset-names, appropriate resets are required. +- reset-names: + "gio", "link" - for Pro4 SoC + "phy", "link" - for others + +Optional properties: +- vbus-supply: A phandle to the regulator for USB VBUS. + +Refer to phy/phy-bindings.txt for the generic PHY binding properties. + +Example: + + usb-glue@65b00000 { + compatible = "socionext,uniphier-ld20-dwc3-glue", + "simple-mfd"; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0 0x65b00000 0x400>; + + usb_vbus0: regulator { + ... + }; + + usb_ssphy0: ss-phy@300 { + compatible = "socionext,uniphier-ld20-usb3-ssphy"; + reg = <0x300 0x10>; + #phy-cells = <0>; + clock-names = "link", "phy"; + clocks = <&sys_clk 14>, <&sys_clk 16>; + reset-names = "link", "phy"; + resets = <&sys_rst 14>, <&sys_rst 16>; + vbus-supply = <&usb_vbus0>; + }; + ... + }; diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,bcm4708-pinmux.txt b/Documentation/devicetree/bindings/pinctrl/brcm,bcm4708-pinmux.txt new file mode 100644 index 000000000000..4fa9539070cb --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/brcm,bcm4708-pinmux.txt @@ -0,0 +1,57 @@ +Broadcom Northstar pins mux controller + +Some of Northstar SoCs's pins can be used for various purposes thanks to the mux +controller. This binding allows describing mux controller and listing available +functions. They can be referenced later by other bindings to let system +configure controller correctly. + +A list of pins varies across chipsets so few bindings are available. + +Required properties: +- compatible: must be one of: + "brcm,bcm4708-pinmux" + "brcm,bcm4709-pinmux" + "brcm,bcm53012-pinmux" +- reg: iomem address range of CRU (Central Resource Unit) pin registers +- reg-names: "cru_gpio_control" - the only needed & supported reg right now + +Functions and their groups available for all chipsets: +- "spi": "spi_grp" +- "i2c": "i2c_grp" +- "pwm": "pwm0_grp", "pwm1_grp", "pwm2_grp", "pwm3_grp" +- "uart1": "uart1_grp" + +Additionally available on BCM4709 and BCM53012: +- "mdio": "mdio_grp" +- "uart2": "uart2_grp" +- "sdio": "sdio_pwr_grp", "sdio_1p8v_grp" + +For documentation of subnodes see: +Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt + +Example: + dmu@1800c000 { + compatible = "simple-bus"; + ranges = <0 0x1800c000 0x1000>; + #address-cells = <1>; + #size-cells = <1>; + + cru@100 { + compatible = "simple-bus"; + reg = <0x100 0x1a4>; + ranges; + #address-cells = <1>; + #size-cells = <1>; + + pin-controller@1c0 { + compatible = "brcm,bcm4708-pinmux"; + reg = <0x1c0 0x24>; + reg-names = "cru_gpio_control"; + + spi-pins { + function = "spi"; + groups = "spi_grp"; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/pinctrl/ingenic,pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/ingenic,pinctrl.txt index ca313a7aeaff..af20b0ec715c 100644 --- a/Documentation/devicetree/bindings/pinctrl/ingenic,pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/ingenic,pinctrl.txt @@ -20,16 +20,30 @@ Required properties: - compatible: One of: - "ingenic,jz4740-pinctrl" + - "ingenic,jz4725b-pinctrl" - "ingenic,jz4770-pinctrl" - "ingenic,jz4780-pinctrl" - reg: Address range of the pinctrl registers. -GPIO sub-nodes --------------- +Required properties for sub-nodes (GPIO chips): +----------------------------------------------- -The pinctrl node can have optional sub-nodes for the Ingenic GPIO driver; -please refer to ../gpio/ingenic,gpio.txt. + - compatible: Must contain one of: + - "ingenic,jz4740-gpio" + - "ingenic,jz4770-gpio" + - "ingenic,jz4780-gpio" + - reg: The GPIO bank number. + - interrupt-controller: Marks the device node as an interrupt controller. + - interrupts: Interrupt specifier for the controllers interrupt. + - #interrupt-cells: Should be 2. Refer to + ../interrupt-controller/interrupts.txt for more details. + - gpio-controller: Marks the device node as a GPIO controller. + - #gpio-cells: Should be 2. The first cell is the GPIO number and the second + cell specifies GPIO flags, as defined in . Only the + GPIO_ACTIVE_HIGH and GPIO_ACTIVE_LOW flags are supported. + - gpio-ranges: Range of pins managed by the GPIO controller. Refer to + ../gpio/gpio.txt for more details. Example: @@ -38,4 +52,21 @@ Example: pinctrl: pin-controller@10010000 { compatible = "ingenic,jz4740-pinctrl"; reg = <0x10010000 0x400>; + #address-cells = <1>; + #size-cells = <0>; + + gpa: gpio@0 { + compatible = "ingenic,jz4740-gpio"; + reg = <0>; + + gpio-controller; + gpio-ranges = <&pinctrl 0 0 32>; + #gpio-cells = <2>; + + interrupt-controller; + #interrupt-cells = <2>; + + interrupt-parent = <&intc>; + interrupts = <28>; + }; }; diff --git a/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt index 54ecb8ab7788..82ead40311f6 100644 --- a/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt @@ -13,6 +13,8 @@ Required properties for the root node: "amlogic,meson-gxl-aobus-pinctrl" "amlogic,meson-axg-periphs-pinctrl" "amlogic,meson-axg-aobus-pinctrl" + "amlogic,meson-g12a-periphs-pinctrl" + "amlogic,meson-g12a-aobus-pinctrl" - reg: address and size of registers controlling irq functionality === GPIO sub-nodes === diff --git a/Documentation/devicetree/bindings/pinctrl/nuvoton,npcm7xx-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/nuvoton,npcm7xx-pinctrl.txt new file mode 100644 index 000000000000..83f4bbac94bb --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/nuvoton,npcm7xx-pinctrl.txt @@ -0,0 +1,216 @@ +Nuvoton NPCM7XX Pin Controllers + +The Nuvoton BMC NPCM7XX Pin Controller multi-function routed through +the multiplexing block, Each pin supports GPIO functionality (GPIOx) +and multiple functions that directly connect the pin to different +hardware blocks. + +Required properties: +- #address-cells : should be 1. +- #size-cells : should be 1. +- compatible : "nuvoton,npcm750-pinctrl" for Poleg NPCM7XX. +- ranges : defines mapping ranges between pin controller node (parent) + to GPIO bank node (children). + +=== GPIO Bank Subnode === + +The NPCM7XX has 8 GPIO Banks each GPIO bank supports 32 GPIO. + +Required GPIO Bank subnode-properties: +- reg : specifies physical base address and size of the GPIO + bank registers. +- gpio-controller : Marks the device node as a GPIO controller. +- #gpio-cells : Must be <2>. The first cell is the gpio pin number + and the second cell is used for optional parameters. +- interrupts : contain the GPIO bank interrupt with flags for falling edge. +- gpio-ranges : defines the range of pins managed by the GPIO bank controller. + +For example, GPIO bank subnodes like the following: + gpio0: gpio@f0010000 { + gpio-controller; + #gpio-cells = <2>; + reg = <0x0 0x80>; + interrupts = ; + gpio-ranges = <&pinctrl 0 0 32>; + }; + +=== Pin Mux Subnode === + +- pin: A string containing the name of the pin + An array of strings, each string containing the name of a pin. + These pin are used for selecting pin configuration. + +The following are the list of pins available: + "GPIO0/IOX1DI", "GPIO1/IOX1LD", "GPIO2/IOX1CK", "GPIO3/IOX1D0", + "GPIO4/IOX2DI/SMB1DSDA", "GPIO5/IOX2LD/SMB1DSCL", "GPIO6/IOX2CK/SMB2DSDA", + "GPIO7/IOX2D0/SMB2DSCL", "GPIO8/LKGPO1", "GPIO9/LKGPO2", "GPIO10/IOXHLD", + "GPIO11/IOXHCK", "GPIO12/GSPICK/SMB5BSCL", "GPIO13/GSPIDO/SMB5BSDA", + "GPIO14/GSPIDI/SMB5CSCL", "GPIO15/GSPICS/SMB5CSDA", "GPIO16/LKGPO0", + "GPIO17/PSPI2DI/SMB4DEN","GPIO18/PSPI2D0/SMB4BSDA", "GPIO19/PSPI2CK/SMB4BSCL", + "GPIO20/SMB4CSDA/SMB15SDA", "GPIO21/SMB4CSCL/SMB15SCL", "GPIO22/SMB4DSDA/SMB14SDA", + "GPIO23/SMB4DSCL/SMB14SCL", "GPIO24/IOXHDO", "GPIO25/IOXHDI", "GPIO26/SMB5SDA", + "GPIO27/SMB5SCL", "GPIO28/SMB4SDA", "GPIO29/SMB4SCL", "GPIO30/SMB3SDA", + "GPIO31/SMB3SCL", "GPIO32/nSPI0CS1","SPI0D2", "SPI0D3", "GPIO37/SMB3CSDA", + "GPIO38/SMB3CSCL", "GPIO39/SMB3BSDA", "GPIO40/SMB3BSCL", "GPIO41/BSPRXD", + "GPO42/BSPTXD/STRAP11", "GPIO43/RXD1/JTMS2/BU1RXD", "GPIO44/nCTS1/JTDI2/BU1CTS", + "GPIO45/nDCD1/JTDO2", "GPIO46/nDSR1/JTCK2", "GPIO47/nRI1/JCP_RDY2", + "GPIO48/TXD2/BSPTXD", "GPIO49/RXD2/BSPRXD", "GPIO50/nCTS2", "GPO51/nRTS2/STRAP2", + "GPIO52/nDCD2", "GPO53/nDTR2_BOUT2/STRAP1", "GPIO54/nDSR2", "GPIO55/nRI2", + "GPIO56/R1RXERR", "GPIO57/R1MDC", "GPIO58/R1MDIO", "GPIO59/SMB3DSDA", + "GPIO60/SMB3DSCL", "GPO61/nDTR1_BOUT1/STRAP6", "GPO62/nRTST1/STRAP5", + "GPO63/TXD1/STRAP4", "GPIO64/FANIN0", "GPIO65/FANIN1", "GPIO66/FANIN2", + "GPIO67/FANIN3", "GPIO68/FANIN4", "GPIO69/FANIN5", "GPIO70/FANIN6", "GPIO71/FANIN7", + "GPIO72/FANIN8", "GPIO73/FANIN9", "GPIO74/FANIN10", "GPIO75/FANIN11", + "GPIO76/FANIN12", "GPIO77/FANIN13","GPIO78/FANIN14", "GPIO79/FANIN15", + "GPIO80/PWM0", "GPIO81/PWM1", "GPIO82/PWM2", "GPIO83/PWM3", "GPIO84/R2TXD0", + "GPIO85/R2TXD1", "GPIO86/R2TXEN", "GPIO87/R2RXD0", "GPIO88/R2RXD1", "GPIO89/R2CRSDV", + "GPIO90/R2RXERR", "GPIO91/R2MDC", "GPIO92/R2MDIO", "GPIO93/GA20/SMB5DSCL", + "GPIO94/nKBRST/SMB5DSDA", "GPIO95/nLRESET/nESPIRST", "GPIO96/RG1TXD0", + "GPIO97/RG1TXD1", "GPIO98/RG1TXD2", "GPIO99/RG1TXD3","GPIO100/RG1TXC", + "GPIO101/RG1TXCTL", "GPIO102/RG1RXD0", "GPIO103/RG1RXD1", "GPIO104/RG1RXD2", + "GPIO105/RG1RXD3", "GPIO106/RG1RXC", "GPIO107/RG1RXCTL", "GPIO108/RG1MDC", + "GPIO109/RG1MDIO", "GPIO110/RG2TXD0/DDRV0", "GPIO111/RG2TXD1/DDRV1", + "GPIO112/RG2TXD2/DDRV2", "GPIO113/RG2TXD3/DDRV3", "GPIO114/SMB0SCL", + "GPIO115/SMB0SDA", "GPIO116/SMB1SCL", "GPIO117/SMB1SDA", "GPIO118/SMB2SCL", + "GPIO119/SMB2SDA", "GPIO120/SMB2CSDA", "GPIO121/SMB2CSCL", "GPIO122/SMB2BSDA", + "GPIO123/SMB2BSCL", "GPIO124/SMB1CSDA", "GPIO125/SMB1CSCL","GPIO126/SMB1BSDA", + "GPIO127/SMB1BSCL", "GPIO128/SMB8SCL", "GPIO129/SMB8SDA", "GPIO130/SMB9SCL", + "GPIO131/SMB9SDA", "GPIO132/SMB10SCL", "GPIO133/SMB10SDA","GPIO134/SMB11SCL", + "GPIO135/SMB11SDA", "GPIO136/SD1DT0", "GPIO137/SD1DT1", "GPIO138/SD1DT2", + "GPIO139/SD1DT3", "GPIO140/SD1CLK", "GPIO141/SD1WP", "GPIO142/SD1CMD", + "GPIO143/SD1CD/SD1PWR", "GPIO144/PWM4", "GPIO145/PWM5", "GPIO146/PWM6", + "GPIO147/PWM7", "GPIO148/MMCDT4", "GPIO149/MMCDT5", "GPIO150/MMCDT6", + "GPIO151/MMCDT7", "GPIO152/MMCCLK", "GPIO153/MMCWP", "GPIO154/MMCCMD", + "GPIO155/nMMCCD/nMMCRST", "GPIO156/MMCDT0", "GPIO157/MMCDT1", "GPIO158/MMCDT2", + "GPIO159/MMCDT3", "GPIO160/CLKOUT/RNGOSCOUT", "GPIO161/nLFRAME/nESPICS", + "GPIO162/SERIRQ", "GPIO163/LCLK/ESPICLK", "GPIO164/LAD0/ESPI_IO0", + "GPIO165/LAD1/ESPI_IO1", "GPIO166/LAD2/ESPI_IO2", "GPIO167/LAD3/ESPI_IO3", + "GPIO168/nCLKRUN/nESPIALERT", "GPIO169/nSCIPME", "GPIO170/nSMI", "GPIO171/SMB6SCL", + "GPIO172/SMB6SDA", "GPIO173/SMB7SCL", "GPIO174/SMB7SDA", "GPIO175/PSPI1CK/FANIN19", + "GPIO176/PSPI1DO/FANIN18", "GPIO177/PSPI1DI/FANIN17", "GPIO178/R1TXD0", + "GPIO179/R1TXD1", "GPIO180/R1TXEN", "GPIO181/R1RXD0", "GPIO182/R1RXD1", + "GPIO183/SPI3CK", "GPO184/SPI3D0/STRAP9", "GPO185/SPI3D1/STRAP10", + "GPIO186/nSPI3CS0", "GPIO187/nSPI3CS1", "GPIO188/SPI3D2/nSPI3CS2", + "GPIO189/SPI3D3/nSPI3CS3", "GPIO190/nPRD_SMI", "GPIO191", "GPIO192", "GPIO193/R1CRSDV", + "GPIO194/SMB0BSCL", "GPIO195/SMB0BSDA", "GPIO196/SMB0CSCL", "GPIO197/SMB0DEN", + "GPIO198/SMB0DSDA", "GPIO199/SMB0DSCL", "GPIO200/R2CK", "GPIO201/R1CK", + "GPIO202/SMB0CSDA", "GPIO203/FANIN16", "GPIO204/DDC2SCL", "GPIO205/DDC2SDA", + "GPIO206/HSYNC2", "GPIO207/VSYNC2", "GPIO208/RG2TXC/DVCK", "GPIO209/RG2TXCTL/DDRV4", + "GPIO210/RG2RXD0/DDRV5", "GPIO211/RG2RXD1/DDRV6", "GPIO212/RG2RXD2/DDRV7", + "GPIO213/RG2RXD3/DDRV8", "GPIO214/RG2RXC/DDRV9", "GPIO215/RG2RXCTL/DDRV10", + "GPIO216/RG2MDC/DDRV11", "GPIO217/RG2MDIO/DVHSYNC", "GPIO218/nWDO1", + "GPIO219/nWDO2", "GPIO220/SMB12SCL", "GPIO221/SMB12SDA", "GPIO222/SMB13SCL", + "GPIO223/SMB13SDA", "GPIO224/SPIXCK", "GPO225/SPIXD0/STRAP12", "GPO226/SPIXD1/STRAP13", + "GPIO227/nSPIXCS0", "GPIO228/nSPIXCS1", "GPO229/SPIXD2/STRAP3", "GPIO230/SPIXD3", + "GPIO231/nCLKREQ", "GPI255/DACOSEL" + +Optional Properties: + bias-disable, bias-pull-down, bias-pull-up, input-enable, + input-disable, output-high, output-low, drive-push-pull, + drive-open-drain, input-debounce, slew-rate, drive-strength + + slew-rate valid arguments are: + <0> - slow + <1> - fast + drive-strength valid arguments are: + <2> - 2mA + <4> - 4mA + <8> - 8mA + <12> - 12mA + <16> - 16mA + <24> - 24mA + +For example, pinctrl might have pinmux subnodes like the following: + + gpio0_iox1d1_pin: gpio0-iox1d1-pin { + pins = "GPIO0/IOX1DI"; + output-high; + }; + gpio0_iox1ck_pin: gpio0-iox1ck-pin { + pins = "GPIO2/IOX1CK"; + output_high; + }; + +=== Pin Group Subnode === + +Required pin group subnode-properties: +- groups : A string containing the name of the group to mux. +- function: A string containing the name of the function to mux to the + group. + +The following are the list of the available groups and functions : + smb0, smb0b, smb0c, smb0d, smb0den, smb1, smb1b, smb1c, smb1d, + smb2, smb2b, smb2c, smb2d, smb3, smb3b, smb3c, smb3d, smb4, smb4b, + smb4c, smb4d, smb4den, smb5, smb5b, smb5c, smb5d, ga20kbc, smb6, + smb7, smb8, smb9, smb10, smb11, smb12, smb13, smb14, smb15, fanin0, + fanin1, fanin2, fanin3, fanin4, fanin5, fanin6, fanin7, fanin8, + fanin9, fanin10, fanin11 fanin12 fanin13, fanin14, fanin15, faninx, + pwm0, pwm1, pwm2, pwm3, pwm4, pwm5, pwm6, pwm7, rg1, rg1mdio, rg2, + rg2mdio, ddr, uart1, uart2, bmcuart0a, bmcuart0b, bmcuart1, iox1, + iox2, ioxh, gspi, mmc, mmcwp, mmccd, mmcrst, mmc8, r1, r1err, r1md, + r2, r2err, r2md, sd1, sd1pwr, wdog1, wdog2, scipme, sci, serirq, + jtag2, spix, spixcs1, pspi1, pspi2, ddc, clkreq, clkout, spi3, spi3cs1, + spi3quad, spi3cs2, spi3cs3, spi0cs1, lpc, lpcclk, espi, lkgpo0, lkgpo1, + lkgpo2, nprd_smi + +For example, pinctrl might have group subnodes like the following: + r1err_pins: r1err-pins { + groups = "r1err"; + function = "r1err"; + }; + r1md_pins: r1md-pins { + groups = "r1md"; + function = "r1md"; + }; + r1_pins: r1-pins { + groups = "r1"; + function = "r1"; + }; + +Examples +======== +pinctrl: pinctrl@f0800000 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "nuvoton,npcm750-pinctrl"; + ranges = <0 0xf0010000 0x8000>; + + gpio0: gpio@f0010000 { + gpio-controller; + #gpio-cells = <2>; + reg = <0x0 0x80>; + interrupts = ; + gpio-ranges = <&pinctrl 0 0 32>; + }; + + .... + + gpio7: gpio@f0017000 { + gpio-controller; + #gpio-cells = <2>; + reg = <0x7000 0x80>; + interrupts = ; + gpio-ranges = <&pinctrl 0 224 32>; + }; + + gpio0_iox1d1_pin: gpio0-iox1d1-pin { + pins = "GPIO0/IOX1DI"; + output-high; + }; + + iox1_pins: iox1-pins { + groups = "iox1"; + function = "iox1"; + }; + iox2_pins: iox2-pins { + groups = "iox2"; + function = "iox2"; + }; + + .... + + clkreq_pins: clkreq-pins { + groups = "clkreq"; + function = "clkreq"; + }; +}; \ No newline at end of file diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt b/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt index ffd4345415f3..ab4000eab07d 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt @@ -19,6 +19,7 @@ PMIC's from Qualcomm. "qcom,pm8998-gpio" "qcom,pma8084-gpio" "qcom,pmi8994-gpio" + "qcom,pms405-gpio" And must contain either "qcom,spmi-gpio" or "qcom,ssbi-gpio" if the device is on an spmi bus or an ssbi bus respectively @@ -91,6 +92,7 @@ to specify in a pin configuration subnode: gpio1-gpio26 for pm8998 gpio1-gpio22 for pma8084 gpio1-gpio10 for pmi8994 + gpio1-gpio11 for pms405 - function: Usage: required diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt new file mode 100644 index 000000000000..2b8f77762edc --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt @@ -0,0 +1,199 @@ +Qualcomm QCS404 TLMM block + +This binding describes the Top Level Mode Multiplexer block found in the +QCS404 platform. + +- compatible: + Usage: required + Value type: + Definition: must be "qcom,qcs404-pinctrl" + +- reg: + Usage: required + Value type: + Definition: the base address and size of the north, south and east TLMM + tiles. + +- reg-names: + Usage: required + Value type: + Defintiion: names for the cells of reg, must contain "north", "south" + and "east". + +- interrupts: + Usage: required + Value type: + Definition: should specify the TLMM summary IRQ. + +- interrupt-controller: + Usage: required + Value type: + Definition: identifies this node as an interrupt controller + +- #interrupt-cells: + Usage: required + Value type: + Definition: must be 2. Specifying the pin number and flags, as defined + in + +- gpio-controller: + Usage: required + Value type: + Definition: identifies this node as a gpio controller + +- #gpio-cells: + Usage: required + Value type: + Definition: must be 2. Specifying the pin number and flags, as defined + in + +- gpio-ranges: + Usage: required + Definition: see ../gpio/gpio.txt + +Please refer to ../gpio/gpio.txt and ../interrupt-controller/interrupts.txt for +a general description of GPIO and interrupt bindings. + +Please refer to pinctrl-bindings.txt in this directory for details of the +common pinctrl bindings used by client devices, including the meaning of the +phrase "pin configuration node". + +The pin configuration nodes act as a container for an arbitrary number of +subnodes. Each of these subnodes represents some desired configuration for a +pin, a group, or a list of pins or groups. This configuration can include the +mux function to select on those pin(s)/group(s), and various pin configuration +parameters, such as pull-up, drive strength, etc. + + +PIN CONFIGURATION NODES: + +The name of each subnode is not important; all subnodes should be enumerated +and processed purely based on their content. + +Each subnode only affects those parameters that are explicitly listed. In +other words, a subnode that lists a mux function but no pin configuration +parameters implies no information about any pin configuration parameters. +Similarly, a pin subnode that describes a pullup parameter implies no +information about e.g. the mux function. + + +The following generic properties as defined in pinctrl-bindings.txt are valid +to specify in a pin configuration subnode: + +- pins: + Usage: required + Value type: + Definition: List of gpio pins affected by the properties specified in + this subnode. + + Valid pins are: + gpio0-gpio119 + Supports mux, bias and drive-strength + + sdc1_clk, sdc1_cmd, sdc1_data, sdc2_clk, sdc2_cmd, + sdc2_data + Supports bias and drive-strength + + ufs_reset + Supports bias and drive-strength + +- function: + Usage: required + Value type: + Definition: Specify the alternative function to be configured for the + specified pins. Functions are only valid for gpio pins. + Valid values are: + + gpio, hdmi_tx, hdmi_ddc, blsp_uart_tx_a2, blsp_spi2, m_voc, + qdss_cti_trig_in_a0, blsp_uart_rx_a2, qdss_tracectl_a, + blsp_uart2, aud_cdc, blsp_i2c_sda_a2, qdss_tracedata_a, + blsp_i2c_scl_a2, qdss_tracectl_b, qdss_cti_trig_in_b0, + blsp_uart1, blsp_spi_mosi_a1, blsp_spi_miso_a1, + qdss_tracedata_b, blsp_i2c1, blsp_spi_cs_n_a1, gcc_plltest, + blsp_spi_clk_a1, rgb_data0, blsp_uart5, blsp_spi5, + adsp_ext, rgb_data1, prng_rosc, rgb_data2, blsp_i2c5, + gcc_gp1_clk_b, rgb_data3, gcc_gp2_clk_b, blsp_spi0, + blsp_uart0, gcc_gp3_clk_b, blsp_i2c0, qdss_traceclk_b, + pcie_clk, nfc_irq, blsp_spi4, nfc_dwl, audio_ts, rgb_data4, + spi_lcd, blsp_uart_tx_b2, gcc_gp3_clk_a, rgb_data5, + blsp_uart_rx_b2, blsp_i2c_sda_b2, blsp_i2c_scl_b2, + pwm_led11, i2s_3_data0_a, ebi2_lcd, i2s_3_data1_a, + i2s_3_data2_a, atest_char, pwm_led3, i2s_3_data3_a, + pwm_led4, i2s_4, ebi2_a, dsd_clk_b, pwm_led5, pwm_led6, + pwm_led7, pwm_led8, pwm_led24, spkr_dac0, blsp_i2c4, + pwm_led9, pwm_led10, spdifrx_opt, pwm_led12, pwm_led13, + pwm_led14, wlan1_adc1, rgb_data_b0, pwm_led15, + blsp_spi_mosi_b1, wlan1_adc0, rgb_data_b1, pwm_led16, + blsp_spi_miso_b1, qdss_cti_trig_out_b0, wlan2_adc1, + rgb_data_b2, pwm_led17, blsp_spi_cs_n_b1, wlan2_adc0, + rgb_data_b3, pwm_led18, blsp_spi_clk_b1, rgb_data_b4, + pwm_led19, ext_mclk1_b, qdss_traceclk_a, rgb_data_b5, + pwm_led20, atest_char3, i2s_3_sck_b, ldo_update, bimc_dte0, + rgb_hsync, pwm_led21, i2s_3_ws_b, dbg_out, rgb_vsync, + i2s_3_data0_b, ldo_en, hdmi_dtest, rgb_de, i2s_3_data1_b, + hdmi_lbk9, rgb_clk, atest_char1, i2s_3_data2_b, ebi_cdc, + hdmi_lbk8, rgb_mdp, atest_char0, i2s_3_data3_b, hdmi_lbk7, + rgb_data_b6, rgb_data_b7, hdmi_lbk6, rgmii_int, cri_trng1, + rgmii_wol, cri_trng0, gcc_tlmm, rgmii_ck, rgmii_tx, + hdmi_lbk5, hdmi_pixel, hdmi_rcv, hdmi_lbk4, rgmii_ctl, + ext_lpass, rgmii_rx, cri_trng, hdmi_lbk3, hdmi_lbk2, + qdss_cti_trig_out_b1, rgmii_mdio, hdmi_lbk1, rgmii_mdc, + hdmi_lbk0, ir_in, wsa_en, rgb_data6, rgb_data7, + atest_char2, ebi_ch0, blsp_uart3, blsp_spi3, sd_write, + blsp_i2c3, gcc_gp1_clk_a, qdss_cti_trig_in_b1, + gcc_gp2_clk_a, ext_mclk0, mclk_in1, i2s_1, dsd_clk_a, + qdss_cti_trig_in_a1, rgmi_dll1, pwm_led22, pwm_led23, + qdss_cti_trig_out_a0, rgmi_dll2, pwm_led1, + qdss_cti_trig_out_a1, pwm_led2, i2s_2, pll_bist, + ext_mclk1_a, mclk_in2, bimc_dte1, i2s_3_sck_a, i2s_3_ws_a + +- bias-disable: + Usage: optional + Value type: + Definition: The specified pins should be configued as no pull. + +- bias-pull-down: + Usage: optional + Value type: + Definition: The specified pins should be configued as pull down. + +- bias-pull-up: + Usage: optional + Value type: + Definition: The specified pins should be configued as pull up. + +- output-high: + Usage: optional + Value type: + Definition: The specified pins are configured in output mode, driven + high. + Not valid for sdc pins. + +- output-low: + Usage: optional + Value type: + Definition: The specified pins are configured in output mode, driven + low. + Not valid for sdc pins. + +- drive-strength: + Usage: optional + Value type: + Definition: Selects the drive strength for the specified pins, in mA. + Valid values are: 2, 4, 6, 8, 10, 12, 14 and 16 + +Example: + + tlmm: pinctrl@1000000 { + compatible = "qcom,qcs404-pinctrl"; + reg = <0x01000000 0x200000>, + <0x01300000 0x200000>, + <0x07b00000 0x200000>; + reg-names = "south", "north", "east"; + interrupts = ; + gpio-controller; + #gpio-cells = <2>; + gpio-ranges = <&tlmm 0 0 120>; + interrupt-controller; + #interrupt-cells = <2>; + }; diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt new file mode 100644 index 000000000000..769ca83bb40d --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt @@ -0,0 +1,191 @@ +Qualcomm Technologies, Inc. SDM660 TLMM block + +This binding describes the Top Level Mode Multiplexer block found in the +SDM660 platform. + +- compatible: + Usage: required + Value type: + Definition: must be "qcom,sdm660-pinctrl" or + "qcom,sdm630-pinctrl". + +- reg: + Usage: required + Value type: + Definition: the base address and size of the north, center and south + TLMM tiles. + +- reg-names: + Usage: required + Value type: + Definition: names for the cells of reg, must contain "north", "center" + and "south". + +- interrupts: + Usage: required + Value type: + Definition: should specify the TLMM summary IRQ. + +- interrupt-controller: + Usage: required + Value type: + Definition: identifies this node as an interrupt controller + +- #interrupt-cells: + Usage: required + Value type: + Definition: must be 2. Specifying the pin number and flags, as defined + in + +- gpio-controller: + Usage: required + Value type: + Definition: identifies this node as a gpio controller + +- gpio-ranges: + Usage: required + Value type: + Definition: Specifies the mapping between gpio controller and + pin-controller pins. + +- #gpio-cells: + Usage: required + Value type: + Definition: must be 2. Specifying the pin number and flags, as defined + in + +Please refer to ../gpio/gpio.txt and ../interrupt-controller/interrupts.txt for +a general description of GPIO and interrupt bindings. + +Please refer to pinctrl-bindings.txt in this directory for details of the +common pinctrl bindings used by client devices, including the meaning of the +phrase "pin configuration node". + +The pin configuration nodes act as a container for an arbitrary number of +subnodes. Each of these subnodes represents some desired configuration for a +pin, a group, or a list of pins or groups. This configuration can include the +mux function to select on those pin(s)/group(s), and various pin configuration +parameters, such as pull-up, drive strength, etc. + + +PIN CONFIGURATION NODES: + +The name of each subnode is not important; all subnodes should be enumerated +and processed purely based on their content. + +Each subnode only affects those parameters that are explicitly listed. In +other words, a subnode that lists a mux function but no pin configuration +parameters implies no information about any pin configuration parameters. +Similarly, a pin subnode that describes a pullup parameter implies no +information about e.g. the mux function. + + +The following generic properties as defined in pinctrl-bindings.txt are valid +to specify in a pin configuration subnode: + +- pins: + Usage: required + Value type: + Definition: List of gpio pins affected by the properties specified in + this subnode. Valid pins are: + gpio0-gpio113, + Supports mux, bias and drive-strength + sdc1_clk, sdc1_cmd, sdc1_data sdc2_clk, sdc2_cmd, sdc2_data sdc1_rclk, + Supports bias and drive-strength + +- function: + Usage: required + Value type: + Definition: Specify the alternative function to be configured for the + specified pins. Functions are only valid for gpio pins. + Valid values are: + adsp_ext, agera_pll, atest_char, atest_char0, atest_char1, + atest_char2, atest_char3, atest_gpsadc0, atest_gpsadc1, + atest_tsens, atest_tsens2, atest_usb1, atest_usb10, + atest_usb11, atest_usb12, atest_usb13, atest_usb2, + atest_usb20, atest_usb21, atest_usb22, atest_usb23, + audio_ref, bimc_dte0, bimc_dte1, blsp_i2c1, blsp_i2c2, + blsp_i2c3, blsp_i2c4, blsp_i2c5, blsp_i2c6, blsp_i2c7, + blsp_i2c8_a, blsp_i2c8_b, blsp_spi1, blsp_spi2, blsp_spi3, + blsp_spi3_cs1, blsp_spi3_cs2, blsp_spi4, blsp_spi5, + blsp_spi6, blsp_spi7, blsp_spi8_a, blsp_spi8_b, + blsp_spi8_cs1, blsp_spi8_cs2, blsp_uart1, blsp_uart2, + blsp_uart5, blsp_uart6_a, blsp_uart6_b, blsp_uim1, + blsp_uim2, blsp_uim5, blsp_uim6, cam_mclk, cci_async, + cci_i2c, cri_trng, cri_trng0, cri_trng1, dbg_out, ddr_bist, + gcc_gp1, gcc_gp2, gcc_gp3, gpio, gps_tx_a, gps_tx_b, gps_tx_c, + isense_dbg, jitter_bist, ldo_en, ldo_update, m_voc, mdp_vsync, + mdss_vsync0, mdss_vsync1, mdss_vsync2, mdss_vsync3, mss_lte, + nav_pps_a, nav_pps_b, nav_pps_c, pa_indicator, phase_flag0, + phase_flag1, phase_flag10, phase_flag11, phase_flag12, + phase_flag13, phase_flag14, phase_flag15, phase_flag16, + phase_flag17, phase_flag18, phase_flag19, phase_flag2, + phase_flag20, phase_flag21, phase_flag22, phase_flag23, + phase_flag24, phase_flag25, phase_flag26, phase_flag27, + phase_flag28, phase_flag29, phase_flag3, phase_flag30, + phase_flag31, phase_flag4, phase_flag5, phase_flag6, + phase_flag7, phase_flag8, phase_flag9, pll_bypassnl, + pll_reset, pri_mi2s, pri_mi2s_ws, prng_rosc, pwr_crypto, + pwr_modem, pwr_nav, qdss_cti0_a, qdss_cti0_b, qdss_cti1_a, + qdss_cti1_b, qdss_gpio, qdss_gpio0, qdss_gpio1, qdss_gpio10, + qdss_gpio11, qdss_gpio12, qdss_gpio13, qdss_gpio14, qdss_gpio15, + qdss_gpio2, qdss_gpio3, qdss_gpio4, qdss_gpio5, qdss_gpio6, + qdss_gpio7, qdss_gpio8, qdss_gpio9, qlink_enable, qlink_request, + qspi_clk, qspi_cs, qspi_data0, qspi_data1, qspi_data2, + qspi_data3, qspi_resetn, sec_mi2s, sndwire_clk, sndwire_data, + sp_cmu, ssc_irq, tgu_ch0, tgu_ch1, tsense_pwm1, tsense_pwm2, + uim1_clk, uim1_data, uim1_present, uim1_reset, uim2_clk, + uim2_data, uim2_present, uim2_reset, uim_batt, vfr_1, + vsense_clkout, vsense_data0, vsense_data1, vsense_mode, + wlan1_adc0, wlan1_adc1, wlan2_adc0, wlan2_adc1 + +- bias-disable: + Usage: optional + Value type: + Definition: The specified pins should be configued as no pull. + +- bias-pull-down: + Usage: optional + Value type: + Definition: The specified pins should be configued as pull down. + +- bias-pull-up: + Usage: optional + Value type: + Definition: The specified pins should be configued as pull up. + +- output-high: + Usage: optional + Value type: + Definition: The specified pins are configured in output mode, driven + high. + Not valid for sdc pins. + +- output-low: + Usage: optional + Value type: + Definition: The specified pins are configured in output mode, driven + low. + Not valid for sdc pins. + +- drive-strength: + Usage: optional + Value type: + Definition: Selects the drive strength for the specified pins, in mA. + Valid values are: 2, 4, 6, 8, 10, 12, 14 and 16 + +Example: + + tlmm: pinctrl@3100000 { + compatible = "qcom,sdm660-pinctrl"; + reg = <0x3100000 0x200000>, + <0x3500000 0x200000>, + <0x3900000 0x200000>; + reg-names = "south", "center", "north"; + interrupts = ; + gpio-controller; + gpio-ranges = <&tlmm 0 0 114>; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + }; diff --git a/Documentation/devicetree/bindings/pinctrl/renesas,pfc-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/renesas,pfc-pinctrl.txt index abd8fbcf1e62..3902efa18fd0 100644 --- a/Documentation/devicetree/bindings/pinctrl/renesas,pfc-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/renesas,pfc-pinctrl.txt @@ -14,8 +14,11 @@ Required Properties: - "renesas,pfc-r8a73a4": for R8A73A4 (R-Mobile APE6) compatible pin-controller. - "renesas,pfc-r8a7740": for R8A7740 (R-Mobile A1) compatible pin-controller. - "renesas,pfc-r8a7743": for R8A7743 (RZ/G1M) compatible pin-controller. + - "renesas,pfc-r8a7744": for R8A7744 (RZ/G1N) compatible pin-controller. - "renesas,pfc-r8a7745": for R8A7745 (RZ/G1E) compatible pin-controller. - "renesas,pfc-r8a77470": for R8A77470 (RZ/G1C) compatible pin-controller. + - "renesas,pfc-r8a774a1": for R8A774A1 (RZ/G2M) compatible pin-controller. + - "renesas,pfc-r8a774c0": for R8A774C0 (RZ/G2E) compatible pin-controller. - "renesas,pfc-r8a7778": for R8A7778 (R-Car M1) compatible pin-controller. - "renesas,pfc-r8a7779": for R8A7779 (R-Car H1) compatible pin-controller. - "renesas,pfc-r8a7790": for R8A7790 (R-Car H2) compatible pin-controller. diff --git a/Documentation/devicetree/bindings/pinctrl/renesas,rzn1-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/renesas,rzn1-pinctrl.txt new file mode 100644 index 000000000000..25e53acd523e --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/renesas,rzn1-pinctrl.txt @@ -0,0 +1,153 @@ +Renesas RZ/N1 SoC Pinctrl node description. + +Pin controller node +------------------- +Required properties: +- compatible: SoC-specific compatible string "renesas,-pinctrl" + followed by "renesas,rzn1-pinctrl" as fallback. The SoC-specific compatible + strings must be one of: + "renesas,r9a06g032-pinctrl" for RZ/N1D + "renesas,r9a06g033-pinctrl" for RZ/N1S +- reg: Address base and length of the memory area where the pin controller + hardware is mapped to. +- clocks: phandle for the clock, see the description of clock-names below. +- clock-names: Contains the name of the clock: + "bus", the bus clock, sometimes described as pclk, for register accesses. + +Example: + pinctrl: pin-controller@40067000 { + compatible = "renesas,r9a06g032-pinctrl", "renesas,rzn1-pinctrl"; + reg = <0x40067000 0x1000>, <0x51000000 0x480>; + clocks = <&sysctrl R9A06G032_HCLK_PINCONFIG>; + clock-names = "bus"; + }; + +Sub-nodes +--------- + +The child nodes of the pin controller node describe a pin multiplexing +function. + +- Pin multiplexing sub-nodes: + A pin multiplexing sub-node describes how to configure a set of + (or a single) pin in some desired alternate function mode. + A single sub-node may define several pin configurations. + Please refer to pinctrl-bindings.txt to get to know more on generic + pin properties usage. + + The allowed generic formats for a pin multiplexing sub-node are the + following ones: + + node-1 { + pinmux = , , ... ; + GENERIC_PINCONFIG; + }; + + node-2 { + sub-node-1 { + pinmux = , , ... ; + GENERIC_PINCONFIG; + }; + + sub-node-2 { + pinmux = , , ... ; + GENERIC_PINCONFIG; + }; + + ... + + sub-node-n { + pinmux = , , ... ; + GENERIC_PINCONFIG; + }; + }; + + node-3 { + pinmux = , , ... ; + GENERIC_PINCONFIG; + + sub-node-1 { + pinmux = , , ... ; + GENERIC_PINCONFIG; + }; + + ... + + sub-node-n { + pinmux = , , ... ; + GENERIC_PINCONFIG; + }; + }; + + Use the latter two formats when pins part of the same logical group need to + have different generic pin configuration flags applied. Note that the generic + pinconfig in node-3 does not apply to the sub-nodes. + + Client sub-nodes shall refer to pin multiplexing sub-nodes using the phandle + of the most external one. + + Eg. + + client-1 { + ... + pinctrl-0 = <&node-1>; + ... + }; + + client-2 { + ... + pinctrl-0 = <&node-2>; + ... + }; + + Required properties: + - pinmux: + integer array representing pin number and pin multiplexing configuration. + When a pin has to be configured in alternate function mode, use this + property to identify the pin by its global index, and provide its + alternate function configuration number along with it. + When multiple pins are required to be configured as part of the same + alternate function they shall be specified as members of the same + argument list of a single "pinmux" property. + Integers values in the "pinmux" argument list are assembled as: + (PIN | MUX_FUNC << 8) + where PIN directly corresponds to the pl_gpio pin number and MUX_FUNC is + one of the alternate function identifiers defined in: + + These identifiers collapse the IO Multiplex Configuration Level 1 and + Level 2 numbers that are detailed in the hardware reference manual into a + single number. The identifiers for Level 2 are simply offset by 10. + Additional identifiers are provided to specify the MDIO source peripheral. + + Optional generic pinconf properties: + - bias-disable - disable any pin bias + - bias-pull-up - pull up the pin with 50 KOhm + - bias-pull-down - pull down the pin with 50 KOhm + - bias-high-impedance - high impedance mode + - drive-strength - sink or source at most 4, 6, 8 or 12 mA + + Example: + A serial communication interface with a TX output pin and an RX input pin. + + &pinctrl { + pins_uart0: pins_uart0 { + pinmux = < + RZN1_PINMUX(103, RZN1_FUNC_UART0_I) /* UART0_TXD */ + RZN1_PINMUX(104, RZN1_FUNC_UART0_I) /* UART0_RXD */ + >; + }; + }; + + Example 2: + Here we set the pull up on the RXD pin of the UART. + + &pinctrl { + pins_uart0: pins_uart0 { + pinmux = ; /* TXD */ + + pins_uart6_rx { + pinmux = ; /* RXD */ + bias-pull-up; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/power/actions,owl-sps.txt b/Documentation/devicetree/bindings/power/actions,owl-sps.txt index 78edd63641e8..a3571937b019 100644 --- a/Documentation/devicetree/bindings/power/actions,owl-sps.txt +++ b/Documentation/devicetree/bindings/power/actions,owl-sps.txt @@ -3,11 +3,13 @@ Actions Semi Owl Smart Power System (SPS) Required properties: - compatible : "actions,s500-sps" for S500 "actions,s700-sps" for S700 + "actions,s900-sps" for S900 - reg : Offset and length of the register set for the device. - #power-domain-cells : Must be 1. See macros in: include/dt-bindings/power/owl-s500-powergate.h for S500 include/dt-bindings/power/owl-s700-powergate.h for S700 + include/dt-bindings/power/owl-s900-powergate.h for S900 Example: diff --git a/Documentation/devicetree/bindings/power/renesas,apmu.txt b/Documentation/devicetree/bindings/power/renesas,apmu.txt index f747f95eee58..5f24586c8cf3 100644 --- a/Documentation/devicetree/bindings/power/renesas,apmu.txt +++ b/Documentation/devicetree/bindings/power/renesas,apmu.txt @@ -8,7 +8,9 @@ Required properties: - compatible: Should be "renesas,-apmu", "renesas,apmu" as fallback. Examples with soctypes are: - "renesas,r8a7743-apmu" (RZ/G1M) + - "renesas,r8a7744-apmu" (RZ/G1N) - "renesas,r8a7745-apmu" (RZ/G1E) + - "renesas,r8a77470-apmu" (RZ/G1C) - "renesas,r8a7790-apmu" (R-Car H2) - "renesas,r8a7791-apmu" (R-Car M2-W) - "renesas,r8a7792-apmu" (R-Car V2H) diff --git a/Documentation/devicetree/bindings/power/renesas,rcar-sysc.txt b/Documentation/devicetree/bindings/power/renesas,rcar-sysc.txt index 180ae65be753..eae2a880155a 100644 --- a/Documentation/devicetree/bindings/power/renesas,rcar-sysc.txt +++ b/Documentation/devicetree/bindings/power/renesas,rcar-sysc.txt @@ -8,8 +8,11 @@ and various coprocessors. Required properties: - compatible: Must contain exactly one of the following: - "renesas,r8a7743-sysc" (RZ/G1M) + - "renesas,r8a7744-sysc" (RZ/G1N) - "renesas,r8a7745-sysc" (RZ/G1E) - "renesas,r8a77470-sysc" (RZ/G1C) + - "renesas,r8a774a1-sysc" (RZ/G2M) + - "renesas,r8a774c0-sysc" (RZ/G2E) - "renesas,r8a7779-sysc" (R-Car H1) - "renesas,r8a7790-sysc" (R-Car H2) - "renesas,r8a7791-sysc" (R-Car M2-W) diff --git a/Documentation/devicetree/bindings/power/reset/qcom,pon.txt b/Documentation/devicetree/bindings/power/reset/qcom,pon.txt index 651491bb63b7..5705f575862d 100644 --- a/Documentation/devicetree/bindings/power/reset/qcom,pon.txt +++ b/Documentation/devicetree/bindings/power/reset/qcom,pon.txt @@ -6,7 +6,10 @@ and resin along with the Android reboot-mode. This DT node has pwrkey and resin as sub nodes. Required Properties: --compatible: "qcom,pm8916-pon" +-compatible: Must be one of: + "qcom,pm8916-pon" + "qcom,pms405-pon" + -reg: Specifies the physical address of the pon register Optional subnode: diff --git a/Documentation/devicetree/bindings/power/supply/bq25890.txt b/Documentation/devicetree/bindings/power/supply/bq25890.txt index c9dd17d142ad..dc0568933359 100644 --- a/Documentation/devicetree/bindings/power/supply/bq25890.txt +++ b/Documentation/devicetree/bindings/power/supply/bq25890.txt @@ -1,5 +1,8 @@ Binding for TI bq25890 Li-Ion Charger +This driver will support the bq25896 and the bq25890. There are other ICs +in the same family but those have not been tested. + Required properties: - compatible: Should contain one of the following: * "ti,bq25890" diff --git a/Documentation/devicetree/bindings/power/supply/bq27xxx.txt b/Documentation/devicetree/bindings/power/supply/bq27xxx.txt index 37994fdb18ca..4fa8e08df2b6 100644 --- a/Documentation/devicetree/bindings/power/supply/bq27xxx.txt +++ b/Documentation/devicetree/bindings/power/supply/bq27xxx.txt @@ -23,6 +23,7 @@ Required properties: * "ti,bq27546" - BQ27546 * "ti,bq27742" - BQ27742 * "ti,bq27545" - BQ27545 + * "ti,bq27411" - BQ27411 * "ti,bq27421" - BQ27421 * "ti,bq27425" - BQ27425 * "ti,bq27426" - BQ27426 diff --git a/Documentation/devicetree/bindings/power/supply/sc2731_charger.txt b/Documentation/devicetree/bindings/power/supply/sc2731_charger.txt new file mode 100644 index 000000000000..5266fab16575 --- /dev/null +++ b/Documentation/devicetree/bindings/power/supply/sc2731_charger.txt @@ -0,0 +1,40 @@ +Spreadtrum SC2731 PMIC battery charger binding + +Required properties: + - compatible: Should be "sprd,sc2731-charger". + - reg: Address offset of charger register. + - phys: Contains a phandle to the USB phy. + +Optional Properties: +- monitored-battery: phandle of battery characteristics devicetree node. + The charger uses the following battery properties: +- charge-term-current-microamp: current for charge termination phase. +- constant-charge-voltage-max-microvolt: maximum constant input voltage. + See Documentation/devicetree/bindings/power/supply/battery.txt + +Example: + + bat: battery { + compatible = "simple-battery"; + charge-term-current-microamp = <120000>; + constant-charge-voltage-max-microvolt = <4350000>; + ...... + }; + + sc2731_pmic: pmic@0 { + compatible = "sprd,sc2731"; + reg = <0>; + spi-max-frequency = <26000000>; + interrupts = ; + interrupt-controller; + #interrupt-cells = <2>; + #address-cells = <1>; + #size-cells = <0>; + + charger@0 { + compatible = "sprd,sc2731-charger"; + reg = <0x0>; + phys = <&ssphy>; + monitored-battery = <&bat>; + }; + }; diff --git a/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt b/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt index 06a363d9ccef..b9a1d7402128 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt +++ b/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt @@ -7,6 +7,7 @@ Required properties: for da850 - compatible = "ti,da850-ecap", "ti,am3352-ecap", "ti,am33xx-ecap"; for dra746 - compatible = "ti,dra746-ecap", "ti,am3352-ecap"; for 66ak2g - compatible = "ti,k2g-ecap", "ti,am3352-ecap"; + for am654 - compatible = "ti,am654-ecap", "ti,am3352-ecap"; - #pwm-cells: should be 3. See pwm.txt in this directory for a description of the cells format. The PWM channel index ranges from 0 to 4. The only third cell flag supported by this binding is PWM_POLARITY_INVERTED. diff --git a/Documentation/devicetree/bindings/pwm/renesas,pwm-rcar.txt b/Documentation/devicetree/bindings/pwm/renesas,pwm-rcar.txt index e1ef6afbe3a7..7f31fe7e2093 100644 --- a/Documentation/devicetree/bindings/pwm/renesas,pwm-rcar.txt +++ b/Documentation/devicetree/bindings/pwm/renesas,pwm-rcar.txt @@ -3,7 +3,9 @@ Required Properties: - compatible: should be "renesas,pwm-rcar" and one of the following. - "renesas,pwm-r8a7743": for RZ/G1M + - "renesas,pwm-r8a7744": for RZ/G1N - "renesas,pwm-r8a7745": for RZ/G1E + - "renesas,pwm-r8a774a1": for RZ/G2M - "renesas,pwm-r8a7778": for R-Car M1A - "renesas,pwm-r8a7779": for R-Car H1 - "renesas,pwm-r8a7790": for R-Car H2 @@ -12,6 +14,8 @@ Required Properties: - "renesas,pwm-r8a7795": for R-Car H3 - "renesas,pwm-r8a7796": for R-Car M3-W - "renesas,pwm-r8a77965": for R-Car M3-N + - "renesas,pwm-r8a77970": for R-Car V3M + - "renesas,pwm-r8a77980": for R-Car V3H - "renesas,pwm-r8a77990": for R-Car E3 - "renesas,pwm-r8a77995": for R-Car D3 - reg: base address and length of the registers block for the PWM. diff --git a/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt b/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt index d53a16715da6..848a92b53d81 100644 --- a/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt +++ b/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt @@ -2,13 +2,19 @@ Required Properties: - - compatible: should be one of the following. + - compatible: must contain one or more of the following: - "renesas,tpu-r8a73a4": for R8A73A4 (R-Mobile APE6) compatible PWM controller. - "renesas,tpu-r8a7740": for R8A7740 (R-Mobile A1) compatible PWM controller. - "renesas,tpu-r8a7743": for R8A7743 (RZ/G1M) compatible PWM controller. + - "renesas,tpu-r8a7744": for R8A7744 (RZ/G1N) compatible PWM controller. - "renesas,tpu-r8a7745": for R8A7745 (RZ/G1E) compatible PWM controller. - "renesas,tpu-r8a7790": for R8A7790 (R-Car H2) compatible PWM controller. - - "renesas,tpu": for generic R-Car and RZ/G1 TPU PWM controller. + - "renesas,tpu-r8a77970": for R8A77970 (R-Car V3M) compatible PWM + controller. + - "renesas,tpu-r8a77980": for R8A77980 (R-Car V3H) compatible PWM + controller. + - "renesas,tpu": for the generic TPU PWM controller; this is a fallback for + the entries listed above. - reg: Base address and length of each memory resource used by the PWM controller hardware module. diff --git a/Documentation/devicetree/bindings/regulator/pfuze100.txt b/Documentation/devicetree/bindings/regulator/pfuze100.txt index c7610718adff..f9be1acf891c 100644 --- a/Documentation/devicetree/bindings/regulator/pfuze100.txt +++ b/Documentation/devicetree/bindings/regulator/pfuze100.txt @@ -12,6 +12,11 @@ Optional properties: disabled. This binding is a workaround to keep backward compatibility with old dtb's which rely on the fact that the switched regulators are always on and don't mark them explicit as "regulator-always-on". +- fsl,pmic-stby-poweroff: if present, configure the PMIC to shutdown all + power rails when PMIC_STBY_REQ line is asserted during the power off sequence. + Use this option if the SoC should be powered off by external power + management IC (PMIC) on PMIC_STBY_REQ signal. + As opposite to PMIC_STBY_REQ boards can implement PMIC_ON_REQ signal. Required child node: - regulators: This is the list of child nodes that specify the regulator diff --git a/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.txt b/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.txt index 58a1d97972f5..45025b5b67f6 100644 --- a/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.txt +++ b/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.txt @@ -26,6 +26,7 @@ Regulator nodes are identified by their compatible: "qcom,rpm-pm8998-regulators" "qcom,rpm-pma8084-regulators" "qcom,rpm-pmi8998-regulators" + "qcom,rpm-pms405-regulators" - vdd_s1-supply: - vdd_s2-supply: @@ -188,6 +189,24 @@ Regulator nodes are identified by their compatible: Definition: reference to regulator supplying the input pin, as described in the data sheet +- vdd_s1-supply: +- vdd_s2-supply: +- vdd_s3-supply: +- vdd_s4-supply: +- vdd_s5-supply: +- vdd_l1_l2-supply: +- vdd_l3_l8-supply: +- vdd_l4-supply: +- vdd_l5_l6-supply: +- vdd_l7-supply: +- vdd_l3_l8-supply: +- vdd_l9-supply: +- vdd_l10_l11_l12_l13-supply: + Usage: optional (pms405 only) + Value type: + Definition: reference to regulator supplying the input pin, as + described in the data sheet + The regulator node houses sub-nodes for each regulator within the device. Each sub-node is identified using the node's name, with valid values listed for each of the pmics below. @@ -222,6 +241,10 @@ pma8084: pmi8998: bob +pms405: + s1, s2, s3, s4, s5, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, + l13 + The content of each sub-node is defined by the standard binding for regulators - see regulator.txt. diff --git a/Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.txt b/Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.txt index 76ead07072b1..4b98ca26e61a 100644 --- a/Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.txt +++ b/Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.txt @@ -1,7 +1,9 @@ -ROHM BD71837 Power Management Integrated Circuit (PMIC) regulator bindings +ROHM BD71837 and BD71847 Power Management Integrated Circuit regulator bindings Required properties: - - regulator-name: should be "buck1", ..., "buck8" and "ldo1", ..., "ldo7" + - regulator-name: should be "buck1", ..., "buck8" and "ldo1", ..., "ldo7" for + BD71837. For BD71847 names should be "buck1", ..., "buck6" + and "ldo1", ..., "ldo6" List of regulators provided by this controller. BD71837 regulators node should be sub node of the BD71837 MFD node. See BD71837 MFD bindings at @@ -16,10 +18,14 @@ disabled by driver at startup. LDO5 and LDO6 are supplied by those and if they are disabled at startup the voltage monitoring for LDO5/LDO6 will cause PMIC to reset. -The valid names for regulator nodes are: +The valid names for BD71837 regulator nodes are: BUCK1, BUCK2, BUCK3, BUCK4, BUCK5, BUCK6, BUCK7, BUCK8 LDO1, LDO2, LDO3, LDO4, LDO5, LDO6, LDO7 +The valid names for BD71847 regulator nodes are: +BUCK1, BUCK2, BUCK3, BUCK4, BUCK5, BUCK6 +LDO1, LDO2, LDO3, LDO4, LDO5, LDO6 + Optional properties: - Any optional property defined in bindings/regulator/regulator.txt diff --git a/Documentation/devicetree/bindings/regulator/st,stpmic1-regulator.txt b/Documentation/devicetree/bindings/regulator/st,stpmic1-regulator.txt new file mode 100644 index 000000000000..a3f476240565 --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/st,stpmic1-regulator.txt @@ -0,0 +1,68 @@ +STMicroelectronics STPMIC1 Voltage regulators + +Regulator Nodes are optional depending on needs. + +Available Regulators in STPMIC1 device are: + - buck1 for Buck BUCK1 + - buck2 for Buck BUCK2 + - buck3 for Buck BUCK3 + - buck4 for Buck BUCK4 + - ldo1 for LDO LDO1 + - ldo2 for LDO LDO2 + - ldo3 for LDO LDO3 + - ldo4 for LDO LDO4 + - ldo5 for LDO LDO5 + - ldo6 for LDO LDO6 + - vref_ddr for LDO Vref DDR + - boost for Buck BOOST + - pwr_sw1 for VBUS_OTG switch + - pwr_sw2 for SW_OUT switch + +Switches are fixed voltage regulators with only enable/disable capability. + +Optional properties: +- st,mask-reset: mask reset for this regulator: the regulator configuration + is maintained during pmic reset. +- regulator-pull-down: enable high pull down + if not specified light pull down is used +- regulator-over-current-protection: + if set, all regulators are switched off in case of over-current detection + on this regulator, + if not set, the driver only sends an over-current event. +- interrupt-parent: phandle to the parent interrupt controller +- interrupts: index of current limit detection interrupt +- -supply: phandle to the parent supply/regulator node + each regulator supply can be described except vref_ddr. + +Example: +regulators { + compatible = "st,stpmic1-regulators"; + + ldo6-supply = <&v3v3>; + + vdd_core: buck1 { + regulator-name = "vdd_core"; + interrupts = ; + interrupt-parent = <&pmic>; + st,mask-reset; + regulator-pull-down; + regulator-min-microvolt = <700000>; + regulator-max-microvolt = <1200000>; + }; + + v3v3: buck4 { + regulator-name = "v3v3"; + interrupts = ; + interrupt-parent = <&mypmic>; + + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + }; + + v1v8: ldo6 { + regulator-name = "v1v8"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-over-current-protection; + }; +}; diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,adsp-pil.txt b/Documentation/devicetree/bindings/remoteproc/qcom,adsp-pil.txt new file mode 100644 index 000000000000..a842a782b557 --- /dev/null +++ b/Documentation/devicetree/bindings/remoteproc/qcom,adsp-pil.txt @@ -0,0 +1,126 @@ +Qualcomm Technology Inc. ADSP Peripheral Image Loader + +This document defines the binding for a component that loads and boots firmware +on the Qualcomm Technology Inc. ADSP Hexagon core. + +- compatible: + Usage: required + Value type: + Definition: must be one of: + "qcom,sdm845-adsp-pil" + +- reg: + Usage: required + Value type: + Definition: must specify the base address and size of the qdsp6ss register + +- interrupts-extended: + Usage: required + Value type: + Definition: must list the watchdog, fatal IRQs ready, handover and + stop-ack IRQs + +- interrupt-names: + Usage: required + Value type: + Definition: must be "wdog", "fatal", "ready", "handover", "stop-ack" + +- clocks: + Usage: required + Value type: + Definition: List of 8 phandle and clock specifier pairs for the adsp. + +- clock-names: + Usage: required + Value type: + Definition: List of clock input name strings sorted in the same + order as the clocks property. Definition must have + "xo", "sway_cbcr", "lpass_aon", "lpass_ahbs_aon_cbcr", + "lpass_ahbm_aon_cbcr", "qdsp6ss_xo", "qdsp6ss_sleep" + and "qdsp6ss_core". + +- power-domains: + Usage: required + Value type: + Definition: reference to cx power domain node. + +- resets: + Usage: required + Value type: + Definition: reference to the list of 2 reset-controller for the adsp. + +- reset-names: + Usage: required + Value type: + Definition: must be "pdc_sync" and "cc_lpass" + +- qcom,halt-regs: + Usage: required + Value type: + Definition: a phandle reference to a syscon representing TCSR followed + by the offset within syscon for lpass halt register. + +- memory-region: + Usage: required + Value type: + Definition: reference to the reserved-memory for the ADSP + +- qcom,smem-states: + Usage: required + Value type: + Definition: reference to the smem state for requesting the ADSP to + shut down + +- qcom,smem-state-names: + Usage: required + Value type: + Definition: must be "stop" + + += SUBNODES +The adsp node may have an subnode named "glink-edge" that describes the +communication edge, channels and devices related to the ADSP. +See ../soc/qcom/qcom,glink.txt for details on how to describe these. + += EXAMPLE +The following example describes the resources needed to boot control the +ADSP, as it is found on SDM845 boards. + + remoteproc@17300000 { + compatible = "qcom,sdm845-adsp-pil"; + reg = <0x17300000 0x40c>; + + interrupts-extended = <&intc GIC_SPI 162 IRQ_TYPE_EDGE_RISING>, + <&adsp_smp2p_in 0 IRQ_TYPE_EDGE_RISING>, + <&adsp_smp2p_in 1 IRQ_TYPE_EDGE_RISING>, + <&adsp_smp2p_in 2 IRQ_TYPE_EDGE_RISING>, + <&adsp_smp2p_in 3 IRQ_TYPE_EDGE_RISING>; + interrupt-names = "wdog", "fatal", "ready", + "handover", "stop-ack"; + + clocks = <&rpmhcc RPMH_CXO_CLK>, + <&gcc GCC_LPASS_SWAY_CLK>, + <&lpasscc LPASS_AUDIO_WRAPPER_AON_CLK>, + <&lpasscc LPASS_Q6SS_AHBS_AON_CLK>, + <&lpasscc LPASS_Q6SS_AHBM_AON_CLK>, + <&lpasscc LPASS_QDSP6SS_XO_CLK>, + <&lpasscc LPASS_QDSP6SS_SLEEP_CLK>, + <&lpasscc LPASS_QDSP6SS_CORE_CLK>; + clock-names = "xo", "sway_cbcr", "lpass_aon", + "lpass_ahbs_aon_cbcr", + "lpass_ahbm_aon_cbcr", "qdsp6ss_xo", + "qdsp6ss_sleep", "qdsp6ss_core"; + + power-domains = <&rpmhpd SDM845_CX>; + + resets = <&pdc_reset PDC_AUDIO_SYNC_RESET>, + <&aoss_reset AOSS_CC_LPASS_RESTART>; + reset-names = "pdc_sync", "cc_lpass"; + + qcom,halt-regs = <&tcsr_mutex_regs 0x22000>; + + memory-region = <&pil_adsp_mem>; + + qcom,smem-states = <&adsp_smp2p_out 0>; + qcom,smem-state-names = "stop"; + }; diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt index 728e4193f7a6..9c0cff3a5ed8 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt +++ b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt @@ -10,6 +10,11 @@ on the Qualcomm ADSP Hexagon core. "qcom,msm8974-adsp-pil" "qcom,msm8996-adsp-pil" "qcom,msm8996-slpi-pil" + "qcom,qcs404-adsp-pas" + "qcom,qcs404-cdsp-pas" + "qcom,qcs404-wcss-pas" + "qcom,sdm845-adsp-pas" + "qcom,sdm845-cdsp-pas" - interrupts-extended: Usage: required diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt index 601dd9f389aa..9ff5b0309417 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt +++ b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt @@ -53,13 +53,17 @@ on the Qualcomm Hexagon core. Definition: reference to the reset-controller for the modem sub-system reference to the list of 3 reset-controllers for the wcss sub-system + reference to the list of 2 reset-controllers for the modem + sub-system on SDM845 SoCs - reset-names: Usage: required Value type: Definition: must be "mss_restart" for the modem sub-system - Definition: must be "wcss_aon_reset", "wcss_reset", "wcss_q6_reset" - for the wcss syb-system + must be "wcss_aon_reset", "wcss_reset", "wcss_q6_reset" + for the wcss sub-system + must be "mss_restart", "pdc_reset" for the modem + sub-system on SDM845 SoCs - cx-supply: - mss-supply: diff --git a/Documentation/devicetree/bindings/reset/fsl,imx7-src.txt b/Documentation/devicetree/bindings/reset/fsl,imx7-src.txt index 5e1afc3d8480..1ab1d109318e 100644 --- a/Documentation/devicetree/bindings/reset/fsl,imx7-src.txt +++ b/Documentation/devicetree/bindings/reset/fsl,imx7-src.txt @@ -5,7 +5,7 @@ Please also refer to reset.txt in this directory for common reset controller binding usage. Required properties: -- compatible: Should be "fsl,imx7-src", "syscon" +- compatible: Should be "fsl,imx7d-src", "syscon" - reg: should be register base and length as documented in the datasheet - interrupts: Should contain SRC interrupt diff --git a/Documentation/devicetree/bindings/reset/qcom,pdc-global.txt b/Documentation/devicetree/bindings/reset/qcom,pdc-global.txt new file mode 100644 index 000000000000..a62a492843e7 --- /dev/null +++ b/Documentation/devicetree/bindings/reset/qcom,pdc-global.txt @@ -0,0 +1,52 @@ +PDC Global +====================================== + +This binding describes a reset-controller found on PDC-Global (Power Domain +Controller) block for Qualcomm Technologies Inc SDM845 SoCs. + +Required properties: +- compatible: + Usage: required + Value type: + Definition: must be: + "qcom,sdm845-pdc-global" + +- reg: + Usage: required + Value type: + Definition: must specify the base address and size of the register + space. + +- #reset-cells: + Usage: required + Value type: + Definition: must be 1; cell entry represents the reset index. + +Example: + +pdc_reset: reset-controller@b2e0000 { + compatible = "qcom,sdm845-pdc-global"; + reg = <0xb2e0000 0x20000>; + #reset-cells = <1>; +}; + +PDC reset clients +====================================== + +Device nodes that need access to reset lines should +specify them as a reset phandle in their corresponding node as +specified in reset.txt. + +For a list of all valid reset indices see + + +Example: + +modem-pil@4080000 { + ... + + resets = <&pdc_reset PDC_MODEM_SYNC_RESET>; + reset-names = "pdc_reset"; + + ... +}; diff --git a/Documentation/devicetree/bindings/reset/renesas,rst.txt b/Documentation/devicetree/bindings/reset/renesas,rst.txt index 67e83b02e10b..b03c48a1150e 100644 --- a/Documentation/devicetree/bindings/reset/renesas,rst.txt +++ b/Documentation/devicetree/bindings/reset/renesas,rst.txt @@ -16,8 +16,11 @@ Required properties: - "renesas,-rst" for R-Car Gen2 and Gen3, and RZ/G Examples with soctypes are: - "renesas,r8a7743-rst" (RZ/G1M) + - "renesas,r8a7744-rst" (RZ/G1N) - "renesas,r8a7745-rst" (RZ/G1E) - "renesas,r8a77470-rst" (RZ/G1C) + - "renesas,r8a774a1-rst" (RZ/G2M) + - "renesas,r8a774c0-rst" (RZ/G2E) - "renesas,r8a7778-reset-wdt" (R-Car M1A) - "renesas,r8a7779-reset-wdt" (R-Car H1) - "renesas,r8a7790-rst" (R-Car H2) diff --git a/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt b/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt index eaca9da79d83..e52e16c6bc57 100644 --- a/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt +++ b/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt @@ -14,6 +14,10 @@ Required properties: - "renesas,scifa-r8a7743" for R8A7743 (RZ/G1M) SCIFA compatible UART. - "renesas,scifb-r8a7743" for R8A7743 (RZ/G1M) SCIFB compatible UART. - "renesas,hscif-r8a7743" for R8A7743 (RZ/G1M) HSCIF compatible UART. + - "renesas,scif-r8a7744" for R8A7744 (RZ/G1N) SCIF compatible UART. + - "renesas,scifa-r8a7744" for R8A7744 (RZ/G1N) SCIFA compatible UART. + - "renesas,scifb-r8a7744" for R8A7744 (RZ/G1N) SCIFB compatible UART. + - "renesas,hscif-r8a7744" for R8A7744 (RZ/G1N) HSCIF compatible UART. - "renesas,scif-r8a7745" for R8A7745 (RZ/G1E) SCIF compatible UART. - "renesas,scifa-r8a7745" for R8A7745 (RZ/G1E) SCIFA compatible UART. - "renesas,scifb-r8a7745" for R8A7745 (RZ/G1E) SCIFB compatible UART. @@ -50,6 +54,8 @@ Required properties: - "renesas,hscif-r8a77970" for R8A77970 (R-Car V3M) HSCIF compatible UART. - "renesas,scif-r8a77980" for R8A77980 (R-Car V3H) SCIF compatible UART. - "renesas,hscif-r8a77980" for R8A77980 (R-Car V3H) HSCIF compatible UART. + - "renesas,scif-r8a77990" for R8A77990 (R-Car E3) SCIF compatible UART. + - "renesas,hscif-r8a77990" for R8A77990 (R-Car E3) HSCIF compatible UART. - "renesas,scif-r8a77995" for R8A77995 (R-Car D3) SCIF compatible UART. - "renesas,hscif-r8a77995" for R8A77995 (R-Car D3) HSCIF compatible UART. - "renesas,scifa-sh73a0" for SH73A0 (SH-Mobile AG5) SCIFA compatible UART. diff --git a/Documentation/devicetree/bindings/serial/uniphier-uart.txt b/Documentation/devicetree/bindings/serial/uniphier-uart.txt index 0b3892a7a528..7a1bf02bb869 100644 --- a/Documentation/devicetree/bindings/serial/uniphier-uart.txt +++ b/Documentation/devicetree/bindings/serial/uniphier-uart.txt @@ -7,7 +7,7 @@ Required properties: - clocks: phandle to the input clock. Optional properties: -- fifo-size: the RX/TX FIFO size. Defaults to 64 if not specified. +-auto-flow-control: enable automatic flow control support. Example: aliases { @@ -19,5 +19,4 @@ Example: reg = <0x54006800 0x40>; interrupts = <0 33 4>; clocks = <&uart_clk>; - fifo-size = <64>; }; diff --git a/Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt b/Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt new file mode 100644 index 000000000000..436d2106e80d --- /dev/null +++ b/Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt @@ -0,0 +1,29 @@ +Amlogic Canvas +================================ + +A canvas is a collection of metadata that describes a pixel buffer. +Those metadata include: width, height, phyaddr, wrapping, block mode +and endianness. + +Many IPs within Amlogic SoCs rely on canvas indexes to read/write pixel data +rather than use the phy addresses directly. For instance, this is the case for +the video decoders and the display. + +Amlogic SoCs have 256 canvas. + +Device Tree Bindings: +--------------------- + +Video Lookup Table +-------------------------- + +Required properties: +- compatible: "amlogic,canvas" +- reg: Base physical address and size of the canvas registers. + +Example: + +canvas: video-lut@48 { + compatible = "amlogic,canvas"; + reg = <0x0 0x48 0x0 0x14>; +}; diff --git a/Documentation/devicetree/bindings/soc/fsl/cpm_qe/network.txt b/Documentation/devicetree/bindings/soc/fsl/cpm_qe/network.txt index 03c741602c6d..6d2dd8a31482 100644 --- a/Documentation/devicetree/bindings/soc/fsl/cpm_qe/network.txt +++ b/Documentation/devicetree/bindings/soc/fsl/cpm_qe/network.txt @@ -98,6 +98,12 @@ The property below is dependent on fsl,tdm-interface: usage: optional for tdm interface value type: Definition : Internal loopback connecting on TDM layer. +- fsl,hmask + usage: optional + Value type: + Definition: HDLC address recognition. Set to zero to disable + address filtering of packets: + fsl,hmask = /bits/ 16 <0x0000>; Example for tdm interface: diff --git a/Documentation/devicetree/bindings/soc/mediatek/pwrap.txt b/Documentation/devicetree/bindings/soc/mediatek/pwrap.txt index f9987c30f0d5..5a2ef1726e2a 100644 --- a/Documentation/devicetree/bindings/soc/mediatek/pwrap.txt +++ b/Documentation/devicetree/bindings/soc/mediatek/pwrap.txt @@ -19,10 +19,12 @@ IP Pairing Required properties in pwrap device node. - compatible: "mediatek,mt2701-pwrap" for MT2701/7623 SoCs + "mediatek,mt6765-pwrap" for MT6765 SoCs "mediatek,mt6797-pwrap" for MT6797 SoCs "mediatek,mt7622-pwrap" for MT7622 SoCs "mediatek,mt8135-pwrap" for MT8135 SoCs "mediatek,mt8173-pwrap" for MT8173 SoCs + "mediatek,mt8183-pwrap" for MT8183 SoCs - interrupts: IRQ for pwrap in SOC - reg-names: Must include the following entries: "pwrap": Main registers base diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.txt b/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.txt index ff92e5a41bed..dab7ca9f250c 100644 --- a/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.txt +++ b/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.txt @@ -53,20 +53,8 @@ Required properties: - clocks: Serial engine core clock needed by the device. Qualcomm Technologies Inc. GENI Serial Engine based SPI Controller - -Required properties: -- compatible: Must contain "qcom,geni-spi". -- reg: Must contain SPI register location and length. -- interrupts: Must contain SPI controller interrupts. -- clock-names: Must contain "se". -- clocks: Serial engine core clock needed by the device. -- spi-max-frequency: Specifies maximum SPI clock frequency, units - Hz. -- #address-cells: Must be <1> to define a chip select address on - the SPI bus. -- #size-cells: Must be <0>. - -SPI slave nodes must be children of the SPI master node and conform to SPI bus -binding as described in Documentation/devicetree/bindings/spi/spi-bus.txt. +node binding is described in +Documentation/devicetree/bindings/spi/qcom,spi-geni-qcom.txt. Example: geniqup@8c0000 { @@ -103,17 +91,4 @@ Example: pinctrl-1 = <&qup_1_uart_3_sleep>; }; - spi0: spi@a84000 { - compatible = "qcom,geni-spi"; - reg = <0xa84000 0x4000>; - interrupts = ; - clock-names = "se"; - clocks = <&clock_gcc GCC_QUPV3_WRAP0_S0_CLK>; - pinctrl-names = "default", "sleep"; - pinctrl-0 = <&qup_1_spi_2_active>; - pinctrl-1 = <&qup_1_spi_2_sleep>; - spi-max-frequency = <19200000>; - #address-cells = <1>; - #size-cells = <0>; - }; } diff --git a/Documentation/devicetree/bindings/soc/rockchip/grf.txt b/Documentation/devicetree/bindings/soc/rockchip/grf.txt index 7dc5ce858a0e..46e27cd69f18 100644 --- a/Documentation/devicetree/bindings/soc/rockchip/grf.txt +++ b/Documentation/devicetree/bindings/soc/rockchip/grf.txt @@ -13,6 +13,7 @@ On RK3328 SoCs, the GRF adds a section for USB2PHYGRF, Required Properties: - compatible: GRF should be one of the following: + - "rockchip,px30-grf", "syscon": for px30 - "rockchip,rk3036-grf", "syscon": for rk3036 - "rockchip,rk3066-grf", "syscon": for rk3066 - "rockchip,rk3188-grf", "syscon": for rk3188 @@ -23,6 +24,7 @@ Required Properties: - "rockchip,rk3399-grf", "syscon": for rk3399 - "rockchip,rv1108-grf", "syscon": for rv1108 - compatible: PMUGRF should be one of the following: + - "rockchip,px30-pmugrf", "syscon": for px30 - "rockchip,rk3368-pmugrf", "syscon": for rk3368 - "rockchip,rk3399-pmugrf", "syscon": for rk3399 - compatible: SGRF should be one of the following diff --git a/Documentation/devicetree/bindings/sound/adi,adau1977.txt b/Documentation/devicetree/bindings/sound/adi,adau1977.txt new file mode 100644 index 000000000000..e79aeef73f28 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/adi,adau1977.txt @@ -0,0 +1,54 @@ +Analog Devices ADAU1977/ADAU1978/ADAU1979 + +Datasheets: +http://www.analog.com/media/en/technical-documentation/data-sheets/ADAU1977.pdf +http://www.analog.com/media/en/technical-documentation/data-sheets/ADAU1978.pdf +http://www.analog.com/media/en/technical-documentation/data-sheets/ADAU1979.pdf + +This driver supports both the I2C and SPI bus. + +Required properties: + - compatible: Should contain one of the following: + "adi,adau1977" + "adi,adau1978" + "adi,adau1979" + + - AVDD-supply: analog power supply for the device, please consult + Documentation/devicetree/bindings/regulator/regulator.txt + +Optional properties: + - reset-gpio: the reset pin for the chip, for more details consult + Documentation/devicetree/bindings/gpio/gpio.txt + + - DVDD-supply: supply voltage for the digital core, please consult + Documentation/devicetree/bindings/regulator/regulator.txt + +For required properties on SPI, please consult +Documentation/devicetree/bindings/spi/spi-bus.txt + +Required properties on I2C: + + - reg: The i2c address. Value depends on the state of ADDR0 + and ADDR1, as wired in hardware. + +Examples: + + adau1977_spi: adau1977@0 { + compatible = "adi,adau1977"; + spi-max-frequency = <600000>; + + AVDD-supply = <®ulator>; + DVDD-supply = <®ulator_digital>; + + reset_gpio = <&gpio 10 GPIO_ACTIVE_LOW>; + }; + + adau1977_i2c: adau1977@11 { + compatible = "adi,adau1977"; + reg = <0x11>; + + AVDD-supply = <®ulator>; + DVDD-supply = <®ulator_digital>; + + reset_gpio = <&gpio 10 GPIO_ACTIVE_LOW>; + }; diff --git a/Documentation/devicetree/bindings/sound/amlogic,axg-pdm.txt b/Documentation/devicetree/bindings/sound/amlogic,axg-pdm.txt new file mode 100644 index 000000000000..5672d0bc5b16 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/amlogic,axg-pdm.txt @@ -0,0 +1,24 @@ +* Amlogic Audio PDM input + +Required properties: +- compatible: 'amlogic,axg-pdm' +- reg: physical base address of the controller and length of memory + mapped region. +- clocks: list of clock phandle, one for each entry clock-names. +- clock-names: should contain the following: + * "pclk" : peripheral clock. + * "dclk" : pdm digital clock + * "sysclk" : dsp system clock +- #sound-dai-cells: must be 0. + +Example of PDM on the A113 SoC: + +pdm: audio-controller@ff632000 { + compatible = "amlogic,axg-pdm"; + reg = <0x0 0xff632000 0x0 0x34>; + #sound-dai-cells = <0>; + clocks = <&clkc_audio AUD_CLKID_PDM>, + <&clkc_audio AUD_CLKID_PDM_DCLK>, + <&clkc_audio AUD_CLKID_PDM_SYSCLK>; + clock-names = "pclk", "dclk", "sysclk"; +}; diff --git a/Documentation/devicetree/bindings/sound/cs42l51.txt b/Documentation/devicetree/bindings/sound/cs42l51.txt new file mode 100644 index 000000000000..4b5de33ce377 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/cs42l51.txt @@ -0,0 +1,17 @@ +CS42L51 audio CODEC + +Optional properties: + + - clocks : a list of phandles + clock-specifiers, one for each entry in + clock-names + + - clock-names : must contain "MCLK" + +Example: + +cs42l51: cs42l51@4a { + compatible = "cirrus,cs42l51"; + reg = <0x4a>; + clocks = <&mclk_prov>; + clock-names = "MCLK"; +}; diff --git a/Documentation/devicetree/bindings/sound/maxim,max98088.txt b/Documentation/devicetree/bindings/sound/maxim,max98088.txt new file mode 100644 index 000000000000..da764d913319 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/maxim,max98088.txt @@ -0,0 +1,23 @@ +MAX98088 audio CODEC + +This device supports I2C only. + +Required properties: + +- compatible: "maxim,max98088" or "maxim,max98089". +- reg: The I2C address of the device. + +Optional properties: + +- clocks: the clock provider of MCLK, see ../clock/clock-bindings.txt section + "consumer" for more information. +- clock-names: must be set to "mclk" + +Example: + +max98089: codec@10 { + compatible = "maxim,max98089"; + reg = <0x10>; + clocks = <&clks IMX6QDL_CLK_CKO2>; + clock-names = "mclk"; +}; diff --git a/Documentation/devicetree/bindings/sound/mikroe,mikroe-proto.txt b/Documentation/devicetree/bindings/sound/mikroe,mikroe-proto.txt new file mode 100644 index 000000000000..912f8fae11c5 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/mikroe,mikroe-proto.txt @@ -0,0 +1,23 @@ +Mikroe-PROTO audio board + +Required properties: + - compatible: "mikroe,mikroe-proto" + - dai-format: Must be "i2s". + - i2s-controller: The phandle of the I2S controller. + - audio-codec: The phandle of the WM8731 audio codec. +Optional properties: + - model: The user-visible name of this sound complex. + - bitclock-master: Indicates dai-link bit clock master; for details see simple-card.txt (1). + - frame-master: Indicates dai-link frame master; for details see simple-card.txt (1). + +(1) : There must be the same master for both bit and frame clocks. + +Example: + sound { + compatible = "mikroe,mikroe-proto"; + model = "wm8731 @ sama5d2_xplained"; + i2s-controller = <&i2s0>; + audio-codec = <&wm8731>; + dai-format = "i2s"; + }; +}; diff --git a/Documentation/devicetree/bindings/sound/nau8822.txt b/Documentation/devicetree/bindings/sound/nau8822.txt new file mode 100644 index 000000000000..a471d162d4e5 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/nau8822.txt @@ -0,0 +1,16 @@ +NAU8822 audio CODEC + +This device supports I2C only. + +Required properties: + + - compatible : "nuvoton,nau8822" + + - reg : the I2C address of the device. + +Example: + +codec: nau8822@1a { + compatible = "nuvoton,nau8822"; + reg = <0x1a>; +}; diff --git a/Documentation/devicetree/bindings/sound/pcm3060.txt b/Documentation/devicetree/bindings/sound/pcm3060.txt new file mode 100644 index 000000000000..90fcb8523099 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/pcm3060.txt @@ -0,0 +1,17 @@ +PCM3060 audio CODEC + +This driver supports both I2C and SPI. + +Required properties: + +- compatible: "ti,pcm3060" + +- reg : the I2C address of the device for I2C, the chip select + number for SPI. + +Examples: + + pcm3060: pcm3060@46 { + compatible = "ti,pcm3060"; + reg = <0x46>; + }; diff --git a/Documentation/devicetree/bindings/sound/qcom,q6afe.txt b/Documentation/devicetree/bindings/sound/qcom,q6afe.txt index a8179409c194..d74888b9f1bb 100644 --- a/Documentation/devicetree/bindings/sound/qcom,q6afe.txt +++ b/Documentation/devicetree/bindings/sound/qcom,q6afe.txt @@ -49,7 +49,7 @@ configuration of each dai. Must contain the following properties. Usage: required for mi2s interface Value type: Definition: Must be list of serial data lines used by this dai. - should be one or more of the 1-4 sd lines. + should be one or more of the 0-3 sd lines. - qcom,tdm-sync-mode: Usage: required for tdm interface @@ -137,42 +137,42 @@ q6afe@4 { prim-mi2s-rx@16 { reg = <16>; - qcom,sd-lines = <1 3>; + qcom,sd-lines = <0 2>; }; prim-mi2s-tx@17 { reg = <17>; - qcom,sd-lines = <2>; + qcom,sd-lines = <1>; }; sec-mi2s-rx@18 { reg = <18>; - qcom,sd-lines = <1 4>; + qcom,sd-lines = <0 3>; }; sec-mi2s-tx@19 { reg = <19>; - qcom,sd-lines = <2>; + qcom,sd-lines = <1>; }; tert-mi2s-rx@20 { reg = <20>; - qcom,sd-lines = <2 4>; + qcom,sd-lines = <1 3>; }; tert-mi2s-tx@21 { reg = <21>; - qcom,sd-lines = <1>; + qcom,sd-lines = <0>; }; quat-mi2s-rx@22 { reg = <22>; - qcom,sd-lines = <1>; + qcom,sd-lines = <0>; }; quat-mi2s-tx@23 { reg = <23>; - qcom,sd-lines = <2>; + qcom,sd-lines = <1>; }; }; }; diff --git a/Documentation/devicetree/bindings/sound/renesas,rsnd.txt b/Documentation/devicetree/bindings/sound/renesas,rsnd.txt index 9e764270c36b..d92b705e7917 100644 --- a/Documentation/devicetree/bindings/sound/renesas,rsnd.txt +++ b/Documentation/devicetree/bindings/sound/renesas,rsnd.txt @@ -340,10 +340,12 @@ Required properties: - compatible : "renesas,rcar_sound-", fallbacks "renesas,rcar_sound-gen1" if generation1, and "renesas,rcar_sound-gen2" if generation2 (or RZ/G1) - "renesas,rcar_sound-gen3" if generation3 + "renesas,rcar_sound-gen3" if generation3 (or RZ/G2) Examples with soctypes are: - "renesas,rcar_sound-r8a7743" (RZ/G1M) + - "renesas,rcar_sound-r8a7744" (RZ/G1N) - "renesas,rcar_sound-r8a7745" (RZ/G1E) + - "renesas,rcar_sound-r8a774a1" (RZ/G2M) - "renesas,rcar_sound-r8a7778" (R-Car M1A) - "renesas,rcar_sound-r8a7779" (R-Car H1) - "renesas,rcar_sound-r8a7790" (R-Car H2) @@ -353,6 +355,7 @@ Required properties: - "renesas,rcar_sound-r8a7795" (R-Car H3) - "renesas,rcar_sound-r8a7796" (R-Car M3-W) - "renesas,rcar_sound-r8a77965" (R-Car M3-N) + - "renesas,rcar_sound-r8a77990" (R-Car E3) - reg : Should contain the register physical address. required register is SRU/ADG/SSI if generation1 diff --git a/Documentation/devicetree/bindings/sound/st,sta32x.txt b/Documentation/devicetree/bindings/sound/st,sta32x.txt index 255de3ae5b2f..52265fb757c5 100644 --- a/Documentation/devicetree/bindings/sound/st,sta32x.txt +++ b/Documentation/devicetree/bindings/sound/st,sta32x.txt @@ -19,6 +19,10 @@ Required properties: Optional properties: + - clocks, clock-names: Clock specifier for XTI input clock. + If specified, the clock will be enabled when the codec is probed, + and disabled when it is removed. The 'clock-names' must be set to 'xti'. + - st,output-conf: number, Selects the output configuration: 0: 2-channel (full-bridge) power, 2-channel data-out 1: 2 (half-bridge). 1 (full-bridge) on-board power @@ -39,6 +43,9 @@ Optional properties: - st,thermal-warning-recover: If present, thermal warning recovery is enabled. + - st,fault-detect-recovery: + If present, fault detect recovery is enabled. + - st,thermal-warning-adjustment: If present, thermal warning adjustment is enabled. @@ -76,6 +83,8 @@ Example: codec: sta32x@38 { compatible = "st,sta32x"; reg = <0x1c>; + clocks = <&clock>; + clock-names = "xti"; reset-gpios = <&gpio1 19 0>; power-down-gpios = <&gpio1 16 0>; st,output-conf = /bits/ 8 <0x3>; // set output to 2-channel diff --git a/Documentation/devicetree/bindings/sound/st,stm32-sai.txt b/Documentation/devicetree/bindings/sound/st,stm32-sai.txt index 3a3fc506e43a..3f4467ff0aa2 100644 --- a/Documentation/devicetree/bindings/sound/st,stm32-sai.txt +++ b/Documentation/devicetree/bindings/sound/st,stm32-sai.txt @@ -31,7 +31,11 @@ SAI subnodes required properties: - reg: Base address and size of SAI sub-block register set. - clocks: Must contain one phandle and clock specifier pair for sai_ck which feeds the internal clock generator. + If the SAI shares a master clock, with another SAI set as MCLK + clock provider, SAI provider phandle must be specified here. - clock-names: Must contain "sai_ck". + Must also contain "MCLK", if SAI shares a master clock, + with a SAI set as MCLK clock provider. - dmas: see Documentation/devicetree/bindings/dma/stm32-dma.txt - dma-names: identifier string for each DMA request line "tx": if sai sub-block is configured as playback DAI @@ -51,6 +55,9 @@ SAI subnodes Optional properties: configured according to protocol defined in related DAI link node, such as i2s, left justified, right justified, dsp and pdm protocols. Note: ac97 protocol is not supported by SAI driver + - #clock-cells: should be 0. This property must be present if the SAI device + is a master clock provider, according to clocks bindings, described in + Documentation/devicetree/bindings/clock/clock-bindings.txt. The device node should contain one 'port' child node with one child 'endpoint' node, according to the bindings defined in Documentation/devicetree/bindings/ diff --git a/Documentation/devicetree/bindings/sound/sun4i-i2s.txt b/Documentation/devicetree/bindings/sound/sun4i-i2s.txt index b9d50d6cdef3..61e71c1729e0 100644 --- a/Documentation/devicetree/bindings/sound/sun4i-i2s.txt +++ b/Documentation/devicetree/bindings/sound/sun4i-i2s.txt @@ -10,6 +10,7 @@ Required properties: - "allwinner,sun6i-a31-i2s" - "allwinner,sun8i-a83t-i2s" - "allwinner,sun8i-h3-i2s" + - "allwinner,sun50i-a64-codec-i2s" - reg: physical base address of the controller and length of memory mapped region. - interrupts: should contain the I2S interrupt. @@ -26,6 +27,7 @@ Required properties for the following compatibles: - "allwinner,sun6i-a31-i2s" - "allwinner,sun8i-a83t-i2s" - "allwinner,sun8i-h3-i2s" + - "allwinner,sun50i-a64-codec-i2s" - resets: phandle to the reset line for this codec Example: diff --git a/Documentation/devicetree/bindings/sound/sun50i-codec-analog.txt b/Documentation/devicetree/bindings/sound/sun50i-codec-analog.txt new file mode 100644 index 000000000000..4f8ad0e04d20 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/sun50i-codec-analog.txt @@ -0,0 +1,12 @@ +* Allwinner A64 Codec Analog Controls + +Required properties: +- compatible: must be one of the following compatibles: + - "allwinner,sun50i-a64-codec-analog" +- reg: must contain the registers location and length + +Example: + codec_analog: codec-analog@1f015c0 { + compatible = "allwinner,sun50i-a64-codec-analog"; + reg = <0x01f015c0 0x4>; + }; diff --git a/Documentation/devicetree/bindings/sound/ts3a227e.txt b/Documentation/devicetree/bindings/sound/ts3a227e.txt index 3ed8359144d3..21ab45bc7e8f 100644 --- a/Documentation/devicetree/bindings/sound/ts3a227e.txt +++ b/Documentation/devicetree/bindings/sound/ts3a227e.txt @@ -14,7 +14,7 @@ Required properties: Optional properies: - ti,micbias: Intended MICBIAS voltage (datasheet section 9.6.7). - Select 0/1/2/3/4/5/6/7 to specify MACBIAS voltage + Select 0/1/2/3/4/5/6/7 to specify MICBIAS voltage 2.1V/2.2V/2.3V/2.4V/2.5V/2.6V/2.7V/2.8V Default value is "1" (2.2V). diff --git a/Documentation/devicetree/bindings/sound/wm8782.txt b/Documentation/devicetree/bindings/sound/wm8782.txt new file mode 100644 index 000000000000..256cdec6ec4d --- /dev/null +++ b/Documentation/devicetree/bindings/sound/wm8782.txt @@ -0,0 +1,17 @@ +WM8782 stereo ADC + +This device does not have any control interface or reset pins. + +Required properties: + + - compatible : "wlf,wm8782" + - Vdda-supply : phandle to a regulator for the analog power supply (2.7V - 5.5V) + - Vdd-supply : phandle to a regulator for the digital power supply (2.7V - 3.6V) + +Example: + +wm8782: stereo-adc { + compatible = "wlf,wm8782"; + Vdda-supply = <&vdda_supply>; + Vdd-supply = <&vdd_supply>; +}; diff --git a/Documentation/devicetree/bindings/spi/qcom,spi-geni-qcom.txt b/Documentation/devicetree/bindings/spi/qcom,spi-geni-qcom.txt new file mode 100644 index 000000000000..790311a42bf1 --- /dev/null +++ b/Documentation/devicetree/bindings/spi/qcom,spi-geni-qcom.txt @@ -0,0 +1,39 @@ +GENI based Qualcomm Universal Peripheral (QUP) Serial Peripheral Interface (SPI) + +The QUP v3 core is a GENI based AHB slave that provides a common data path +(an output FIFO and an input FIFO) for serial peripheral interface (SPI) +mini-core. + +SPI in master mode supports up to 50MHz, up to four chip selects, programmable +data path from 4 bits to 32 bits and numerous protocol variants. + +Required properties: +- compatible: Must contain "qcom,geni-spi". +- reg: Must contain SPI register location and length. +- interrupts: Must contain SPI controller interrupts. +- clock-names: Must contain "se". +- clocks: Serial engine core clock needed by the device. +- #address-cells: Must be <1> to define a chip select address on + the SPI bus. +- #size-cells: Must be <0>. + +SPI Controller nodes must be child of GENI based Qualcomm Universal +Peripharal. Please refer GENI based QUP wrapper controller node bindings +described in Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.txt. + +SPI slave nodes must be children of the SPI master node and conform to SPI bus +binding as described in Documentation/devicetree/bindings/spi/spi-bus.txt. + +Example: + spi0: spi@a84000 { + compatible = "qcom,geni-spi"; + reg = <0xa84000 0x4000>; + interrupts = ; + clock-names = "se"; + clocks = <&clock_gcc GCC_QUPV3_WRAP0_S0_CLK>; + pinctrl-names = "default", "sleep"; + pinctrl-0 = <&qup_1_spi_2_active>; + pinctrl-1 = <&qup_1_spi_2_sleep>; + #address-cells = <1>; + #size-cells = <0>; + }; diff --git a/Documentation/devicetree/bindings/spi/qcom,spi-qcom-qspi.txt b/Documentation/devicetree/bindings/spi/qcom,spi-qcom-qspi.txt new file mode 100644 index 000000000000..1d64b61f5171 --- /dev/null +++ b/Documentation/devicetree/bindings/spi/qcom,spi-qcom-qspi.txt @@ -0,0 +1,36 @@ +Qualcomm Quad Serial Peripheral Interface (QSPI) + +The QSPI controller allows SPI protocol communication in single, dual, or quad +wire transmission modes for read/write access to slaves such as NOR flash. + +Required properties: +- compatible: An SoC specific identifier followed by "qcom,qspi-v1", such as + "qcom,sdm845-qspi", "qcom,qspi-v1" +- reg: Should contain the base register location and length. +- interrupts: Interrupt number used by the controller. +- clocks: Should contain the core and AHB clock. +- clock-names: Should be "core" for core clock and "iface" for AHB clock. + +SPI slave nodes must be children of the SPI master node and can contain +properties described in Documentation/devicetree/bindings/spi/spi-bus.txt + +Example: + + qspi: spi@88df000 { + compatible = "qcom,sdm845-qspi", "qcom,qspi-v1"; + reg = <0x88df000 0x600>; + #address-cells = <1>; + #size-cells = <0>; + interrupts = ; + clock-names = "iface", "core"; + clocks = <&gcc GCC_QSPI_CNOC_PERIPH_AHB_CLK>, + <&gcc GCC_QSPI_CORE_CLK>; + + flash@0 { + compatible = "jedec,spi-nor"; + reg = <0>; + spi-max-frequency = <25000000>; + spi-tx-bus-width = <2>; + spi-rx-bus-width = <2>; + }; + }; diff --git a/Documentation/devicetree/bindings/spi/sh-msiof.txt b/Documentation/devicetree/bindings/spi/sh-msiof.txt index bfbc2035fb6b..4b836ad17b19 100644 --- a/Documentation/devicetree/bindings/spi/sh-msiof.txt +++ b/Documentation/devicetree/bindings/spi/sh-msiof.txt @@ -2,7 +2,9 @@ Renesas MSIOF spi controller Required properties: - compatible : "renesas,msiof-r8a7743" (RZ/G1M) + "renesas,msiof-r8a7744" (RZ/G1N) "renesas,msiof-r8a7745" (RZ/G1E) + "renesas,msiof-r8a774a1" (RZ/G2M) "renesas,msiof-r8a7790" (R-Car H2) "renesas,msiof-r8a7791" (R-Car M2-W) "renesas,msiof-r8a7792" (R-Car V2H) @@ -11,10 +13,14 @@ Required properties: "renesas,msiof-r8a7795" (R-Car H3) "renesas,msiof-r8a7796" (R-Car M3-W) "renesas,msiof-r8a77965" (R-Car M3-N) + "renesas,msiof-r8a77970" (R-Car V3M) + "renesas,msiof-r8a77980" (R-Car V3H) + "renesas,msiof-r8a77990" (R-Car E3) + "renesas,msiof-r8a77995" (R-Car D3) "renesas,msiof-sh73a0" (SH-Mobile AG5) "renesas,sh-mobile-msiof" (generic SH-Mobile compatibile device) "renesas,rcar-gen2-msiof" (generic R-Car Gen2 and RZ/G1 compatible device) - "renesas,rcar-gen3-msiof" (generic R-Car Gen3 compatible device) + "renesas,rcar-gen3-msiof" (generic R-Car Gen3 and RZ/G2 compatible device) "renesas,sh-msiof" (deprecated) When compatible with the generic version, nodes diff --git a/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.txt b/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.txt index 642d3fb1ef85..2864bc6b659c 100644 --- a/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.txt +++ b/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.txt @@ -2,7 +2,7 @@ Synopsys DesignWare AMBA 2.0 Synchronous Serial Interface. Required properties: - compatible : "snps,dw-apb-ssi" or "mscc,-spi", where soc is "ocelot" or - "jaguar2" + "jaguar2", or "amazon,alpine-dw-apb-ssi" - reg : The register base for the controller. For "mscc,-spi", a second register set is required (named ICPU_CFG:SPI_MST) - interrupts : One interrupt, used by the controller. diff --git a/Documentation/devicetree/bindings/spi/spi-fsl-lpspi.txt b/Documentation/devicetree/bindings/spi/spi-fsl-lpspi.txt index 4af132606b37..8d178a4503cf 100644 --- a/Documentation/devicetree/bindings/spi/spi-fsl-lpspi.txt +++ b/Documentation/devicetree/bindings/spi/spi-fsl-lpspi.txt @@ -3,6 +3,7 @@ Required properties: - compatible : - "fsl,imx7ulp-spi" for LPSPI compatible with the one integrated on i.MX7ULP soc + - "fsl,imx8qxp-spi" for LPSPI compatible with the one integrated on i.MX8QXP soc - reg : address and length of the lpspi master registers - interrupts : lpspi interrupt - clocks : lpspi clock specifier diff --git a/Documentation/devicetree/bindings/spi/spi-pxa2xx.txt b/Documentation/devicetree/bindings/spi/spi-pxa2xx.txt new file mode 100644 index 000000000000..0335a9bd2e8a --- /dev/null +++ b/Documentation/devicetree/bindings/spi/spi-pxa2xx.txt @@ -0,0 +1,24 @@ +PXA2xx SSP SPI Controller + +Required properties: +- compatible: Must be "marvell,mmp2-ssp". +- reg: Offset and length of the device's register set. +- interrupts: Should be the interrupt number. +- clocks: Should contain a single entry describing the clock input. +- #address-cells: Number of cells required to define a chip select address. +- #size-cells: Should be zero. + +Optional properties: +- cs-gpios: list of GPIO chip selects. See the SPI bus bindings, + Documentation/devicetree/bindings/spi/spi-bus.txt + +Child nodes represent devices on the SPI bus + See ../spi/spi-bus.txt + +Example: + ssp1: spi@d4035000 { + compatible = "marvell,mmp2-ssp"; + reg = <0xd4035000 0x1000>; + clocks = <&soc_clocks MMP2_CLK_SSP0>; + interrupts = <0>; + }; diff --git a/Documentation/devicetree/bindings/spi/spi-rspi.txt b/Documentation/devicetree/bindings/spi/spi-rspi.txt index 96fd58548f69..fc97ad64fbf2 100644 --- a/Documentation/devicetree/bindings/spi/spi-rspi.txt +++ b/Documentation/devicetree/bindings/spi/spi-rspi.txt @@ -3,7 +3,7 @@ Device tree configuration for Renesas RSPI/QSPI driver Required properties: - compatible : For Renesas Serial Peripheral Interface on legacy SH: "renesas,rspi-", "renesas,rspi" as fallback. - For Renesas Serial Peripheral Interface on RZ/A1H: + For Renesas Serial Peripheral Interface on RZ/A: "renesas,rspi-", "renesas,rspi-rz" as fallback. For Quad Serial Peripheral Interface on R-Car Gen2 and RZ/G1 devices: @@ -11,7 +11,9 @@ Required properties: Examples with soctypes are: - "renesas,rspi-sh7757" (SH) - "renesas,rspi-r7s72100" (RZ/A1H) + - "renesas,rspi-r7s9210" (RZ/A2) - "renesas,qspi-r8a7743" (RZ/G1M) + - "renesas,qspi-r8a7744" (RZ/G1N) - "renesas,qspi-r8a7745" (RZ/G1E) - "renesas,qspi-r8a7790" (R-Car H2) - "renesas,qspi-r8a7791" (R-Car M2-W) diff --git a/Documentation/devicetree/bindings/spi/spi-slave-mt27xx.txt b/Documentation/devicetree/bindings/spi/spi-slave-mt27xx.txt new file mode 100644 index 000000000000..c37e5a179b21 --- /dev/null +++ b/Documentation/devicetree/bindings/spi/spi-slave-mt27xx.txt @@ -0,0 +1,32 @@ +Binding for MTK SPI Slave controller + +Required properties: +- compatible: should be one of the following. + - mediatek,mt2712-spi-slave: for mt2712 platforms +- reg: Address and length of the register set for the device. +- interrupts: Should contain spi interrupt. +- clocks: phandles to input clocks. + It's clock gate, and should be <&infracfg CLK_INFRA_AO_SPI1>. +- clock-names: should be "spi" for the clock gate. + +Optional properties: +- assigned-clocks: it's mux clock, should be <&topckgen CLK_TOP_SPISLV_SEL>. +- assigned-clock-parents: parent of mux clock. + It's PLL, and should be one of the following. + - <&topckgen CLK_TOP_UNIVPLL1_D2>: specify parent clock 312MHZ. + It's the default one. + - <&topckgen CLK_TOP_UNIVPLL1_D4>: specify parent clock 156MHZ. + - <&topckgen CLK_TOP_UNIVPLL2_D4>: specify parent clock 104MHZ. + - <&topckgen CLK_TOP_UNIVPLL1_D8>: specify parent clock 78MHZ. + +Example: +- SoC Specific Portion: +spis1: spi@10013000 { + compatible = "mediatek,mt2712-spi-slave"; + reg = <0 0x10013000 0 0x100>; + interrupts = ; + clocks = <&infracfg CLK_INFRA_AO_SPI1>; + clock-names = "spi"; + assigned-clocks = <&topckgen CLK_TOP_SPISLV_SEL>; + assigned-clock-parents = <&topckgen CLK_TOP_UNIVPLL1_D2>; +}; diff --git a/Documentation/devicetree/bindings/spi/spi-sprd.txt b/Documentation/devicetree/bindings/spi/spi-sprd.txt new file mode 100644 index 000000000000..bad211a19da4 --- /dev/null +++ b/Documentation/devicetree/bindings/spi/spi-sprd.txt @@ -0,0 +1,26 @@ +Spreadtrum SPI Controller + +Required properties: +- compatible: Should be "sprd,sc9860-spi". +- reg: Offset and length of SPI controller register space. +- interrupts: Should contain SPI interrupt. +- clock-names: Should contain following entries: + "spi" for SPI clock, + "source" for SPI source (parent) clock, + "enable" for SPI module enable clock. +- clocks: List of clock input name strings sorted in the same order + as the clock-names property. +- #address-cells: The number of cells required to define a chip select + address on the SPI bus. Should be set to 1. +- #size-cells: Should be set to 0. + +Example: +spi0: spi@70a00000{ + compatible = "sprd,sc9860-spi"; + reg = <0 0x70a00000 0 0x1000>; + interrupts = ; + clock-names = "spi", "source","enable"; + clocks = <&clk_spi0>, <&ext_26m>, <&clk_ap_apb_gates 5>; + #address-cells = <1>; + #size-cells = <0>; +}; diff --git a/Documentation/devicetree/bindings/spi/spi-stm32-qspi.txt b/Documentation/devicetree/bindings/spi/spi-stm32-qspi.txt new file mode 100644 index 000000000000..adeeb63e84b9 --- /dev/null +++ b/Documentation/devicetree/bindings/spi/spi-stm32-qspi.txt @@ -0,0 +1,44 @@ +* STMicroelectronics Quad Serial Peripheral Interface(QSPI) + +Required properties: +- compatible: should be "st,stm32f469-qspi" +- reg: the first contains the register location and length. + the second contains the memory mapping address and length +- reg-names: should contain the reg names "qspi" "qspi_mm" +- interrupts: should contain the interrupt for the device +- clocks: the phandle of the clock needed by the QSPI controller +- A pinctrl must be defined to set pins in mode of operation for QSPI transfer + +Optional properties: +- resets: must contain the phandle to the reset controller. + +A spi flash (NOR/NAND) must be a child of spi node and could have some +properties. Also see jedec,spi-nor.txt. + +Required properties: +- reg: chip-Select number (QSPI controller may connect 2 flashes) +- spi-max-frequency: max frequency of spi bus + +Optional property: +- spi-rx-bus-width: see ./spi-bus.txt for the description + +Example: + +qspi: spi@a0001000 { + compatible = "st,stm32f469-qspi"; + reg = <0xa0001000 0x1000>, <0x90000000 0x10000000>; + reg-names = "qspi", "qspi_mm"; + interrupts = <91>; + resets = <&rcc STM32F4_AHB3_RESET(QSPI)>; + clocks = <&rcc 0 STM32F4_AHB3_CLOCK(QSPI)>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_qspi0>; + + flash@0 { + compatible = "jedec,spi-nor"; + reg = <0>; + spi-rx-bus-width = <4>; + spi-max-frequency = <108000000>; + ... + }; +}; diff --git a/Documentation/devicetree/bindings/sram/sunxi-sram.txt b/Documentation/devicetree/bindings/sram/sunxi-sram.txt index c51ade86578c..62dd0748f0ef 100644 --- a/Documentation/devicetree/bindings/sram/sunxi-sram.txt +++ b/Documentation/devicetree/bindings/sram/sunxi-sram.txt @@ -18,6 +18,7 @@ Required properties: - "allwinner,sun8i-h3-system-control" - "allwinner,sun50i-a64-sram-controller" (deprecated) - "allwinner,sun50i-a64-system-control" + - "allwinner,sun50i-h6-system-control", "allwinner,sun50i-a64-system-control" - reg : sram controller register offset + length SRAM nodes @@ -54,6 +55,9 @@ The valid sections compatible for H3 are: The valid sections compatible for A64 are: - allwinner,sun50i-a64-sram-c +The valid sections compatible for H6 are: + - allwinner,sun50i-h6-sram-c, allwinner,sun50i-a64-sram-c + Devices using SRAM sections --------------------------- diff --git a/Documentation/devicetree/bindings/thermal/qcom-spmi-temp-alarm.txt b/Documentation/devicetree/bindings/thermal/qcom-spmi-temp-alarm.txt index 290ec06fa33a..0273a92a2a84 100644 --- a/Documentation/devicetree/bindings/thermal/qcom-spmi-temp-alarm.txt +++ b/Documentation/devicetree/bindings/thermal/qcom-spmi-temp-alarm.txt @@ -6,8 +6,7 @@ interrupt signal and status register to identify high PMIC die temperature. Required properties: - compatible: Should contain "qcom,spmi-temp-alarm". -- reg: Specifies the SPMI address and length of the controller's - registers. +- reg: Specifies the SPMI address. - interrupts: PMIC temperature alarm interrupt. - #thermal-sensor-cells: Should be 0. See thermal.txt for a description. @@ -20,7 +19,7 @@ Example: pm8941_temp: thermal-alarm@2400 { compatible = "qcom,spmi-temp-alarm"; - reg = <0x2400 0x100>; + reg = <0x2400>; interrupts = <0 0x24 0 IRQ_TYPE_EDGE_RISING>; #thermal-sensor-cells = <0>; @@ -36,19 +35,14 @@ Example: thermal-sensors = <&pm8941_temp>; trips { - passive { - temperature = <1050000>; + stage1 { + temperature = <105000>; hysteresis = <2000>; type = "passive"; }; - alert { + stage2 { temperature = <125000>; hysteresis = <2000>; - type = "hot"; - }; - crit { - temperature = <145000>; - hysteresis = <2000>; type = "critical"; }; }; diff --git a/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt b/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt index 20ca4ef9d776..04cbb90a5d3e 100644 --- a/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt +++ b/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt @@ -1,9 +1,9 @@ * Thermal Monitoring Unit (TMU) on Freescale QorIQ SoCs Required properties: -- compatible : Must include "fsl,qoriq-tmu". The version of the device is - determined by the TMU IP Block Revision Register (IPBRR0) at - offset 0x0BF8. +- compatible : Must include "fsl,qoriq-tmu" or "fsl,imx8mq-tmu". The + version of the device is determined by the TMU IP Block Revision + Register (IPBRR0) at offset 0x0BF8. Table of correspondences between IPBRR0 values and example chips: Value Device ---------- ----- diff --git a/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.txt b/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.txt index cfa154bb0fa7..ad9a435afef4 100644 --- a/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.txt +++ b/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.txt @@ -7,9 +7,11 @@ inside the LSI. Required properties: - compatible : "renesas,-thermal", Examples with soctypes are: + - "renesas,r8a774a1-thermal" (RZ/G2M) - "renesas,r8a7795-thermal" (R-Car H3) - "renesas,r8a7796-thermal" (R-Car M3-W) - "renesas,r8a77965-thermal" (R-Car M3-N) + - "renesas,r8a77980-thermal" (R-Car V3H) - reg : Address ranges of the thermal registers. Each sensor needs one address range. Sorting must be done in increasing order according to datasheet, i.e. @@ -19,7 +21,8 @@ Required properties: Optional properties: -- interrupts : interrupts routed to the TSC (3 for H3, M3-W and M3-N) +- interrupts : interrupts routed to the TSC (3 for H3, M3-W, M3-N, + and V3H) - power-domain : Must contain a reference to the power domain. This property is mandatory if the thermal sensor instance is part of a controllable power domain. diff --git a/Documentation/devicetree/bindings/thermal/rcar-thermal.txt b/Documentation/devicetree/bindings/thermal/rcar-thermal.txt index 67c563f1b4c4..73e1613d2cb0 100644 --- a/Documentation/devicetree/bindings/thermal/rcar-thermal.txt +++ b/Documentation/devicetree/bindings/thermal/rcar-thermal.txt @@ -4,15 +4,17 @@ Required properties: - compatible : "renesas,thermal-", "renesas,rcar-gen2-thermal" (with thermal-zone) or "renesas,rcar-thermal" (without thermal-zone) as - fallback except R-Car D3. + fallback except R-Car V3M/D3. Examples with soctypes are: - "renesas,thermal-r8a73a4" (R-Mobile APE6) - "renesas,thermal-r8a7743" (RZ/G1M) + - "renesas,thermal-r8a7744" (RZ/G1N) - "renesas,thermal-r8a7779" (R-Car H1) - "renesas,thermal-r8a7790" (R-Car H2) - "renesas,thermal-r8a7791" (R-Car M2-W) - "renesas,thermal-r8a7792" (R-Car V2H) - "renesas,thermal-r8a7793" (R-Car M2-N) + - "renesas,thermal-r8a77970" (R-Car V3M) - "renesas,thermal-r8a77995" (R-Car D3) - reg : Address range of the thermal registers. The 1st reg will be recognized as common register @@ -21,7 +23,7 @@ Required properties: Option properties: - interrupts : If present should contain 3 interrupts for - R-Car D3 or 1 interrupt otherwise. + R-Car V3M/D3 or 1 interrupt otherwise. Example (non interrupt support): diff --git a/Documentation/devicetree/bindings/thermal/stm32-thermal.txt b/Documentation/devicetree/bindings/thermal/stm32-thermal.txt new file mode 100644 index 000000000000..8c0d5a4d8031 --- /dev/null +++ b/Documentation/devicetree/bindings/thermal/stm32-thermal.txt @@ -0,0 +1,61 @@ +Binding for Thermal Sensor for STMicroelectronics STM32 series of SoCs. + +On STM32 SoCs, the Digital Temperature Sensor (DTS) is in charge of managing an +analog block which delivers a frequency depending on the internal SoC's +temperature. By using a reference frequency, DTS is able to provide a sample +number which can be translated into a temperature by the user. + +DTS provides interrupt notification mechanism by threshold. This mechanism +offers two temperature trip points: passive and critical. The first is intended +for passive cooling notification while the second is used for over-temperature +reset. + +Required parameters: +------------------- + +compatible: Should be "st,stm32-thermal" +reg: This should be the physical base address and length of the + sensor's registers. +clocks: Phandle of the clock used by the thermal sensor. + See: Documentation/devicetree/bindings/clock/clock-bindings.txt +clock-names: Should be "pclk" for register access clock and reference clock. + See: Documentation/devicetree/bindings/resource-names.txt +#thermal-sensor-cells: Should be 0. See ./thermal.txt for a description. +interrupts: Standard way to define interrupt number. + +Example: + + thermal-zones { + cpu_thermal: cpu-thermal { + polling-delay-passive = <0>; + polling-delay = <0>; + + thermal-sensors = <&thermal>; + + trips { + cpu_alert1: cpu-alert1 { + temperature = <85000>; + hysteresis = <0>; + type = "passive"; + }; + + cpu-crit: cpu-crit { + temperature = <120000>; + hysteresis = <0>; + type = "critical"; + }; + }; + + cooling-maps { + }; + }; + }; + + thermal: thermal@50028000 { + compatible = "st,stm32-thermal"; + reg = <0x50028000 0x100>; + clocks = <&rcc TMPSENS>; + clock-names = "pclk"; + #thermal-sensor-cells = <0>; + interrupts = ; + }; diff --git a/Documentation/devicetree/bindings/thermal/thermal.txt b/Documentation/devicetree/bindings/thermal/thermal.txt index eb7ee91556a5..ca14ba959e0d 100644 --- a/Documentation/devicetree/bindings/thermal/thermal.txt +++ b/Documentation/devicetree/bindings/thermal/thermal.txt @@ -152,7 +152,7 @@ Optional property: Elem size: one cell the sensors listed in the thermal-sensors property. Elem type: signed Coefficients defaults to 1, in case this property is not specified. A simple linear polynomial is used: - Z = c0 * x0 + c1 + x1 + ... + c(n-1) * x(n-1) + cn. + Z = c0 * x0 + c1 * x1 + ... + c(n-1) * x(n-1) + cn. The coefficients are ordered and they match with sensors by means of sensor ID. Additional coefficients are diff --git a/Documentation/devicetree/bindings/timer/csky,gx6605s-timer.txt b/Documentation/devicetree/bindings/timer/csky,gx6605s-timer.txt new file mode 100644 index 000000000000..6b04344f4bea --- /dev/null +++ b/Documentation/devicetree/bindings/timer/csky,gx6605s-timer.txt @@ -0,0 +1,42 @@ +================= +gx6605s SOC Timer +================= + +The timer is used in gx6605s soc as system timer and the driver +contain clk event and clk source. + +============================== +timer node bindings definition +============================== + + Description: Describes gx6605s SOC timer + + PROPERTIES + + - compatible + Usage: required + Value type: + Definition: must be "csky,gx6605s-timer" + - reg + Usage: required + Value type: + Definition: in soc from cpu view + - clocks + Usage: required + Value type: phandle + clock specifier cells + Definition: must be input clk node + - interrupt + Usage: required + Value type: + Definition: must be timer irq num defined by soc + +Examples: +--------- + + timer0: timer@20a000 { + compatible = "csky,gx6605s-timer"; + reg = <0x0020a000 0x400>; + clocks = <&dummy_apb_clk>; + interrupts = <10>; + interrupt-parent = <&intc>; + }; diff --git a/Documentation/devicetree/bindings/timer/csky,mptimer.txt b/Documentation/devicetree/bindings/timer/csky,mptimer.txt new file mode 100644 index 000000000000..15cfec08fbb8 --- /dev/null +++ b/Documentation/devicetree/bindings/timer/csky,mptimer.txt @@ -0,0 +1,42 @@ +============================ +C-SKY Multi-processors Timer +============================ + +C-SKY multi-processors timer is designed for C-SKY SMP system and the +regs is accessed by cpu co-processor 4 registers with mtcr/mfcr. + + - PTIM_CTLR "cr<0, 14>" Control reg to start reset timer. + - PTIM_TSR "cr<1, 14>" Interrupt cleanup status reg. + - PTIM_CCVR "cr<3, 14>" Current counter value reg. + - PTIM_LVR "cr<6, 14>" Window value reg to triger next event. + +============================== +timer node bindings definition +============================== + + Description: Describes SMP timer + + PROPERTIES + + - compatible + Usage: required + Value type: + Definition: must be "csky,mptimer" + - clocks + Usage: required + Value type: + Definition: must be input clk node + - interrupts + Usage: required + Value type: + Definition: must be timer irq num defined by soc + +Examples: +--------- + + timer: timer { + compatible = "csky,mptimer"; + clocks = <&dummy_apb_clk>; + interrupts = <16>; + interrupt-parent = <&intc>; + }; diff --git a/Documentation/devicetree/bindings/timer/renesas,cmt.txt b/Documentation/devicetree/bindings/timer/renesas,cmt.txt index b40add2d9bb4..33992679a8bd 100644 --- a/Documentation/devicetree/bindings/timer/renesas,cmt.txt +++ b/Documentation/devicetree/bindings/timer/renesas,cmt.txt @@ -24,6 +24,8 @@ Required Properties: - "renesas,r8a73a4-cmt1" for the 48-bit CMT1 device included in r8a73a4. - "renesas,r8a7743-cmt0" for the 32-bit CMT0 device included in r8a7743. - "renesas,r8a7743-cmt1" for the 48-bit CMT1 device included in r8a7743. + - "renesas,r8a7744-cmt0" for the 32-bit CMT0 device included in r8a7744. + - "renesas,r8a7744-cmt1" for the 48-bit CMT1 device included in r8a7744. - "renesas,r8a7745-cmt0" for the 32-bit CMT0 device included in r8a7745. - "renesas,r8a7745-cmt1" for the 48-bit CMT1 device included in r8a7745. - "renesas,r8a7790-cmt0" for the 32-bit CMT0 device included in r8a7790. @@ -34,6 +36,10 @@ Required Properties: - "renesas,r8a7793-cmt1" for the 48-bit CMT1 device included in r8a7793. - "renesas,r8a7794-cmt0" for the 32-bit CMT0 device included in r8a7794. - "renesas,r8a7794-cmt1" for the 48-bit CMT1 device included in r8a7794. + - "renesas,r8a77970-cmt0" for the 32-bit CMT0 device included in r8a77970. + - "renesas,r8a77970-cmt1" for the 48-bit CMT1 device included in r8a77970. + - "renesas,r8a77980-cmt0" for the 32-bit CMT0 device included in r8a77980. + - "renesas,r8a77980-cmt1" for the 48-bit CMT1 device included in r8a77980. - "renesas,rcar-gen2-cmt0" for 32-bit CMT0 devices included in R-Car Gen2 and RZ/G1. @@ -41,6 +47,9 @@ Required Properties: and RZ/G1. These are fallbacks for r8a73a4, R-Car Gen2 and RZ/G1 entries listed above. + - "renesas,rcar-gen3-cmt0" for 32-bit CMT0 devices included in R-Car Gen3. + - "renesas,rcar-gen3-cmt1" for 48-bit CMT1 devices included in R-Car Gen3. + These are fallbacks for R-Car Gen3 entries listed above. - reg: base address and length of the registers block for the timer module. - interrupts: interrupt-specifier for the timer, one per channel. diff --git a/Documentation/devicetree/bindings/timer/renesas,ostm.txt b/Documentation/devicetree/bindings/timer/renesas,ostm.txt index be3ae0fdf775..81a78f8bcf17 100644 --- a/Documentation/devicetree/bindings/timer/renesas,ostm.txt +++ b/Documentation/devicetree/bindings/timer/renesas,ostm.txt @@ -9,7 +9,8 @@ Channels are independent from each other. Required Properties: - compatible: must be one or more of the following: - - "renesas,r7s72100-ostm" for the r7s72100 OSTM + - "renesas,r7s72100-ostm" for the R7S72100 (RZ/A1) OSTM + - "renesas,r7s9210-ostm" for the R7S9210 (RZ/A2) OSTM - "renesas,ostm" for any OSTM This is a fallback for the above renesas,*-ostm entries diff --git a/Documentation/devicetree/bindings/timer/renesas,tmu.txt b/Documentation/devicetree/bindings/timer/renesas,tmu.txt index cd5f20bf2582..4ddff85837da 100644 --- a/Documentation/devicetree/bindings/timer/renesas,tmu.txt +++ b/Documentation/devicetree/bindings/timer/renesas,tmu.txt @@ -12,6 +12,8 @@ Required Properties: - "renesas,tmu-r8a7740" for the r8a7740 TMU - "renesas,tmu-r8a7778" for the r8a7778 TMU - "renesas,tmu-r8a7779" for the r8a7779 TMU + - "renesas,tmu-r8a77970" for the r8a77970 TMU + - "renesas,tmu-r8a77980" for the r8a77980 TMU - "renesas,tmu" for any TMU. This is a fallback for the above renesas,tmu-* entries diff --git a/Documentation/devicetree/bindings/trivial-devices.txt b/Documentation/devicetree/bindings/trivial-devices.txt index 763a2808a95c..6ab001fa1ed4 100644 --- a/Documentation/devicetree/bindings/trivial-devices.txt +++ b/Documentation/devicetree/bindings/trivial-devices.txt @@ -21,21 +21,10 @@ adi,adt7490 +/-1C TDM Extended Temp Range I.C adi,adxl345 Three-Axis Digital Accelerometer adi,adxl346 Three-Axis Digital Accelerometer (backward-compatibility value "adi,adxl345" must be listed too) ams,iaq-core AMS iAQ-Core VOC Sensor -amstaos,tsl2571 AMS/TAOS ALS and proximity sensor -amstaos,tsl2671 AMS/TAOS ALS and proximity sensor -amstaos,tmd2671 AMS/TAOS ALS and proximity sensor -amstaos,tsl2771 AMS/TAOS ALS and proximity sensor -amstaos,tmd2771 AMS/TAOS ALS and proximity sensor -amstaos,tsl2572 AMS/TAOS ALS and proximity sensor -amstaos,tsl2672 AMS/TAOS ALS and proximity sensor -amstaos,tmd2672 AMS/TAOS ALS and proximity sensor -amstaos,tsl2772 AMS/TAOS ALS and proximity sensor -amstaos,tmd2772 AMS/TAOS ALS and proximity sensor at,24c08 i2c serial eeprom (24cxx) atmel,at97sc3204t i2c trusted platform module (TPM) capella,cm32181 CM32181: Ambient Light Sensor capella,cm3232 CM3232: Ambient Light Sensor -cirrus,cs42l51 Cirrus Logic CS42L51 audio codec dallas,ds1374 I2C, 32-Bit Binary Counter Watchdog RTC with Trickle Charger and Reset Input/Output dallas,ds1631 High-Precision Digital Thermometer dallas,ds1672 Dallas DS1672 Real-time Clock diff --git a/Documentation/devicetree/bindings/usb/ci-hdrc-usb2.txt b/Documentation/devicetree/bindings/usb/ci-hdrc-usb2.txt index 2e9318151df7..529e51879fb2 100644 --- a/Documentation/devicetree/bindings/usb/ci-hdrc-usb2.txt +++ b/Documentation/devicetree/bindings/usb/ci-hdrc-usb2.txt @@ -80,6 +80,8 @@ Optional properties: controller. It's expected that a mux state of 0 indicates device mode and a mux state of 1 indicates host mode. - mux-control-names: Shall be "usb_switch" if mux-controls is specified. +- pinctrl-names: Names for optional pin modes in "default", "host", "device" +- pinctrl-n: alternate pin modes i.mx specific properties - fsl,usbmisc: phandler of non-core register device, with one diff --git a/Documentation/devicetree/bindings/usb/dwc2.txt b/Documentation/devicetree/bindings/usb/dwc2.txt index 46da5f184460..6dc3c4a34483 100644 --- a/Documentation/devicetree/bindings/usb/dwc2.txt +++ b/Documentation/devicetree/bindings/usb/dwc2.txt @@ -6,6 +6,7 @@ Required properties: - brcm,bcm2835-usb: The DWC2 USB controller instance in the BCM2835 SoC. - hisilicon,hi6220-usb: The DWC2 USB controller instance in the hi6220 SoC. - rockchip,rk3066-usb: The DWC2 USB controller instance in the rk3066 Soc; + - "rockchip,px30-usb", "rockchip,rk3066-usb", "snps,dwc2": for px30 Soc; - "rockchip,rk3188-usb", "rockchip,rk3066-usb", "snps,dwc2": for rk3188 Soc; - "rockchip,rk3288-usb", "rockchip,rk3066-usb", "snps,dwc2": for rk3288 Soc; - "lantiq,arx100-usb": The DWC2 USB controller instance in Lantiq ARX SoCs; diff --git a/Documentation/devicetree/bindings/usb/dwc3.txt b/Documentation/devicetree/bindings/usb/dwc3.txt index 3e4c38b806ac..636630fb92d7 100644 --- a/Documentation/devicetree/bindings/usb/dwc3.txt +++ b/Documentation/devicetree/bindings/usb/dwc3.txt @@ -19,6 +19,7 @@ Exception for clocks: "cavium,octeon-7130-usb-uctl" "qcom,dwc3" "samsung,exynos5250-dwusb3" + "samsung,exynos5433-dwusb3" "samsung,exynos7-dwusb3" "sprd,sc9860-dwc3" "st,stih407-dwc3" diff --git a/Documentation/devicetree/bindings/usb/ehci-mv.txt b/Documentation/devicetree/bindings/usb/ehci-mv.txt new file mode 100644 index 000000000000..335589895763 --- /dev/null +++ b/Documentation/devicetree/bindings/usb/ehci-mv.txt @@ -0,0 +1,23 @@ +* Marvell PXA/MMP EHCI controller. + +Required properties: + +- compatible: must be "marvell,pxau2o-ehci" +- reg: physical base addresses of the controller and length of memory mapped region +- interrupts: one EHCI controller interrupt should be described here +- clocks: phandle list of usb clocks +- clock-names: should be "USBCLK" +- phys: phandle for the PHY device +- phy-names: should be "usb" + +Example: + + ehci0: usb-ehci@d4208000 { + compatible = "marvell,pxau2o-ehci"; + reg = <0xd4208000 0x200>; + interrupts = <44>; + clocks = <&soc_clocks MMP2_CLK_USB>; + clock-names = "USBCLK"; + phys = <&usb_otg_phy>; + phy-names = "usb"; + }; diff --git a/Documentation/devicetree/bindings/usb/exynos-usb.txt b/Documentation/devicetree/bindings/usb/exynos-usb.txt index c97374315049..b7111f43fa59 100644 --- a/Documentation/devicetree/bindings/usb/exynos-usb.txt +++ b/Documentation/devicetree/bindings/usb/exynos-usb.txt @@ -83,6 +83,8 @@ Required properties: - compatible: should be one of the following - "samsung,exynos5250-dwusb3": for USB 3.0 DWC3 controller on Exynos5250/5420. + "samsung,exynos5433-dwusb3": for USB 3.0 DWC3 controller on + Exynos5433. "samsung,exynos7-dwusb3": for USB 3.0 DWC3 controller on Exynos7. - #address-cells, #size-cells : should be '1' if the device has sub-nodes with 'reg' property. diff --git a/Documentation/devicetree/bindings/usb/faraday,fotg210.txt b/Documentation/devicetree/bindings/usb/faraday,fotg210.txt new file mode 100644 index 000000000000..06a2286e2054 --- /dev/null +++ b/Documentation/devicetree/bindings/usb/faraday,fotg210.txt @@ -0,0 +1,35 @@ +Faraday FOTG Host controller + +This OTG-capable USB host controller is found in Cortina Systems +Gemini and other SoC products. + +Required properties: +- compatible: should be one of: + "faraday,fotg210" + "cortina,gemini-usb", "faraday,fotg210" +- reg: should contain one register range i.e. start and length +- interrupts: description of the interrupt line + +Optional properties: +- clocks: should contain the IP block clock +- clock-names: should be "PCLK" for the IP block clock + +Required properties for "cortina,gemini-usb" compatible: +- syscon: a phandle to the system controller to access PHY registers + +Optional properties for "cortina,gemini-usb" compatible: +- cortina,gemini-mini-b: boolean property that indicates that a Mini-B + OTG connector is in use +- wakeup-source: see power/wakeup-source.txt + +Example for Gemini: + +usb@68000000 { + compatible = "cortina,gemini-usb", "faraday,fotg210"; + reg = <0x68000000 0x1000>; + interrupts = <10 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&cc 12>; + clock-names = "PCLK"; + syscon = <&syscon>; + wakeup-source; +}; diff --git a/Documentation/devicetree/bindings/usb/fcs,fusb302.txt b/Documentation/devicetree/bindings/usb/fcs,fusb302.txt index 6087dc7f209e..a5d011d2efc8 100644 --- a/Documentation/devicetree/bindings/usb/fcs,fusb302.txt +++ b/Documentation/devicetree/bindings/usb/fcs,fusb302.txt @@ -5,10 +5,19 @@ Required properties : - reg : I2C slave address - interrupts : Interrupt specifier -Optional properties : -- fcs,operating-sink-microwatt : - Minimum amount of power accepted from a sink - when negotiating +Required sub-node: +- connector : The "usb-c-connector" attached to the FUSB302 IC. The bindings + of the connector node are specified in: + + Documentation/devicetree/bindings/connector/usb-connector.txt + +Deprecated properties : +- fcs,max-sink-microvolt : Maximum sink voltage accepted by port controller +- fcs,max-sink-microamp : Maximum sink current accepted by port controller +- fcs,max-sink-microwatt : Maximum sink power accepted by port controller +- fcs,operating-sink-microwatt : Minimum amount of power accepted from a sink + when negotiating + Example: @@ -17,7 +26,16 @@ fusb302: typec-portc@54 { reg = <0x54>; interrupt-parent = <&nmi_intc>; interrupts = <0 IRQ_TYPE_LEVEL_LOW>; - fcs,max-sink-microvolt = <12000000>; - fcs,max-sink-microamp = <3000000>; - fcs,max-sink-microwatt = <36000000>; + + usb_con: connector { + compatible = "usb-c-connector"; + label = "USB-C"; + power-role = "dual"; + try-power-role = "sink"; + source-pdos = ; + sink-pdos = ; + op-sink-microwatt = <10000000>; + }; }; diff --git a/Documentation/devicetree/bindings/usb/renesas_usb3.txt b/Documentation/devicetree/bindings/usb/renesas_usb3.txt index 2c071bb5801e..d366555166d0 100644 --- a/Documentation/devicetree/bindings/usb/renesas_usb3.txt +++ b/Documentation/devicetree/bindings/usb/renesas_usb3.txt @@ -2,11 +2,13 @@ Renesas Electronics USB3.0 Peripheral driver Required properties: - compatible: Must contain one of the following: + - "renesas,r8a774a1-usb3-peri" - "renesas,r8a7795-usb3-peri" - "renesas,r8a7796-usb3-peri" - "renesas,r8a77965-usb3-peri" - - "renesas,rcar-gen3-usb3-peri" for a generic R-Car Gen3 compatible - device + - "renesas,r8a77990-usb3-peri" + - "renesas,rcar-gen3-usb3-peri" for a generic R-Car Gen3 or RZ/G2 + compatible device When compatible with the generic version, nodes must list the SoC-specific version corresponding to the platform first diff --git a/Documentation/devicetree/bindings/usb/renesas_usbhs.txt b/Documentation/devicetree/bindings/usb/renesas_usbhs.txt index 43960faf5a88..90719f501852 100644 --- a/Documentation/devicetree/bindings/usb/renesas_usbhs.txt +++ b/Documentation/devicetree/bindings/usb/renesas_usbhs.txt @@ -4,7 +4,9 @@ Required properties: - compatible: Must contain one or more of the following: - "renesas,usbhs-r8a7743" for r8a7743 (RZ/G1M) compatible device + - "renesas,usbhs-r8a7744" for r8a7744 (RZ/G1N) compatible device - "renesas,usbhs-r8a7745" for r8a7745 (RZ/G1E) compatible device + - "renesas,usbhs-r8a774a1" for r8a774a1 (RZ/G2M) compatible device - "renesas,usbhs-r8a7790" for r8a7790 (R-Car H2) compatible device - "renesas,usbhs-r8a7791" for r8a7791 (R-Car M2-W) compatible device - "renesas,usbhs-r8a7792" for r8a7792 (R-Car V2H) compatible device @@ -13,10 +15,11 @@ Required properties: - "renesas,usbhs-r8a7795" for r8a7795 (R-Car H3) compatible device - "renesas,usbhs-r8a7796" for r8a7796 (R-Car M3-W) compatible device - "renesas,usbhs-r8a77965" for r8a77965 (R-Car M3-N) compatible device + - "renesas,usbhs-r8a77990" for r8a77990 (R-Car E3) compatible device - "renesas,usbhs-r8a77995" for r8a77995 (R-Car D3) compatible device - "renesas,usbhs-r7s72100" for r7s72100 (RZ/A1) compatible device - "renesas,rcar-gen2-usbhs" for R-Car Gen2 or RZ/G1 compatible devices - - "renesas,rcar-gen3-usbhs" for R-Car Gen3 compatible device + - "renesas,rcar-gen3-usbhs" for R-Car Gen3 or RZ/G2 compatible devices - "renesas,rza1-usbhs" for RZ/A1 compatible device When compatible with the generic version, nodes must list the @@ -25,7 +28,11 @@ Required properties: - reg: Base address and length of the register for the USBHS - interrupts: Interrupt specifier for the USBHS - - clocks: A list of phandle + clock specifier pairs + - clocks: A list of phandle + clock specifier pairs. + - In case of "renesas,rcar-gen3-usbhs", two clocks are required. + First clock should be peripheral and second one should be host. + - In case of except above, one clock is required. First clock + should be peripheral. Optional properties: - renesas,buswait: Integer to use BUSWAIT register diff --git a/Documentation/devicetree/bindings/usb/usb-ehci.txt b/Documentation/devicetree/bindings/usb/usb-ehci.txt index 0f1b75386207..406252d14c6b 100644 --- a/Documentation/devicetree/bindings/usb/usb-ehci.txt +++ b/Documentation/devicetree/bindings/usb/usb-ehci.txt @@ -15,7 +15,11 @@ Optional properties: - needs-reset-on-resume : boolean, set this to force EHCI reset after resume - has-transaction-translator : boolean, set this if EHCI have a Transaction Translator built into the root hub. - - clocks : a list of phandle + clock specifier pairs + - clocks : a list of phandle + clock specifier pairs. In case of Renesas + R-Car Gen3 SoCs: + - if a host only channel: first clock should be host. + - if a USB DRD channel: first clock should be host and second one + should be peripheral. - phys : see usb-hcd.txt in the current directory - resets : phandle + reset specifier pair diff --git a/Documentation/devicetree/bindings/usb/usb-ohci.txt b/Documentation/devicetree/bindings/usb/usb-ohci.txt index a8d2103d1f3d..aaaa5255c972 100644 --- a/Documentation/devicetree/bindings/usb/usb-ohci.txt +++ b/Documentation/devicetree/bindings/usb/usb-ohci.txt @@ -12,7 +12,11 @@ Optional properties: - no-big-frame-no : boolean, set if frame_no lives in bits [15:0] of HCCA - remote-wakeup-connected: remote wakeup is wired on the platform - num-ports : u32, to override the detected port count -- clocks : a list of phandle + clock specifier pairs +- clocks : a list of phandle + clock specifier pairs. In case of Renesas + R-Car Gen3 SoCs: + - if a host only channel: first clock should be host. + - if a USB DRD channel: first clock should be host and second one + should be peripheral. - phys : see usb-hcd.txt in the current directory - resets : a list of phandle + reset specifier pairs diff --git a/Documentation/devicetree/bindings/usb/usb-xhci.txt b/Documentation/devicetree/bindings/usb/usb-xhci.txt index ac4cd0d6195a..fea8b1545751 100644 --- a/Documentation/devicetree/bindings/usb/usb-xhci.txt +++ b/Documentation/devicetree/bindings/usb/usb-xhci.txt @@ -8,6 +8,8 @@ Required properties: - "marvell,armada-375-xhci" for Armada 375 SoCs - "marvell,armada-380-xhci" for Armada 38x SoCs - "renesas,xhci-r8a7743" for r8a7743 SoC + - "renesas,xhci-r8a7744" for r8a7744 SoC + - "renesas,xhci-r8a774a1" for r8a774a1 SoC - "renesas,xhci-r8a7790" for r8a7790 SoC - "renesas,xhci-r8a7791" for r8a7791 SoC - "renesas,xhci-r8a7793" for r8a7793 SoC @@ -17,7 +19,8 @@ Required properties: - "renesas,xhci-r8a77990" for r8a77990 SoC - "renesas,rcar-gen2-xhci" for a generic R-Car Gen2 or RZ/G1 compatible device - - "renesas,rcar-gen3-xhci" for a generic R-Car Gen3 compatible device + - "renesas,rcar-gen3-xhci" for a generic R-Car Gen3 or RZ/G2 compatible + device - "xhci-platform" (deprecated) When compatible with the generic version, nodes must list the diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt index 2c3fc512e746..4b1a2a8fcc16 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.txt +++ b/Documentation/devicetree/bindings/vendor-prefixes.txt @@ -84,6 +84,7 @@ cosmic Cosmic Circuits crane Crane Connectivity Solutions creative Creative Technology Ltd crystalfontz Crystalfontz America, Inc. +csky Hangzhou C-SKY Microsystems Co., Ltd cubietech Cubietech, Ltd. cypress Cypress Semiconductor Corporation cznic CZ.NIC, z.s.p.o. @@ -114,6 +115,7 @@ elan Elan Microelectronic Corp. embest Shenzhen Embest Technology Co., Ltd. emmicro EM Microelectronic emtrion emtrion GmbH +endless Endless Mobile, Inc. energymicro Silicon Laboratories (formerly Energy Micro AS) engicam Engicam S.r.l. epcos EPCOS AG @@ -127,6 +129,7 @@ everspin Everspin Technologies, Inc. exar Exar Corporation excito Excito ezchip EZchip Semiconductor +facebook Facebook fairphone Fairphone B.V. faraday Faraday Technology Corporation fastrax Fastrax Oy @@ -235,6 +238,7 @@ micrel Micrel Inc. microchip Microchip Technology Inc. microcrystal Micro Crystal AG micron Micron Technology Inc. +mikroe MikroElektronika d.o.o. minix MINIX Technology Ltd. miramems MiraMEMS Sensing Technology Co., Ltd. mitsubishi Mitsubishi Electric Corporation @@ -274,6 +278,7 @@ nxp NXP Semiconductors okaya Okaya Electric America, Inc. oki Oki Electric Industry Co., Ltd. olimex OLIMEX Ltd. +olpc One Laptop Per Child onion Onion Corporation onnn ON Semiconductor Corp. ontat On Tat Industrial Company @@ -297,6 +302,7 @@ pine64 Pine64 pixcir PIXCIR MICROELECTRONICS Co., Ltd plathome Plat'Home Co., Ltd. plda PLDA +plx Broadcom Corporation (formerly PLX Technology) portwell Portwell Inc. poslab Poslab Technology Co., Ltd. powervr PowerVR (deprecated, use img) diff --git a/Documentation/devicetree/bindings/watchdog/armada-37xx-wdt.txt b/Documentation/devicetree/bindings/watchdog/armada-37xx-wdt.txt new file mode 100644 index 000000000000..a8d00c31a1d8 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/armada-37xx-wdt.txt @@ -0,0 +1,23 @@ +* Armada 37xx CPU Watchdog Timer Controller + +Required properties: +- compatible : must be "marvell,armada-3700-wdt" +- reg : base physical address of the controller and length of memory mapped + region. +- clocks : the clock feeding the watchdog timer. See clock-bindings.txt +- marvell,system-controller : reference to syscon node for the CPU Miscellaneous + Registers + +Example: + + cpu_misc: system-controller@d000 { + compatible = "marvell,armada-3700-cpu-misc", "syscon"; + reg = <0xd000 0x1000>; + }; + + wdt: watchdog@8300 { + compatible = "marvell,armada-3700-wdt"; + reg = <0x8300 0x40>; + marvell,system-controller = <&cpu_misc>; + clocks = <&xtalclk>; + }; diff --git a/Documentation/devicetree/bindings/watchdog/mpc8xxx-wdt.txt b/Documentation/devicetree/bindings/watchdog/mpc8xxx-wdt.txt new file mode 100644 index 000000000000..a384ff5b3ce8 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/mpc8xxx-wdt.txt @@ -0,0 +1,25 @@ +* Freescale mpc8xxx watchdog driver (For 83xx, 86xx and 8xx) + +Required properties: +- compatible: Shall contain one of the following: + "mpc83xx_wdt" for an mpc83xx + "fsl,mpc8610-wdt" for an mpc86xx + "fsl,mpc823-wdt" for an mpc8xx +- reg: base physical address and length of the area hosting the + watchdog registers. + On the 83xx, "Watchdog Timer Registers" area: <0x200 0x100> + On the 86xx, "Watchdog Timer Registers" area: <0xe4000 0x100> + On the 8xx, "General System Interface Unit" area: <0x0 0x10> + +Optional properties: +- reg: additional physical address and length (4) of location of the + Reset Status Register (called RSTRSCR on the mpc86xx) + On the 83xx, it is located at offset 0x910 + On the 86xx, it is located at offset 0xe0094 + On the 8xx, it is located at offset 0x288 + +Example: + WDT: watchdog@0 { + compatible = "fsl,mpc823-wdt"; + reg = <0x0 0x10 0x288 0x4>; + }; diff --git a/Documentation/devicetree/bindings/watchdog/renesas-wdt.txt b/Documentation/devicetree/bindings/watchdog/renesas-wdt.txt index 9407212a85a8..a8ee29fd9ac8 100644 --- a/Documentation/devicetree/bindings/watchdog/renesas-wdt.txt +++ b/Documentation/devicetree/bindings/watchdog/renesas-wdt.txt @@ -6,6 +6,7 @@ Required properties: version. Examples with soctypes are: - "renesas,r8a7743-wdt" (RZ/G1M) + - "renesas,r8a7744-wdt" (RZ/G1N) - "renesas,r8a7745-wdt" (RZ/G1E) - "renesas,r8a774a1-wdt" (RZ/G2M) - "renesas,r8a7790-wdt" (R-Car H2) @@ -20,6 +21,7 @@ Required properties: - "renesas,r8a77990-wdt" (R-Car E3) - "renesas,r8a77995-wdt" (R-Car D3) - "renesas,r7s72100-wdt" (RZ/A1) + - "renesas,r7s9210-wdt" (RZ/A2) The generic compatible string must be: - "renesas,rza-wdt" for RZ/A - "renesas,rcar-gen2-wdt" for R-Car Gen2 and RZ/G1 diff --git a/Documentation/driver-api/basics.rst b/Documentation/driver-api/basics.rst index 826e85d50a16..e970fadf4d1a 100644 --- a/Documentation/driver-api/basics.rst +++ b/Documentation/driver-api/basics.rst @@ -121,6 +121,9 @@ Kernel utility functions .. kernel-doc:: kernel/rcu/update.c :export: +.. kernel-doc:: include/linux/overflow.h + :internal: + Device Resource Management -------------------------- diff --git a/Documentation/driver-api/firewire.rst b/Documentation/driver-api/firewire.rst new file mode 100644 index 000000000000..94a2d7f01d99 --- /dev/null +++ b/Documentation/driver-api/firewire.rst @@ -0,0 +1,48 @@ +=========================================== +Firewire (IEEE 1394) driver Interface Guide +=========================================== + +Introduction and Overview +========================= + +The Linux FireWire subsystem adds some interfaces into the Linux system to + use/maintain+any resource on IEEE 1394 bus. + +The main purpose of these interfaces is to access address space on each node +on IEEE 1394 bus by ISO/IEC 13213 (IEEE 1212) procedure, and to control +isochronous resources on the bus by IEEE 1394 procedure. + +Two types of interfaces are added, according to consumers of the interface. A +set of userspace interfaces is available via `firewire character devices`. A set +of kernel interfaces is available via exported symbols in `firewire-core` module. + +Firewire char device data structures +==================================== + +.. include:: /ABI/stable/firewire-cdev + :literal: + +.. kernel-doc:: include/uapi/linux/firewire-cdev.h + :internal: + +Firewire device probing and sysfs interfaces +============================================ + +.. include:: /ABI/stable/sysfs-bus-firewire + :literal: + +.. kernel-doc:: drivers/firewire/core-device.c + :export: + +Firewire core transaction interfaces +==================================== + +.. kernel-doc:: drivers/firewire/core-transaction.c + :export: + +Firewire Isochronous I/O interfaces +=================================== + +.. kernel-doc:: drivers/firewire/core-iso.c + :export: + diff --git a/Documentation/driver-api/fpga/fpga-bridge.rst b/Documentation/driver-api/fpga/fpga-bridge.rst index 2c2aaca894bf..71c5a40da320 100644 --- a/Documentation/driver-api/fpga/fpga-bridge.rst +++ b/Documentation/driver-api/fpga/fpga-bridge.rst @@ -4,6 +4,12 @@ FPGA Bridge API to implement a new FPGA bridge ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* struct :c:type:`fpga_bridge` — The FPGA Bridge structure +* struct :c:type:`fpga_bridge_ops` — Low level Bridge driver ops +* :c:func:`devm_fpga_bridge_create()` — Allocate and init a bridge struct +* :c:func:`fpga_bridge_register()` — Register a bridge +* :c:func:`fpga_bridge_unregister()` — Unregister a bridge + .. kernel-doc:: include/linux/fpga/fpga-bridge.h :functions: fpga_bridge @@ -11,39 +17,10 @@ API to implement a new FPGA bridge :functions: fpga_bridge_ops .. kernel-doc:: drivers/fpga/fpga-bridge.c - :functions: fpga_bridge_create - -.. kernel-doc:: drivers/fpga/fpga-bridge.c - :functions: fpga_bridge_free + :functions: devm_fpga_bridge_create .. kernel-doc:: drivers/fpga/fpga-bridge.c :functions: fpga_bridge_register .. kernel-doc:: drivers/fpga/fpga-bridge.c :functions: fpga_bridge_unregister - -API to control an FPGA bridge -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You probably won't need these directly. FPGA regions should handle this. - -.. kernel-doc:: drivers/fpga/fpga-bridge.c - :functions: of_fpga_bridge_get - -.. kernel-doc:: drivers/fpga/fpga-bridge.c - :functions: fpga_bridge_get - -.. kernel-doc:: drivers/fpga/fpga-bridge.c - :functions: fpga_bridge_put - -.. kernel-doc:: drivers/fpga/fpga-bridge.c - :functions: fpga_bridge_get_to_list - -.. kernel-doc:: drivers/fpga/fpga-bridge.c - :functions: of_fpga_bridge_get_to_list - -.. kernel-doc:: drivers/fpga/fpga-bridge.c - :functions: fpga_bridge_enable - -.. kernel-doc:: drivers/fpga/fpga-bridge.c - :functions: fpga_bridge_disable diff --git a/Documentation/driver-api/fpga/fpga-mgr.rst b/Documentation/driver-api/fpga/fpga-mgr.rst index 4b3825da48d9..576f1945eacd 100644 --- a/Documentation/driver-api/fpga/fpga-mgr.rst +++ b/Documentation/driver-api/fpga/fpga-mgr.rst @@ -49,18 +49,14 @@ probe function calls fpga_mgr_register(), such as:: * them in priv */ - mgr = fpga_mgr_create(dev, "Altera SOCFPGA FPGA Manager", - &socfpga_fpga_ops, priv); + mgr = devm_fpga_mgr_create(dev, "Altera SOCFPGA FPGA Manager", + &socfpga_fpga_ops, priv); if (!mgr) return -ENOMEM; platform_set_drvdata(pdev, mgr); - ret = fpga_mgr_register(mgr); - if (ret) - fpga_mgr_free(mgr); - - return ret; + return fpga_mgr_register(mgr); } static int socfpga_fpga_remove(struct platform_device *pdev) @@ -102,67 +98,19 @@ The ops include a .state function which will determine the state the FPGA is in and return a code of type enum fpga_mgr_states. It doesn't result in a change in state. -How to write an image buffer to a supported FPGA ------------------------------------------------- - -Some sample code:: - - #include - - struct fpga_manager *mgr; - struct fpga_image_info *info; - int ret; - - /* - * Get a reference to FPGA manager. The manager is not locked, so you can - * hold onto this reference without it preventing programming. - * - * This example uses the device node of the manager. Alternatively, use - * fpga_mgr_get(dev) instead if you have the device. - */ - mgr = of_fpga_mgr_get(mgr_node); - - /* struct with information about the FPGA image to program. */ - info = fpga_image_info_alloc(dev); - - /* flags indicates whether to do full or partial reconfiguration */ - info->flags = FPGA_MGR_PARTIAL_RECONFIG; - - /* - * At this point, indicate where the image is. This is pseudo-code; you're - * going to use one of these three. - */ - if (image is in a scatter gather table) { - - info->sgt = [your scatter gather table] - - } else if (image is in a buffer) { - - info->buf = [your image buffer] - info->count = [image buffer size] - - } else if (image is in a firmware file) { - - info->firmware_name = devm_kstrdup(dev, firmware_name, GFP_KERNEL); - - } - - /* Get exclusive control of FPGA manager */ - ret = fpga_mgr_lock(mgr); - - /* Load the buffer to the FPGA */ - ret = fpga_mgr_buf_load(mgr, &info, buf, count); - - /* Release the FPGA manager */ - fpga_mgr_unlock(mgr); - fpga_mgr_put(mgr); - - /* Deallocate the image info if you're done with it */ - fpga_image_info_free(info); - API for implementing a new FPGA Manager driver ---------------------------------------------- +* ``fpga_mgr_states`` — Values for :c:member:`fpga_manager->state`. +* struct :c:type:`fpga_manager` — the FPGA manager struct +* struct :c:type:`fpga_manager_ops` — Low level FPGA manager driver ops +* :c:func:`devm_fpga_mgr_create` — Allocate and init a manager struct +* :c:func:`fpga_mgr_register` — Register an FPGA manager +* :c:func:`fpga_mgr_unregister` — Unregister an FPGA manager + +.. kernel-doc:: include/linux/fpga/fpga-mgr.h + :functions: fpga_mgr_states + .. kernel-doc:: include/linux/fpga/fpga-mgr.h :functions: fpga_manager @@ -170,51 +118,10 @@ API for implementing a new FPGA Manager driver :functions: fpga_manager_ops .. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: fpga_mgr_create - -.. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: fpga_mgr_free + :functions: devm_fpga_mgr_create .. kernel-doc:: drivers/fpga/fpga-mgr.c :functions: fpga_mgr_register .. kernel-doc:: drivers/fpga/fpga-mgr.c :functions: fpga_mgr_unregister - -API for programming an FPGA ---------------------------- - -.. kernel-doc:: include/linux/fpga/fpga-mgr.h - :functions: fpga_image_info - -.. kernel-doc:: include/linux/fpga/fpga-mgr.h - :functions: fpga_mgr_states - -.. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: fpga_image_info_alloc - -.. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: fpga_image_info_free - -.. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: of_fpga_mgr_get - -.. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: fpga_mgr_get - -.. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: fpga_mgr_put - -.. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: fpga_mgr_lock - -.. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: fpga_mgr_unlock - -.. kernel-doc:: include/linux/fpga/fpga-mgr.h - :functions: fpga_mgr_states - -Note - use :c:func:`fpga_region_program_fpga()` instead of :c:func:`fpga_mgr_load()` - -.. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: fpga_mgr_load diff --git a/Documentation/driver-api/fpga/fpga-programming.rst b/Documentation/driver-api/fpga/fpga-programming.rst new file mode 100644 index 000000000000..b5484df6ff0f --- /dev/null +++ b/Documentation/driver-api/fpga/fpga-programming.rst @@ -0,0 +1,107 @@ +In-kernel API for FPGA Programming +================================== + +Overview +-------- + +The in-kernel API for FPGA programming is a combination of APIs from +FPGA manager, bridge, and regions. The actual function used to +trigger FPGA programming is :c:func:`fpga_region_program_fpga()`. + +:c:func:`fpga_region_program_fpga()` uses functionality supplied by +the FPGA manager and bridges. It will: + + * lock the region's mutex + * lock the mutex of the region's FPGA manager + * build a list of FPGA bridges if a method has been specified to do so + * disable the bridges + * program the FPGA using info passed in :c:member:`fpga_region->info`. + * re-enable the bridges + * release the locks + +The struct fpga_image_info specifies what FPGA image to program. It is +allocated/freed by :c:func:`fpga_image_info_alloc()` and freed with +:c:func:`fpga_image_info_free()` + +How to program an FPGA using a region +------------------------------------- + +When the FPGA region driver probed, it was given a pointer to an FPGA manager +driver so it knows which manager to use. The region also either has a list of +bridges to control during programming or it has a pointer to a function that +will generate that list. Here's some sample code of what to do next:: + + #include + #include + + struct fpga_image_info *info; + int ret; + + /* + * First, alloc the struct with information about the FPGA image to + * program. + */ + info = fpga_image_info_alloc(dev); + if (!info) + return -ENOMEM; + + /* Set flags as needed, such as: */ + info->flags = FPGA_MGR_PARTIAL_RECONFIG; + + /* + * Indicate where the FPGA image is. This is pseudo-code; you're + * going to use one of these three. + */ + if (image is in a scatter gather table) { + + info->sgt = [your scatter gather table] + + } else if (image is in a buffer) { + + info->buf = [your image buffer] + info->count = [image buffer size] + + } else if (image is in a firmware file) { + + info->firmware_name = devm_kstrdup(dev, firmware_name, + GFP_KERNEL); + + } + + /* Add info to region and do the programming */ + region->info = info; + ret = fpga_region_program_fpga(region); + + /* Deallocate the image info if you're done with it */ + region->info = NULL; + fpga_image_info_free(info); + + if (ret) + return ret; + + /* Now enumerate whatever hardware has appeared in the FPGA. */ + +API for programming an FPGA +--------------------------- + +* :c:func:`fpga_region_program_fpga` — Program an FPGA +* :c:type:`fpga_image_info` — Specifies what FPGA image to program +* :c:func:`fpga_image_info_alloc()` — Allocate an FPGA image info struct +* :c:func:`fpga_image_info_free()` — Free an FPGA image info struct + +.. kernel-doc:: drivers/fpga/fpga-region.c + :functions: fpga_region_program_fpga + +FPGA Manager flags + +.. kernel-doc:: include/linux/fpga/fpga-mgr.h + :doc: FPGA Manager flags + +.. kernel-doc:: include/linux/fpga/fpga-mgr.h + :functions: fpga_image_info + +.. kernel-doc:: drivers/fpga/fpga-mgr.c + :functions: fpga_image_info_alloc + +.. kernel-doc:: drivers/fpga/fpga-mgr.c + :functions: fpga_image_info_free diff --git a/Documentation/driver-api/fpga/fpga-region.rst b/Documentation/driver-api/fpga/fpga-region.rst index f30333ce828e..0529b2d2231a 100644 --- a/Documentation/driver-api/fpga/fpga-region.rst +++ b/Documentation/driver-api/fpga/fpga-region.rst @@ -34,41 +34,6 @@ fpga_image_info including: * flags indicating specifics such as whether the image is for partial reconfiguration. -How to program an FPGA using a region -------------------------------------- - -First, allocate the info struct:: - - info = fpga_image_info_alloc(dev); - if (!info) - return -ENOMEM; - -Set flags as needed, i.e.:: - - info->flags |= FPGA_MGR_PARTIAL_RECONFIG; - -Point to your FPGA image, such as:: - - info->sgt = &sgt; - -Add info to region and do the programming:: - - region->info = info; - ret = fpga_region_program_fpga(region); - -:c:func:`fpga_region_program_fpga()` operates on info passed in the -fpga_image_info (region->info). This function will attempt to: - - * lock the region's mutex - * lock the region's FPGA manager - * build a list of FPGA bridges if a method has been specified to do so - * disable the bridges - * program the FPGA - * re-enable the bridges - * release the locks - -Then you will want to enumerate whatever hardware has appeared in the FPGA. - How to add a new FPGA region ---------------------------- @@ -77,26 +42,62 @@ An example of usage can be seen in the probe function of [#f2]_. .. [#f1] ../devicetree/bindings/fpga/fpga-region.txt .. [#f2] ../../drivers/fpga/of-fpga-region.c -API to program an FPGA ----------------------- - -.. kernel-doc:: drivers/fpga/fpga-region.c - :functions: fpga_region_program_fpga - API to add a new FPGA region ---------------------------- +* struct :c:type:`fpga_region` — The FPGA region struct +* :c:func:`devm_fpga_region_create` — Allocate and init a region struct +* :c:func:`fpga_region_register` — Register an FPGA region +* :c:func:`fpga_region_unregister` — Unregister an FPGA region + +The FPGA region's probe function will need to get a reference to the FPGA +Manager it will be using to do the programming. This usually would happen +during the region's probe function. + +* :c:func:`fpga_mgr_get` — Get a reference to an FPGA manager, raise ref count +* :c:func:`of_fpga_mgr_get` — Get a reference to an FPGA manager, raise ref count, + given a device node. +* :c:func:`fpga_mgr_put` — Put an FPGA manager + +The FPGA region will need to specify which bridges to control while programming +the FPGA. The region driver can build a list of bridges during probe time +(:c:member:`fpga_region->bridge_list`) or it can have a function that creates +the list of bridges to program just before programming +(:c:member:`fpga_region->get_bridges`). The FPGA bridge framework supplies the +following APIs to handle building or tearing down that list. + +* :c:func:`fpga_bridge_get_to_list` — Get a ref of an FPGA bridge, add it to a + list +* :c:func:`of_fpga_bridge_get_to_list` — Get a ref of an FPGA bridge, add it to a + list, given a device node +* :c:func:`fpga_bridges_put` — Given a list of bridges, put them + .. kernel-doc:: include/linux/fpga/fpga-region.h :functions: fpga_region .. kernel-doc:: drivers/fpga/fpga-region.c - :functions: fpga_region_create - -.. kernel-doc:: drivers/fpga/fpga-region.c - :functions: fpga_region_free + :functions: devm_fpga_region_create .. kernel-doc:: drivers/fpga/fpga-region.c :functions: fpga_region_register .. kernel-doc:: drivers/fpga/fpga-region.c :functions: fpga_region_unregister + +.. kernel-doc:: drivers/fpga/fpga-mgr.c + :functions: fpga_mgr_get + +.. kernel-doc:: drivers/fpga/fpga-mgr.c + :functions: of_fpga_mgr_get + +.. kernel-doc:: drivers/fpga/fpga-mgr.c + :functions: fpga_mgr_put + +.. kernel-doc:: drivers/fpga/fpga-bridge.c + :functions: fpga_bridge_get_to_list + +.. kernel-doc:: drivers/fpga/fpga-bridge.c + :functions: of_fpga_bridge_get_to_list + +.. kernel-doc:: drivers/fpga/fpga-bridge.c + :functions: fpga_bridges_put diff --git a/Documentation/driver-api/fpga/index.rst b/Documentation/driver-api/fpga/index.rst index c51e5ebd544a..31a4773bd2e6 100644 --- a/Documentation/driver-api/fpga/index.rst +++ b/Documentation/driver-api/fpga/index.rst @@ -11,3 +11,5 @@ FPGA Subsystem fpga-mgr fpga-bridge fpga-region + fpga-programming + diff --git a/Documentation/driver-api/fpga/intro.rst b/Documentation/driver-api/fpga/intro.rst index 50d1cab84950..f54c7dabcc7d 100644 --- a/Documentation/driver-api/fpga/intro.rst +++ b/Documentation/driver-api/fpga/intro.rst @@ -44,7 +44,7 @@ FPGA Region ----------- If you are adding a new interface to the FPGA framework, add it on top -of an FPGA region to allow the most reuse of your interface. +of an FPGA region. The FPGA Region framework (fpga-region.c) associates managers and bridges as reconfigurable regions. A region may refer to the whole diff --git a/Documentation/driver-api/gpio/board.rst b/Documentation/driver-api/gpio/board.rst index 2c112553df84..a0f294e2e250 100644 --- a/Documentation/driver-api/gpio/board.rst +++ b/Documentation/driver-api/gpio/board.rst @@ -193,3 +193,27 @@ And the table can be added to the board code as follows:: The line will be hogged as soon as the gpiochip is created or - in case the chip was created earlier - when the hog table is registered. + +Arrays of pins +-------------- +In addition to requesting pins belonging to a function one by one, a device may +also request an array of pins assigned to the function. The way those pins are +mapped to the device determines if the array qualifies for fast bitmap +processing. If yes, a bitmap is passed over get/set array functions directly +between a caller and a respective .get/set_multiple() callback of a GPIO chip. + +In order to qualify for fast bitmap processing, the array must meet the +following requirements: +- pin hardware number of array member 0 must also be 0, +- pin hardware numbers of consecutive array members which belong to the same + chip as member 0 does must also match their array indexes. + +Otherwise fast bitmap processing path is not used in order to avoid consecutive +pins which belong to the same chip but are not in hardware order being processed +separately. + +If the array applies for fast bitmap processing path, pins which belong to +different chips than member 0 does, as well as those with indexes different from +their hardware pin numbers, are excluded from the fast path, both input and +output. Moreover, open drain and open source pins are excluded from fast bitmap +output processing. diff --git a/Documentation/driver-api/gpio/consumer.rst b/Documentation/driver-api/gpio/consumer.rst index aa03f389d41d..5e4d8aa68913 100644 --- a/Documentation/driver-api/gpio/consumer.rst +++ b/Documentation/driver-api/gpio/consumer.rst @@ -109,9 +109,11 @@ For a function using multiple GPIOs all of those can be obtained with one call:: enum gpiod_flags flags) This function returns a struct gpio_descs which contains an array of -descriptors:: +descriptors. It also contains a pointer to a gpiolib private structure which, +if passed back to get/set array functions, may speed up I/O proocessing:: struct gpio_descs { + struct gpio_array *info; unsigned int ndescs; struct gpio_desc *desc[]; } @@ -323,29 +325,37 @@ The following functions get or set the values of an array of GPIOs:: int gpiod_get_array_value(unsigned int array_size, struct gpio_desc **desc_array, - int *value_array); + struct gpio_array *array_info, + unsigned long *value_bitmap); int gpiod_get_raw_array_value(unsigned int array_size, struct gpio_desc **desc_array, - int *value_array); + struct gpio_array *array_info, + unsigned long *value_bitmap); int gpiod_get_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, - int *value_array); + struct gpio_array *array_info, + unsigned long *value_bitmap); int gpiod_get_raw_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, - int *value_array); - - void gpiod_set_array_value(unsigned int array_size, - struct gpio_desc **desc_array, - int *value_array) - void gpiod_set_raw_array_value(unsigned int array_size, - struct gpio_desc **desc_array, - int *value_array) - void gpiod_set_array_value_cansleep(unsigned int array_size, - struct gpio_desc **desc_array, - int *value_array) - void gpiod_set_raw_array_value_cansleep(unsigned int array_size, - struct gpio_desc **desc_array, - int *value_array) + struct gpio_array *array_info, + unsigned long *value_bitmap); + + int gpiod_set_array_value(unsigned int array_size, + struct gpio_desc **desc_array, + struct gpio_array *array_info, + unsigned long *value_bitmap) + int gpiod_set_raw_array_value(unsigned int array_size, + struct gpio_desc **desc_array, + struct gpio_array *array_info, + unsigned long *value_bitmap) + int gpiod_set_array_value_cansleep(unsigned int array_size, + struct gpio_desc **desc_array, + struct gpio_array *array_info, + unsigned long *value_bitmap) + int gpiod_set_raw_array_value_cansleep(unsigned int array_size, + struct gpio_desc **desc_array, + struct gpio_array *array_info, + unsigned long *value_bitmap) The array can be an arbitrary set of GPIOs. The functions will try to access GPIOs belonging to the same bank or chip simultaneously if supported by the @@ -356,8 +366,9 @@ accessed sequentially. The functions take three arguments: * array_size - the number of array elements * desc_array - an array of GPIO descriptors - * value_array - an array to store the GPIOs' values (get) or - an array of values to assign to the GPIOs (set) + * array_info - optional information obtained from gpiod_array_get() + * value_bitmap - a bitmap to store the GPIOs' values (get) or + a bitmap of values to assign to the GPIOs (set) The descriptor array can be obtained using the gpiod_get_array() function or one of its variants. If the group of descriptors returned by that function @@ -366,16 +377,25 @@ the struct gpio_descs returned by gpiod_get_array():: struct gpio_descs *my_gpio_descs = gpiod_get_array(...); gpiod_set_array_value(my_gpio_descs->ndescs, my_gpio_descs->desc, - my_gpio_values); + my_gpio_descs->info, my_gpio_value_bitmap); It is also possible to access a completely arbitrary array of descriptors. The descriptors may be obtained using any combination of gpiod_get() and gpiod_get_array(). Afterwards the array of descriptors has to be setup -manually before it can be passed to one of the above functions. +manually before it can be passed to one of the above functions. In that case, +array_info should be set to NULL. Note that for optimal performance GPIOs belonging to the same chip should be contiguous within the array of descriptors. +Still better performance may be achieved if array indexes of the descriptors +match hardware pin numbers of a single chip. If an array passed to a get/set +array function matches the one obtained from gpiod_get_array() and array_info +associated with the array is also passed, the function may take a fast bitmap +processing path, passing the value_bitmap argument directly to the respective +.get/set_multiple() callback of the chip. That allows for utilization of GPIO +banks as data I/O ports without much loss of performance. + The return value of gpiod_get_array_value() and its variants is 0 on success or negative on error. Note the difference to gpiod_get_value(), which returns 0 or 1 on success to convey the GPIO value. With the array functions, the GPIO diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst index cbe0242842d1..a6c14ff0c54f 100644 --- a/Documentation/driver-api/gpio/driver.rst +++ b/Documentation/driver-api/gpio/driver.rst @@ -374,7 +374,28 @@ When implementing an irqchip inside a GPIO driver, these two functions should typically be called in the .startup() and .shutdown() callbacks from the irqchip. -When using the gpiolib irqchip helpers, these callback are automatically +When using the gpiolib irqchip helpers, these callbacks are automatically +assigned. + + +Disabling and enabling IRQs +--------------------------- +When a GPIO is used as an IRQ signal, then gpiolib also needs to know if +the IRQ is enabled or disabled. In order to inform gpiolib about this, +a driver should call:: + + void gpiochip_disable_irq(struct gpio_chip *chip, unsigned int offset) + +This allows drivers to drive the GPIO as an output while the IRQ is +disabled. When the IRQ is enabled again, a driver should call:: + + void gpiochip_enable_irq(struct gpio_chip *chip, unsigned int offset) + +When implementing an irqchip inside a GPIO driver, these two functions should +typically be called in the .irq_disable() and .irq_enable() callbacks from the +irqchip. + +When using the gpiolib irqchip helpers, these callbacks are automatically assigned. Real-Time compliance for GPIO IRQ chips diff --git a/Documentation/driver-api/gpio/index.rst b/Documentation/driver-api/gpio/index.rst index 6a374ded1287..c5b8467f9104 100644 --- a/Documentation/driver-api/gpio/index.rst +++ b/Documentation/driver-api/gpio/index.rst @@ -38,7 +38,7 @@ Device tree support Device-managed API ================== -.. kernel-doc:: drivers/gpio/devres.c +.. kernel-doc:: drivers/gpio/gpiolib-devres.c :export: sysfs helpers diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index 6d9f2f9fe20e..909f991b4c0d 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -29,7 +29,8 @@ available subsections can be seen below. iio/index input usb/index - pci + firewire + pci/index spi i2c hsi diff --git a/Documentation/driver-api/mtdnand.rst b/Documentation/driver-api/mtdnand.rst index c55a6034c397..55447659b81f 100644 --- a/Documentation/driver-api/mtdnand.rst +++ b/Documentation/driver-api/mtdnand.rst @@ -180,10 +180,10 @@ by a chip select decoder. { struct nand_chip *this = mtd_to_nand(mtd); switch(cmd){ - case NAND_CTL_SETCLE: this->IO_ADDR_W |= CLE_ADRR_BIT; break; - case NAND_CTL_CLRCLE: this->IO_ADDR_W &= ~CLE_ADRR_BIT; break; - case NAND_CTL_SETALE: this->IO_ADDR_W |= ALE_ADRR_BIT; break; - case NAND_CTL_CLRALE: this->IO_ADDR_W &= ~ALE_ADRR_BIT; break; + case NAND_CTL_SETCLE: this->legacy.IO_ADDR_W |= CLE_ADRR_BIT; break; + case NAND_CTL_CLRCLE: this->legacy.IO_ADDR_W &= ~CLE_ADRR_BIT; break; + case NAND_CTL_SETALE: this->legacy.IO_ADDR_W |= ALE_ADRR_BIT; break; + case NAND_CTL_CLRALE: this->legacy.IO_ADDR_W &= ~ALE_ADRR_BIT; break; } } @@ -197,7 +197,7 @@ to read back the state of the pin. The function has no arguments and should return 0, if the device is busy (R/B pin is low) and 1, if the device is ready (R/B pin is high). If the hardware interface does not give access to the ready busy pin, then the function must not be defined -and the function pointer this->dev_ready is set to NULL. +and the function pointer this->legacy.dev_ready is set to NULL. Init function ------------- @@ -235,18 +235,18 @@ necessary information about the device. } /* Set address of NAND IO lines */ - this->IO_ADDR_R = baseaddr; - this->IO_ADDR_W = baseaddr; + this->legacy.IO_ADDR_R = baseaddr; + this->legacy.IO_ADDR_W = baseaddr; /* Reference hardware control function */ this->hwcontrol = board_hwcontrol; /* Set command delay time, see datasheet for correct value */ - this->chip_delay = CHIP_DEPENDEND_COMMAND_DELAY; + this->legacy.chip_delay = CHIP_DEPENDEND_COMMAND_DELAY; /* Assign the device ready function, if available */ - this->dev_ready = board_dev_ready; + this->legacy.dev_ready = board_dev_ready; this->eccmode = NAND_ECC_SOFT; /* Scan to find existence of the device */ - if (nand_scan (board_mtd, 1)) { + if (nand_scan (this, 1)) { err = -ENXIO; goto out_ior; } @@ -277,7 +277,7 @@ unregisters the partitions in the MTD layer. static void __exit board_cleanup (void) { /* Release resources, unregister device */ - nand_release (board_mtd); + nand_release (mtd_to_nand(board_mtd)); /* unmap physical address */ iounmap(baseaddr); @@ -336,17 +336,17 @@ connected to an address decoder. struct nand_chip *this = mtd_to_nand(mtd); /* Deselect all chips */ - this->IO_ADDR_R &= ~BOARD_NAND_ADDR_MASK; - this->IO_ADDR_W &= ~BOARD_NAND_ADDR_MASK; + this->legacy.IO_ADDR_R &= ~BOARD_NAND_ADDR_MASK; + this->legacy.IO_ADDR_W &= ~BOARD_NAND_ADDR_MASK; switch (chip) { case 0: - this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIP0; - this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIP0; + this->legacy.IO_ADDR_R |= BOARD_NAND_ADDR_CHIP0; + this->legacy.IO_ADDR_W |= BOARD_NAND_ADDR_CHIP0; break; .... case n: - this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIPn; - this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIPn; + this->legacy.IO_ADDR_R |= BOARD_NAND_ADDR_CHIPn; + this->legacy.IO_ADDR_W |= BOARD_NAND_ADDR_CHIPn; break; } } diff --git a/Documentation/driver-api/pci/index.rst b/Documentation/driver-api/pci/index.rst new file mode 100644 index 000000000000..c6cf1fef61ce --- /dev/null +++ b/Documentation/driver-api/pci/index.rst @@ -0,0 +1,22 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================ +The Linux PCI driver implementer's API guide +============================================ + +.. class:: toc-title + + Table of contents + +.. toctree:: + :maxdepth: 2 + + pci + p2pdma + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/driver-api/pci/p2pdma.rst b/Documentation/driver-api/pci/p2pdma.rst new file mode 100644 index 000000000000..4c577fa7bef9 --- /dev/null +++ b/Documentation/driver-api/pci/p2pdma.rst @@ -0,0 +1,145 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +PCI Peer-to-Peer DMA Support +============================ + +The PCI bus has pretty decent support for performing DMA transfers +between two devices on the bus. This type of transaction is henceforth +called Peer-to-Peer (or P2P). However, there are a number of issues that +make P2P transactions tricky to do in a perfectly safe way. + +One of the biggest issues is that PCI doesn't require forwarding +transactions between hierarchy domains, and in PCIe, each Root Port +defines a separate hierarchy domain. To make things worse, there is no +simple way to determine if a given Root Complex supports this or not. +(See PCIe r4.0, sec 1.3.1). Therefore, as of this writing, the kernel +only supports doing P2P when the endpoints involved are all behind the +same PCI bridge, as such devices are all in the same PCI hierarchy +domain, and the spec guarantees that all transactions within the +hierarchy will be routable, but it does not require routing +between hierarchies. + +The second issue is that to make use of existing interfaces in Linux, +memory that is used for P2P transactions needs to be backed by struct +pages. However, PCI BARs are not typically cache coherent so there are +a few corner case gotchas with these pages so developers need to +be careful about what they do with them. + + +Driver Writer's Guide +===================== + +In a given P2P implementation there may be three or more different +types of kernel drivers in play: + +* Provider - A driver which provides or publishes P2P resources like + memory or doorbell registers to other drivers. +* Client - A driver which makes use of a resource by setting up a + DMA transaction to or from it. +* Orchestrator - A driver which orchestrates the flow of data between + clients and providers. + +In many cases there could be overlap between these three types (i.e., +it may be typical for a driver to be both a provider and a client). + +For example, in the NVMe Target Copy Offload implementation: + +* The NVMe PCI driver is both a client, provider and orchestrator + in that it exposes any CMB (Controller Memory Buffer) as a P2P memory + resource (provider), it accepts P2P memory pages as buffers in requests + to be used directly (client) and it can also make use of the CMB as + submission queue entries (orchastrator). +* The RDMA driver is a client in this arrangement so that an RNIC + can DMA directly to the memory exposed by the NVMe device. +* The NVMe Target driver (nvmet) can orchestrate the data from the RNIC + to the P2P memory (CMB) and then to the NVMe device (and vice versa). + +This is currently the only arrangement supported by the kernel but +one could imagine slight tweaks to this that would allow for the same +functionality. For example, if a specific RNIC added a BAR with some +memory behind it, its driver could add support as a P2P provider and +then the NVMe Target could use the RNIC's memory instead of the CMB +in cases where the NVMe cards in use do not have CMB support. + + +Provider Drivers +---------------- + +A provider simply needs to register a BAR (or a portion of a BAR) +as a P2P DMA resource using :c:func:`pci_p2pdma_add_resource()`. +This will register struct pages for all the specified memory. + +After that it may optionally publish all of its resources as +P2P memory using :c:func:`pci_p2pmem_publish()`. This will allow +any orchestrator drivers to find and use the memory. When marked in +this way, the resource must be regular memory with no side effects. + +For the time being this is fairly rudimentary in that all resources +are typically going to be P2P memory. Future work will likely expand +this to include other types of resources like doorbells. + + +Client Drivers +-------------- + +A client driver typically only has to conditionally change its DMA map +routine to use the mapping function :c:func:`pci_p2pdma_map_sg()` instead +of the usual :c:func:`dma_map_sg()` function. Memory mapped in this +way does not need to be unmapped. + +The client may also, optionally, make use of +:c:func:`is_pci_p2pdma_page()` to determine when to use the P2P mapping +functions and when to use the regular mapping functions. In some +situations, it may be more appropriate to use a flag to indicate a +given request is P2P memory and map appropriately. It is important to +ensure that struct pages that back P2P memory stay out of code that +does not have support for them as other code may treat the pages as +regular memory which may not be appropriate. + + +Orchestrator Drivers +-------------------- + +The first task an orchestrator driver must do is compile a list of +all client devices that will be involved in a given transaction. For +example, the NVMe Target driver creates a list including the namespace +block device and the RNIC in use. If the orchestrator has access to +a specific P2P provider to use it may check compatibility using +:c:func:`pci_p2pdma_distance()` otherwise it may find a memory provider +that's compatible with all clients using :c:func:`pci_p2pmem_find()`. +If more than one provider is supported, the one nearest to all the clients will +be chosen first. If more than one provider is an equal distance away, the +one returned will be chosen at random (it is not an arbitrary but +truely random). This function returns the PCI device to use for the provider +with a reference taken and therefore when it's no longer needed it should be +returned with pci_dev_put(). + +Once a provider is selected, the orchestrator can then use +:c:func:`pci_alloc_p2pmem()` and :c:func:`pci_free_p2pmem()` to +allocate P2P memory from the provider. :c:func:`pci_p2pmem_alloc_sgl()` +and :c:func:`pci_p2pmem_free_sgl()` are convenience functions for +allocating scatter-gather lists with P2P memory. + +Struct Page Caveats +------------------- + +Driver writers should be very careful about not passing these special +struct pages to code that isn't prepared for it. At this time, the kernel +interfaces do not have any checks for ensuring this. This obviously +precludes passing these pages to userspace. + +P2P memory is also technically IO memory but should never have any side +effects behind it. Thus, the order of loads and stores should not be important +and ioreadX(), iowriteX() and friends should not be necessary. +However, as the memory is not cache coherent, if access ever needs to +be protected by a spinlock then :c:func:`mmiowb()` must be used before +unlocking the lock. (See ACQUIRES VS I/O ACCESSES in +Documentation/memory-barriers.txt) + + +P2P DMA Support Library +======================= + +.. kernel-doc:: drivers/pci/p2pdma.c + :export: diff --git a/Documentation/driver-api/pci.rst b/Documentation/driver-api/pci/pci.rst similarity index 100% rename from Documentation/driver-api/pci.rst rename to Documentation/driver-api/pci/pci.rst diff --git a/Documentation/driver-api/soundwire/stream.rst b/Documentation/driver-api/soundwire/stream.rst index 29121aa55fb9..26a6064503fd 100644 --- a/Documentation/driver-api/soundwire/stream.rst +++ b/Documentation/driver-api/soundwire/stream.rst @@ -101,6 +101,34 @@ interface. :: +--------------------+ | | +----------------+ +Example 5: Stereo Stream with L and R channel is rendered by 2 Masters, each +rendering one channel, and is received by two different Slaves, each +receiving one channel. Both Masters and both Slaves are using single port. :: + + +---------------+ Clock Signal +---------------+ + | Master +----------------------------------+ Slave | + | Interface | | Interface | + | 1 | | 1 | + | | Data Signal | | + | L +----------------------------------+ L | + | (Data) | Data Direction | (Data) | + +---------------+ +-----------------------> +---------------+ + + +---------------+ Clock Signal +---------------+ + | Master +----------------------------------+ Slave | + | Interface | | Interface | + | 2 | | 2 | + | | Data Signal | | + | R +----------------------------------+ R | + | (Data) | Data Direction | (Data) | + +---------------+ +-----------------------> +---------------+ + +Note: In multi-link cases like above, to lock, one would acquire a global +lock and then go on locking bus instances. But, in this case the caller +framework(ASoC DPCM) guarantees that stream operations on a card are +always serialized. So, there is no race condition and hence no need for +global lock. + SoundWire Stream Management flow ================================ @@ -174,6 +202,7 @@ per stream. From ASoC DPCM framework, this stream state maybe linked to .startup() operation. .. code-block:: c + int sdw_alloc_stream(char * stream_name); @@ -200,6 +229,7 @@ only be invoked once by respective Master(s) and Slave(s). From ASoC DPCM framework, this stream state is linked to .hw_params() operation. .. code-block:: c + int sdw_stream_add_master(struct sdw_bus * bus, struct sdw_stream_config * stream_config, struct sdw_ports_config * ports_config, @@ -245,6 +275,7 @@ stream. From ASoC DPCM framework, this stream state is linked to .prepare() operation. .. code-block:: c + int sdw_prepare_stream(struct sdw_stream_runtime * stream); @@ -274,6 +305,7 @@ stream. From ASoC DPCM framework, this stream state is linked to .trigger() start operation. .. code-block:: c + int sdw_enable_stream(struct sdw_stream_runtime * stream); SDW_STREAM_DISABLED @@ -301,6 +333,7 @@ per stream. From ASoC DPCM framework, this stream state is linked to .trigger() stop operation. .. code-block:: c + int sdw_disable_stream(struct sdw_stream_runtime * stream); @@ -325,6 +358,7 @@ per stream. From ASoC DPCM framework, this stream state is linked to .trigger() stop operation. .. code-block:: c + int sdw_deprepare_stream(struct sdw_stream_runtime * stream); @@ -349,6 +383,7 @@ all the Master(s) and Slave(s) associated with stream. From ASoC DPCM framework, this stream state is linked to .hw_free() operation. .. code-block:: c + int sdw_stream_remove_master(struct sdw_bus * bus, struct sdw_stream_runtime * stream); int sdw_stream_remove_slave(struct sdw_slave * slave, @@ -361,6 +396,7 @@ stream assigned as part of ALLOCATED state. In .shutdown() the data structure maintaining stream state are freed up. .. code-block:: c + void sdw_release_stream(struct sdw_stream_runtime * stream); Not Supported diff --git a/Documentation/driver-api/uio-howto.rst b/Documentation/driver-api/uio-howto.rst index fb2eb73be4a3..25f50eace28b 100644 --- a/Documentation/driver-api/uio-howto.rst +++ b/Documentation/driver-api/uio-howto.rst @@ -463,8 +463,8 @@ Getting information about your UIO device Information about all UIO devices is available in sysfs. The first thing you should do in your driver is check ``name`` and ``version`` to make -sure your talking to the right device and that its kernel driver has the -version you expect. +sure you're talking to the right device and that its kernel driver has +the version you expect. You should also make sure that the memory mapping you need exists and has the size you expect. diff --git a/Documentation/efi-stub.txt b/Documentation/efi-stub.txt index 41df801f9a50..833edb0d0bc4 100644 --- a/Documentation/efi-stub.txt +++ b/Documentation/efi-stub.txt @@ -83,7 +83,18 @@ is passed to bzImage.efi. The "dtb=" option ----------------- -For the ARM and arm64 architectures, we also need to be able to provide a -device tree to the kernel. This is done with the "dtb=" command line option, -and is processed in the same manner as the "initrd=" option that is +For the ARM and arm64 architectures, a device tree must be provided to +the kernel. Normally firmware shall supply the device tree via the +EFI CONFIGURATION TABLE. However, the "dtb=" command line option can +be used to override the firmware supplied device tree, or to supply +one when firmware is unable to. + +Please note: Firmware adds runtime configuration information to the +device tree before booting the kernel. If dtb= is used to override +the device tree, then any runtime data provided by firmware will be +lost. The dtb= option should only be used either as a debug tool, or +as a last resort when a device tree is not provided in the EFI +CONFIGURATION TABLE. + +"dtb=" is processed in the same manner as the "initrd=" option that is described above. diff --git a/Documentation/fb/00-INDEX b/Documentation/fb/00-INDEX deleted file mode 100644 index fe85e7c5907a..000000000000 --- a/Documentation/fb/00-INDEX +++ /dev/null @@ -1,75 +0,0 @@ -Index of files in Documentation/fb. If you think something about frame -buffer devices needs an entry here, needs correction or you've written one -please mail me. - Geert Uytterhoeven - -00-INDEX - - this file. -api.txt - - The frame buffer API between applications and buffer devices. -arkfb.txt - - info on the fbdev driver for ARK Logic chips. -aty128fb.txt - - info on the ATI Rage128 frame buffer driver. -cirrusfb.txt - - info on the driver for Cirrus Logic chipsets. -cmap_xfbdev.txt - - an introduction to fbdev's cmap structures. -deferred_io.txt - - an introduction to deferred IO. -efifb.txt - - info on the EFI platform driver for Intel based Apple computers. -ep93xx-fb.txt - - info on the driver for EP93xx LCD controller. -fbcon.txt - - intro to and usage guide for the framebuffer console (fbcon). -framebuffer.txt - - introduction to frame buffer devices. -gxfb.txt - - info on the framebuffer driver for AMD Geode GX2 based processors. -intel810.txt - - documentation for the Intel 810/815 framebuffer driver. -intelfb.txt - - docs for Intel 830M/845G/852GM/855GM/865G/915G/945G fb driver. -internals.txt - - quick overview of frame buffer device internals. -lxfb.txt - - info on the framebuffer driver for AMD Geode LX based processors. -matroxfb.txt - - info on the Matrox framebuffer driver for Alpha, Intel and PPC. -metronomefb.txt - - info on the driver for the Metronome display controller. -modedb.txt - - info on the video mode database. -pvr2fb.txt - - info on the PowerVR 2 frame buffer driver. -pxafb.txt - - info on the driver for the PXA25x LCD controller. -s3fb.txt - - info on the fbdev driver for S3 Trio/Virge chips. -sa1100fb.txt - - information about the driver for the SA-1100 LCD controller. -sh7760fb.txt - - info on the SH7760/SH7763 integrated LCDC Framebuffer driver. -sisfb.txt - - info on the framebuffer device driver for various SiS chips. -sm501.txt - - info on the framebuffer device driver for sm501 videoframebuffer. -sstfb.txt - - info on the frame buffer driver for 3dfx' Voodoo Graphics boards. -tgafb.txt - - info on the TGA (DECChip 21030) frame buffer driver. -tridentfb.txt - info on the framebuffer driver for some Trident chip based cards. -udlfb.txt - - Driver for DisplayLink USB 2.0 chips. -uvesafb.txt - - info on the userspace VESA (VBE2+ compliant) frame buffer device. -vesafb.txt - - info on the VESA frame buffer device. -viafb.modes - - list of modes for VIA Integration Graphic Chip. -viafb.txt - - info on the VIA Integration Graphic Chip console framebuffer driver. -vt8623fb.txt - - info on the fb driver for the graphics core in VIA VT8623 chipsets. diff --git a/Documentation/fb/uvesafb.txt b/Documentation/fb/uvesafb.txt index f6362d88763b..aa924196c366 100644 --- a/Documentation/fb/uvesafb.txt +++ b/Documentation/fb/uvesafb.txt @@ -15,7 +15,8 @@ than x86. Check the v86d documentation for a list of currently supported arches. v86d source code can be downloaded from the following website: - http://dev.gentoo.org/~spock/projects/uvesafb + + https://github.com/mjanusz/v86d Please refer to the v86d documentation for detailed configuration and installation instructions. @@ -177,7 +178,7 @@ from the Video BIOS if you set pixclock to 0 in fb_var_screeninfo. -- Michal Januszewski - Last updated: 2009-03-30 + Last updated: 2017-10-10 Documentation of the uvesafb options is loosely based on vesafb.txt. diff --git a/Documentation/fb/vesafb.txt b/Documentation/fb/vesafb.txt index 950d5a658cb3..413bb73235be 100644 --- a/Documentation/fb/vesafb.txt +++ b/Documentation/fb/vesafb.txt @@ -114,11 +114,11 @@ to turn it on. You can pass options to vesafb using "video=vesafb:option" on the kernel command line. Multiple options should be separated -by comma, like this: "video=vesafb:ypan,invers" +by comma, like this: "video=vesafb:ypan,inverse" Accepted options: -invers no comment... +inverse use inverse color map ypan enable display panning using the VESA protected mode interface. The visible screen is just a window of the diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX deleted file mode 100644 index 0937bade1099..000000000000 --- a/Documentation/filesystems/00-INDEX +++ /dev/null @@ -1,153 +0,0 @@ -00-INDEX - - this file (info on some of the filesystems supported by linux). -Locking - - info on locking rules as they pertain to Linux VFS. -9p.txt - - 9p (v9fs) is an implementation of the Plan 9 remote fs protocol. -adfs.txt - - info and mount options for the Acorn Advanced Disc Filing System. -afs.txt - - info and examples for the distributed AFS (Andrew File System) fs. -affs.txt - - info and mount options for the Amiga Fast File System. -autofs-mount-control.txt - - info on device control operations for autofs module. -automount-support.txt - - information about filesystem automount support. -befs.txt - - information about the BeOS filesystem for Linux. -bfs.txt - - info for the SCO UnixWare Boot Filesystem (BFS). -btrfs.txt - - info for the BTRFS filesystem. -caching/ - - directory containing filesystem cache documentation. -ceph.txt - - info for the Ceph Distributed File System. -cifs/ - - directory containing CIFS filesystem documentation and example code. -coda.txt - - description of the CODA filesystem. -configfs/ - - directory containing configfs documentation and example code. -cramfs.txt - - info on the cram filesystem for small storage (ROMs etc). -dax.txt - - info on avoiding the page cache for files stored on CPU-addressable - storage devices. -debugfs.txt - - info on the debugfs filesystem. -devpts.txt - - info on the devpts filesystem. -directory-locking - - info about the locking scheme used for directory operations. -dlmfs.txt - - info on the userspace interface to the OCFS2 DLM. -dnotify.txt - - info about directory notification in Linux. -dnotify_test.c - - example program for dnotify. -ecryptfs.txt - - docs on eCryptfs: stacked cryptographic filesystem for Linux. -efivarfs.txt - - info for the efivarfs filesystem. -exofs.txt - - info, usage, mount options, design about EXOFS. -ext2.txt - - info, mount options and specifications for the Ext2 filesystem. -ext3.txt - - info, mount options and specifications for the Ext3 filesystem. -ext4.txt - - info, mount options and specifications for the Ext4 filesystem. -f2fs.txt - - info and mount options for the F2FS filesystem. -fiemap.txt - - info on fiemap ioctl. -files.txt - - info on file management in the Linux kernel. -fuse.txt - - info on the Filesystem in User SpacE including mount options. -gfs2-glocks.txt - - info on the Global File System 2 - Glock internal locking rules. -gfs2-uevents.txt - - info on the Global File System 2 - uevents. -gfs2.txt - - info on the Global File System 2. -hfs.txt - - info on the Macintosh HFS Filesystem for Linux. -hfsplus.txt - - info on the Macintosh HFSPlus Filesystem for Linux. -hpfs.txt - - info and mount options for the OS/2 HPFS. -inotify.txt - - info on the powerful yet simple file change notification system. -isofs.txt - - info and mount options for the ISO 9660 (CDROM) filesystem. -jfs.txt - - info and mount options for the JFS filesystem. -locks.txt - - info on file locking implementations, flock() vs. fcntl(), etc. -mandatory-locking.txt - - info on the Linux implementation of Sys V mandatory file locking. -nfs/ - - nfs-related documentation. -nilfs2.txt - - info and mount options for the NILFS2 filesystem. -ntfs.txt - - info and mount options for the NTFS filesystem (Windows NT). -ocfs2.txt - - info and mount options for the OCFS2 clustered filesystem. -omfs.txt - - info on the Optimized MPEG FileSystem. -path-lookup.txt - - info on path walking and name lookup locking. -pohmelfs/ - - directory containing pohmelfs filesystem documentation. -porting - - various information on filesystem porting. -proc.txt - - info on Linux's /proc filesystem. -qnx6.txt - - info on the QNX6 filesystem. -quota.txt - - info on Quota subsystem. -ramfs-rootfs-initramfs.txt - - info on the 'in memory' filesystems ramfs, rootfs and initramfs. -relay.txt - - info on relay, for efficient streaming from kernel to user space. -romfs.txt - - description of the ROMFS filesystem. -seq_file.txt - - how to use the seq_file API. -sharedsubtree.txt - - a description of shared subtrees for namespaces. -spufs.txt - - info and mount options for the SPU filesystem used on Cell. -squashfs.txt - - info on the squashfs filesystem. -sysfs-pci.txt - - info on accessing PCI device resources through sysfs. -sysfs-tagging.txt - - info on sysfs tagging to avoid duplicates. -sysfs.txt - - info on sysfs, a ram-based filesystem for exporting kernel objects. -sysv-fs.txt - - info on the SystemV/V7/Xenix/Coherent filesystem. -tmpfs.txt - - info on tmpfs, a filesystem that holds all files in virtual memory. -ubifs.txt - - info on the Unsorted Block Images FileSystem. -udf.txt - - info and mount options for the UDF filesystem. -ufs.txt - - info on the ufs filesystem. -vfat.txt - - info on using the VFAT filesystem used in Windows NT and Windows 95. -vfs.txt - - overview of the Virtual File System. -xfs-delayed-logging-design.txt - - info on the XFS Delayed Logging Design. -xfs-self-describing-metadata.txt - - info on XFS Self Describing Metadata. -xfs.txt - - info and mount options for the XFS filesystem. diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index 8bf62240e10d..1177052701e1 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -151,6 +151,11 @@ Mount Options Report overall filesystem usage in statfs instead of using the root directory quota. + nocopyfrom + Don't use the RADOS 'copy-from' operation to perform remote object + copies. Currently, it's only used in copy_file_range, which will revert + to the default VFS implementation if this option is used. + More Information ================ diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 70cb68bed2e8..bc393e0a22b8 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -75,7 +75,7 @@ exposure of uninitialized data through mmap. These filesystems may be used for inspiration: - ext2: see Documentation/filesystems/ext2.txt -- ext4: see Documentation/filesystems/ext4.txt +- ext4: see Documentation/filesystems/ext4/ext4.rst - xfs: see Documentation/filesystems/xfs.txt diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt index 81c0becab225..a45c9fc0747b 100644 --- a/Documentation/filesystems/ext2.txt +++ b/Documentation/filesystems/ext2.txt @@ -358,7 +358,7 @@ and are copied into the filesystem. If a transaction is incomplete at the time of the crash, then there is no guarantee of consistency for the blocks in that transaction so they are discarded (which means any filesystem changes they represent are also lost). -Check Documentation/filesystems/ext4.txt if you want to read more about +Check Documentation/filesystems/ext4/ext4.rst if you want to read more about ext4 and journaling. References diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index e5edd29687b5..e46c2147ddf8 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -172,9 +172,10 @@ fault_type=%d Support configuring fault injection type, should be FAULT_DIR_DEPTH 0x000000100 FAULT_EVICT_INODE 0x000000200 FAULT_TRUNCATE 0x000000400 - FAULT_IO 0x000000800 + FAULT_READ_IO 0x000000800 FAULT_CHECKPOINT 0x000001000 FAULT_DISCARD 0x000002000 + FAULT_WRITE_IO 0x000004000 mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. @@ -211,6 +212,11 @@ fsync_mode=%s Control the policy of fsync. Currently supports "posix", non-atomic files likewise "nobarrier" mount option. test_dummy_encryption Enable dummy encryption, which provides a fake fscrypt context. The fake fscrypt context is used by xfstests. +checkpoint=%s Set to "disable" to turn off checkpointing. Set to "enable" + to reenable checkpointing. Is enabled by default. While + disabled, any unmounting or unexpected shutdowns will cause + the filesystem contents to appear as they did when the + filesystem was mounted with that option. ================================================================================ DEBUGFS ENTRIES diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index 48b424de85bb..cfbc18f0d9c9 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -191,21 +191,11 @@ Currently, the following pairs of encryption modes are supported: - AES-256-XTS for contents and AES-256-CTS-CBC for filenames - AES-128-CBC for contents and AES-128-CTS-CBC for filenames -- Speck128/256-XTS for contents and Speck128/256-CTS-CBC for filenames It is strongly recommended to use AES-256-XTS for contents encryption. AES-128-CBC was added only for low-powered embedded devices with crypto accelerators such as CAAM or CESA that do not support XTS. -Similarly, Speck128/256 support was only added for older or low-end -CPUs which cannot do AES fast enough -- especially ARM CPUs which have -NEON instructions but not the Cryptography Extensions -- and for which -it would not otherwise be feasible to use encryption at all. It is -not recommended to use Speck on CPUs that have AES instructions. -Speck support is only available if it has been enabled in the crypto -API via CONFIG_CRYPTO_SPECK. Also, on ARM platforms, to get -acceptable performance CONFIG_CRYPTO_SPECK_NEON must be enabled. - New encryption modes can be added relatively easily, without changes to individual filesystems. However, authenticated encryption (AE) modes are not currently supported because of the difficulty of dealing diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX deleted file mode 100644 index 53f3b596ac0d..000000000000 --- a/Documentation/filesystems/nfs/00-INDEX +++ /dev/null @@ -1,26 +0,0 @@ -00-INDEX - - this file (nfs-related documentation). -Exporting - - explanation of how to make filesystems exportable. -fault_injection.txt - - information for using fault injection on the server -knfsd-stats.txt - - statistics which the NFS server makes available to user space. -nfs.txt - - nfs client, and DNS resolution for fs_locations. -nfs41-server.txt - - info on the Linux server implementation of NFSv4 minor version 1. -nfs-rdma.txt - - how to install and setup the Linux NFS/RDMA client and server software -nfsd-admin-interfaces.txt - - Administrative interfaces for nfsd. -nfsroot.txt - - short guide on setting up a diskless box with NFS root filesystem. -pnfs.txt - - short explanation of some of the internals of the pnfs client code -rpc-cache.txt - - introduction to the caching mechanisms in the sunrpc layer. -idmapper.txt - - information for configuring request-keys to be used by idmapper -rpc-server-gss.txt - - Information on GSS authentication support in the NFS Server diff --git a/Documentation/filesystems/nfs/rpc-cache.txt b/Documentation/filesystems/nfs/rpc-cache.txt index ebcaaee21616..c4dac829db0f 100644 --- a/Documentation/filesystems/nfs/rpc-cache.txt +++ b/Documentation/filesystems/nfs/rpc-cache.txt @@ -84,7 +84,7 @@ Creating a Cache A message from user space has arrived to fill out a cache entry. It is in 'buf' of length 'len'. cache_parse should parse this, find the item in the - cache with sunrpc_cache_lookup, and update the item + cache with sunrpc_cache_lookup_rcu, and update the item with sunrpc_cache_update. @@ -95,7 +95,7 @@ Creating a Cache Using a cache ------------- -To find a value in a cache, call sunrpc_cache_lookup passing a pointer +To find a value in a cache, call sunrpc_cache_lookup_rcu passing a pointer to the cache_head in a sample item with the 'key' fields filled in. This will be passed to ->match to identify the target entry. If no entry is found, a new entry will be create, added to the cache, and @@ -116,7 +116,7 @@ item does become valid, the deferred copy of the request will be revisited (->revisit). It is expected that this method will reschedule the request for processing. -The value returned by sunrpc_cache_lookup can also be passed to +The value returned by sunrpc_cache_lookup_rcu can also be passed to sunrpc_cache_update to set the content for the item. A second item is passed which should hold the content. If the item found by _lookup has valid data, then it is discarded and a new item is created. This diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index 51c136c821bf..eef7d9d259e8 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -286,6 +286,12 @@ pointed by REDIRECT. This should not be possible on local system as setting "trusted." xattrs will require CAP_SYS_ADMIN. But it should be possible for untrusted layers like from a pen drive. +Note: redirect_dir={off|nofollow|follow(*)} conflicts with metacopy=on, and +results in an error. + +(*) redirect_dir=follow only conflicts with metacopy=on if upperdir=... is +given. + Sharing and copying layers -------------------------- diff --git a/Documentation/filesystems/pohmelfs/design_notes.txt b/Documentation/filesystems/pohmelfs/design_notes.txt deleted file mode 100644 index 106d17fbb05f..000000000000 --- a/Documentation/filesystems/pohmelfs/design_notes.txt +++ /dev/null @@ -1,72 +0,0 @@ -POHMELFS: Parallel Optimized Host Message Exchange Layered File System. - - Evgeniy Polyakov - -Homepage: http://www.ioremap.net/projects/pohmelfs - -POHMELFS first began as a network filesystem with coherent local data and -metadata caches but is now evolving into a parallel distributed filesystem. - -Main features of this FS include: - * Locally coherent cache for data and metadata with (potentially) byte-range locks. - Since all Linux filesystems lock the whole inode during writing, algorithm - is very simple and does not use byte-ranges, although they are sent in - locking messages. - * Completely async processing of all events except creation of hard and symbolic - links, and rename events. - Object creation and data reading and writing are processed asynchronously. - * Flexible object architecture optimized for network processing. - Ability to create long paths to objects and remove arbitrarily huge - directories with a single network command. - (like removing the whole kernel tree via a single network command). - * Very high performance. - * Fast and scalable multithreaded userspace server. Being in userspace it works - with any underlying filesystem and still is much faster than async in-kernel NFS one. - * Client is able to switch between different servers (if one goes down, client - automatically reconnects to second and so on). - * Transactions support. Full failover for all operations. - Resending transactions to different servers on timeout or error. - * Read request (data read, directory listing, lookup requests) balancing between multiple servers. - * Write requests are replicated to multiple servers and completed only when all of them are acked. - * Ability to add and/or remove servers from the working set at run-time. - * Strong authentication and possible data encryption in network channel. - * Extended attributes support. - -POHMELFS is based on transactions, which are potentially long-standing objects that live -in the client's memory. Each transaction contains all the information needed to process a given -command (or set of commands, which is frequently used during data writing: single transactions -can contain creation and data writing commands). Transactions are committed by all the servers -to which they are sent and, in case of failures, are eventually resent or dropped with an error. -For example, reading will return an error if no servers are available. - -POHMELFS uses a asynchronous approach to data processing. Courtesy of transactions, it is -possible to detach replies from requests and, if the command requires data to be received, the -caller sleeps waiting for it. Thus, it is possible to issue multiple read commands to different -servers and async threads will pick up replies in parallel, find appropriate transactions in the -system and put the data where it belongs (like the page or inode cache). - -The main feature of POHMELFS is writeback data and the metadata cache. -Only a few non-performance critical operations use the write-through cache and -are synchronous: hard and symbolic link creation, and object rename. Creation, -removal of objects and data writing are asynchronous and are sent to -the server during system writeback. Only one writer at a time is allowed for any -given inode, which is guarded by an appropriate locking protocol. -Because of this feature, POHMELFS is extremely fast at metadata intensive -workloads and can fully utilize the bandwidth to the servers when doing bulk -data transfers. - -POHMELFS clients operate with a working set of servers and are capable of balancing read-only -operations (like lookups or directory listings) between them according to IO priorities. -Administrators can add or remove servers from the set at run-time via special commands (described -in Documentation/filesystems/pohmelfs/info.txt file). Writes are replicated to all servers, which -are connected with write permission turned on. IO priority and permissions can be changed in -run-time. - -POHMELFS is capable of full data channel encryption and/or strong crypto hashing. -One can select any kernel supported cipher, encryption mode, hash type and operation mode -(hmac or digest). It is also possible to use both or neither (default). Crypto configuration -is checked during mount time and, if the server does not support it, appropriate capabilities -will be disabled or mount will fail (if 'crypto_fail_unsupported' mount option is specified). -Crypto performance heavily depends on the number of crypto threads, which asynchronously perform -crypto operations and send the resulting data to server or submit it up the stack. This number -can be controlled via a mount option. diff --git a/Documentation/filesystems/pohmelfs/info.txt b/Documentation/filesystems/pohmelfs/info.txt deleted file mode 100644 index db2e41393626..000000000000 --- a/Documentation/filesystems/pohmelfs/info.txt +++ /dev/null @@ -1,99 +0,0 @@ -POHMELFS usage information. - -Mount options. -All but index, number of crypto threads and maximum IO size can changed via remount. - -idx=%u - Each mountpoint is associated with a special index via this option. - Administrator can add or remove servers from the given index, so all mounts, - which were attached to it, are updated. - Default it is 0. - -trans_scan_timeout=%u - This timeout, expressed in milliseconds, specifies time to scan transaction - trees looking for stale requests, which have to be resent, or if number of - retries exceed specified limit, dropped with error. - Default is 5 seconds. - -drop_scan_timeout=%u - Internal timeout, expressed in milliseconds, which specifies how frequently - inodes marked to be dropped are freed. It also specifies how frequently - the system checks that servers have to be added or removed from current working set. - Default is 1 second. - -wait_on_page_timeout=%u - Number of milliseconds to wait for reply from remote server for data reading command. - If this timeout is exceeded, reading returns an error. - Default is 5 seconds. - -trans_retries=%u - This is the number of times that a transaction will be resent to a server that did - not answer for the last @trans_scan_timeout milliseconds. - When the number of resends exceeds this limit, the transaction is completed with error. - Default is 5 resends. - -crypto_thread_num=%u - Number of crypto processing threads. Threads are used both for RX and TX traffic. - Default is 2, or no threads if crypto operations are not supported. - -trans_max_pages=%u - Maximum number of pages in a single transaction. This parameter also controls - the number of pages, allocated for crypto processing (each crypto thread has - pool of pages, the number of which is equal to 'trans_max_pages'. - Default is 100 pages. - -crypto_fail_unsupported - If specified, mount will fail if the server does not support requested crypto operations. - By default mount will disable non-matching crypto operations. - -mcache_timeout=%u - Maximum number of milliseconds to wait for the mcache objects to be processed. - Mcache includes locks (given lock should be granted by server), attributes (they should be - fully received in the given timeframe). - Default is 5 seconds. - -Usage examples. - -Add server server1.net:1025 into the working set with index $idx -with appropriate hash algorithm and key file and cipher algorithm, mode and key file: -$cfg A add -a server1.net -p 1025 -i $idx -K $hash_key -k $cipher_key - -Mount filesystem with given index $idx to /mnt mountpoint. -Client will connect to all servers specified in the working set via previous command: -mount -t pohmel -o idx=$idx q /mnt - -Change permissions to read-only (-I 1 option, '-I 2' - write-only, 3 - rw): -$cfg A modify -a server1.net -p 1025 -i $idx -I 1 - -Change IO priority to 123 (node with the highest priority gets read requests). -$cfg A modify -a server1.net -p 1025 -i $idx -P 123 - -One can check currect status of all connections in the mountstats file: -# cat /proc/$PID/mountstats -... -device none mounted on /mnt with fstype pohmel -idx addr(:port) socket_type protocol active priority permissions -0 server1.net:1026 1 6 1 250 1 -0 server2.net:1025 1 6 1 123 3 - -Server installation. - -Creating a server, which listens at port 1025 and 0.0.0.0 address. -Working root directory (note, that server chroots there, so you have to have appropriate permissions) -is set to /mnt, server will negotiate hash/cipher with client, in case client requested it, there -are appropriate key files. -Number of working threads is set to 10. - -# ./fserver -a 0.0.0.0 -p 1025 -r /mnt -w 10 -K hash_key -k cipher_key - - -A 6 - listen on ipv6 address. Default: Disabled. - -r root - path to root directory. Default: /tmp. - -a addr - listen address. Default: 0.0.0.0. - -p port - listen port. Default: 1025. - -w workers - number of workers per connected client. Default: 1. - -K file - hash key size. Default: none. - -k file - cipher key size. Default: none. - -h - this help. - -Number of worker threads specifies how many workers will be created for each client. -Bulk single-client transafers usually are better handled with smaller number (like 1-3). diff --git a/Documentation/filesystems/pohmelfs/network_protocol.txt b/Documentation/filesystems/pohmelfs/network_protocol.txt deleted file mode 100644 index c680b4b5353d..000000000000 --- a/Documentation/filesystems/pohmelfs/network_protocol.txt +++ /dev/null @@ -1,227 +0,0 @@ -POHMELFS network protocol. - -Basic structure used in network communication is following command: - -struct netfs_cmd -{ - __u16 cmd; /* Command number */ - __u16 csize; /* Attached crypto information size */ - __u16 cpad; /* Attached padding size */ - __u16 ext; /* External flags */ - __u32 size; /* Size of the attached data */ - __u32 trans; /* Transaction id */ - __u64 id; /* Object ID to operate on. Used for feedback.*/ - __u64 start; /* Start of the object. */ - __u64 iv; /* IV sequence */ - __u8 data[0]; -}; - -Commands can be embedded into transaction command (which in turn has own command), -so one can extend protocol as needed without breaking backward compatibility as long -as old commands are supported. All string lengths include tail 0 byte. - -All commands are transferred over the network in big-endian. CPU endianness is used at the end peers. - -@cmd - command number, which specifies command to be processed. Following - commands are used currently: - - NETFS_READDIR = 1, /* Read directory for given inode number */ - NETFS_READ_PAGE, /* Read data page from the server */ - NETFS_WRITE_PAGE, /* Write data page to the server */ - NETFS_CREATE, /* Create directory entry */ - NETFS_REMOVE, /* Remove directory entry */ - NETFS_LOOKUP, /* Lookup single object */ - NETFS_LINK, /* Create a link */ - NETFS_TRANS, /* Transaction */ - NETFS_OPEN, /* Open intent */ - NETFS_INODE_INFO, /* Metadata cache coherency synchronization message */ - NETFS_PAGE_CACHE, /* Page cache invalidation message */ - NETFS_READ_PAGES, /* Read multiple contiguous pages in one go */ - NETFS_RENAME, /* Rename object */ - NETFS_CAPABILITIES, /* Capabilities of the client, for example supported crypto */ - NETFS_LOCK, /* Distributed lock message */ - NETFS_XATTR_SET, /* Set extended attribute */ - NETFS_XATTR_GET, /* Get extended attribute */ - -@ext - external flags. Used by different commands to specify some extra arguments - like partial size of the embedded objects or creation flags. - -@size - size of the attached data. For NETFS_READ_PAGE and NETFS_READ_PAGES no data is attached, - but size of the requested data is incorporated here. It does not include size of the command - header (struct netfs_cmd) itself. - -@id - id of the object this command operates on. Each command can use it for own purpose. - -@start - start of the object this command operates on. Each command can use it for own purpose. - -@csize, @cpad - size and padding size of the (attached if needed) crypto information. - -Command specifications. - -@NETFS_READDIR -This command is used to sync content of the remote dir to the client. - -@ext - length of the path to object. -@size - the same. -@id - local inode number of the directory to read. -@start - zero. - - -@NETFS_READ_PAGE -This command is used to read data from remote server. -Data size does not exceed local page cache size. - -@id - inode number. -@start - first byte offset. -@size - number of bytes to read plus length of the path to object. -@ext - object path length. - - -@NETFS_CREATE -Used to create object. -It does not require that all directories on top of the object were -already created, it will create them automatically. Each object has -associated @netfs_path_entry data structure, which contains creation -mode (permissions and type) and length of the name as long as name itself. - -@start - 0 -@size - size of the all data structures needed to create a path -@id - local inode number -@ext - 0 - - -@NETFS_REMOVE -Used to remove object. - -@ext - length of the path to object. -@size - the same. -@id - local inode number. -@start - zero. - - -@NETFS_LOOKUP -Lookup information about object on server. - -@ext - length of the path to object. -@size - the same. -@id - local inode number of the directory to look object in. -@start - local inode number of the object to look at. - - -@NETFS_LINK -Create hard of symlink. -Command is sent as "object_path|target_path". - -@size - size of the above string. -@id - parent local inode number. -@start - 1 for symlink, 0 for hardlink. -@ext - size of the "object_path" above. - - -@NETFS_TRANS -Transaction header. - -@size - incorporates all embedded command sizes including theirs header sizes. -@start - transaction generation number - unique id used to find transaction. -@ext - transaction flags. Unused at the moment. -@id - 0. - - -@NETFS_OPEN -Open intent for given transaction. - -@id - local inode number. -@start - 0. -@size - path length to the object. -@ext - open flags (O_RDWR and so on). - - -@NETFS_INODE_INFO -Metadata update command. -It is sent to servers when attributes of the object are changed and received -when data or metadata were updated. It operates with the following structure: - -struct netfs_inode_info -{ - unsigned int mode; - unsigned int nlink; - unsigned int uid; - unsigned int gid; - unsigned int blocksize; - unsigned int padding; - __u64 ino; - __u64 blocks; - __u64 rdev; - __u64 size; - __u64 version; -}; - -It effectively mirrors stat(2) returned data. - - -@ext - path length to the object. -@size - the same plus size of the netfs_inode_info structure. -@id - local inode number. -@start - 0. - - -@NETFS_PAGE_CACHE -Command is only received by clients. It contains information about -page to be marked as not up-to-date. - -@id - client's inode number. -@start - last byte of the page to be invalidated. If it is not equal to - current inode size, it will be vmtruncated(). -@size - 0 -@ext - 0 - - -@NETFS_READ_PAGES -Used to read multiple contiguous pages in one go. - -@start - first byte of the contiguous region to read. -@size - contains of two fields: lower 8 bits are used to represent page cache shift - used by client, another 3 bytes are used to get number of pages. -@id - local inode number. -@ext - path length to the object. - - -@NETFS_RENAME -Used to rename object. -Attached data is formed into following string: "old_path|new_path". - -@id - local inode number. -@start - parent inode number. -@size - length of the above string. -@ext - length of the old path part. - - -@NETFS_CAPABILITIES -Used to exchange crypto capabilities with server. -If crypto capabilities are not supported by server, then client will disable it -or fail (if 'crypto_fail_unsupported' mount options was specified). - -@id - superblock index. Used to specify crypto information for group of servers. -@size - size of the attached capabilities structure. -@start - 0. -@size - 0. -@scsize - 0. - -@NETFS_LOCK -Used to send lock request/release messages. Although it sends byte range request -and is capable of flushing pages based on that, it is not used, since all Linux -filesystems lock the whole inode. - -@id - lock generation number. -@start - start of the locked range. -@size - size of the locked range. -@ext - lock type: read/write. Not used actually. 15'th bit is used to determine, - if it is lock request (1) or release (0). - -@NETFS_XATTR_SET -@NETFS_XATTR_GET -Used to set/get extended attributes for given inode. -@id - attribute generation number or xattr setting type -@start - size of the attribute (request or attached) -@size - name length, path len and data size for given attribute -@ext - path length for given object diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 7b7b845c490a..cf43bc4dbf31 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -622,3 +622,19 @@ in your dentry operations instead. alloc_file_clone(file, flags, ops) does not affect any caller's references. On success you get a new struct file sharing the mount/dentry with the original, on failure - ERR_PTR(). +-- +[mandatory] + ->clone_file_range() and ->dedupe_file_range have been replaced with + ->remap_file_range(). See Documentation/filesystems/vfs.txt for more + information. +-- +[recommended] + ->lookup() instances doing an equivalent of + if (IS_ERR(inode)) + return ERR_CAST(inode); + return d_splice_alias(inode, dentry); + don't need to bother with the check - d_splice_alias() will do the + right thing when given ERR_PTR(...) as inode. Moreover, passing NULL + inode to d_splice_alias() will also do the right thing (equivalent of + d_add(dentry, NULL); return NULL;), so that kind of special cases + also doesn't need a separate treatment. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 22b4b00dee31..12a5e6e693b6 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -858,6 +858,7 @@ Writeback: 0 kB AnonPages: 861800 kB Mapped: 280372 kB Shmem: 644 kB +KReclaimable: 168048 kB Slab: 284364 kB SReclaimable: 159856 kB SUnreclaim: 124508 kB @@ -925,6 +926,9 @@ AnonHugePages: Non-file backed huge pages mapped into userspace page tables ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated with huge pages ShmemPmdMapped: Shared memory mapped into userspace with huge pages +KReclaimable: Kernel allocations that the kernel will attempt to reclaim + under memory pressure. Includes SReclaimable (below), and other + direct allocations with a shrinker. Slab: in-kernel data structures cache SReclaimable: Part of Slab, that might be reclaimed, such as caches SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure diff --git a/Documentation/filesystems/ubifs-authentication.md b/Documentation/filesystems/ubifs-authentication.md new file mode 100644 index 000000000000..028b3e2e25f9 --- /dev/null +++ b/Documentation/filesystems/ubifs-authentication.md @@ -0,0 +1,426 @@ +% UBIFS Authentication +% sigma star gmbh +% 2018 + +# Introduction + +UBIFS utilizes the fscrypt framework to provide confidentiality for file +contents and file names. This prevents attacks where an attacker is able to +read contents of the filesystem on a single point in time. A classic example +is a lost smartphone where the attacker is unable to read personal data stored +on the device without the filesystem decryption key. + +At the current state, UBIFS encryption however does not prevent attacks where +the attacker is able to modify the filesystem contents and the user uses the +device afterwards. In such a scenario an attacker can modify filesystem +contents arbitrarily without the user noticing. One example is to modify a +binary to perform a malicious action when executed [DMC-CBC-ATTACK]. Since +most of the filesystem metadata of UBIFS is stored in plain, this makes it +fairly easy to swap files and replace their contents. + +Other full disk encryption systems like dm-crypt cover all filesystem metadata, +which makes such kinds of attacks more complicated, but not impossible. +Especially, if the attacker is given access to the device multiple points in +time. For dm-crypt and other filesystems that build upon the Linux block IO +layer, the dm-integrity or dm-verity subsystems [DM-INTEGRITY, DM-VERITY] +can be used to get full data authentication at the block layer. +These can also be combined with dm-crypt [CRYPTSETUP2]. + +This document describes an approach to get file contents _and_ full metadata +authentication for UBIFS. Since UBIFS uses fscrypt for file contents and file +name encryption, the authentication system could be tied into fscrypt such that +existing features like key derivation can be utilized. It should however also +be possible to use UBIFS authentication without using encryption. + + +## MTD, UBI & UBIFS + +On Linux, the MTD (Memory Technology Devices) subsystem provides a uniform +interface to access raw flash devices. One of the more prominent subsystems that +work on top of MTD is UBI (Unsorted Block Images). It provides volume management +for flash devices and is thus somewhat similar to LVM for block devices. In +addition, it deals with flash-specific wear-leveling and transparent I/O error +handling. UBI offers logical erase blocks (LEBs) to the layers on top of it +and maps them transparently to physical erase blocks (PEBs) on the flash. + +UBIFS is a filesystem for raw flash which operates on top of UBI. Thus, wear +leveling and some flash specifics are left to UBI, while UBIFS focuses on +scalability, performance and recoverability. + + + + +------------+ +*******+ +-----------+ +-----+ + | | * UBIFS * | UBI-BLOCK | | ... | + | JFFS/JFFS2 | +*******+ +-----------+ +-----+ + | | +-----------------------------+ +-----------+ +-----+ + | | | UBI | | MTD-BLOCK | | ... | + +------------+ +-----------------------------+ +-----------+ +-----+ + +------------------------------------------------------------------+ + | MEMORY TECHNOLOGY DEVICES (MTD) | + +------------------------------------------------------------------+ + +-----------------------------+ +--------------------------+ +-----+ + | NAND DRIVERS | | NOR DRIVERS | | ... | + +-----------------------------+ +--------------------------+ +-----+ + + Figure 1: Linux kernel subsystems for dealing with raw flash + + + +Internally, UBIFS maintains multiple data structures which are persisted on +the flash: + +- *Index*: an on-flash B+ tree where the leaf nodes contain filesystem data +- *Journal*: an additional data structure to collect FS changes before updating + the on-flash index and reduce flash wear. +- *Tree Node Cache (TNC)*: an in-memory B+ tree that reflects the current FS + state to avoid frequent flash reads. It is basically the in-memory + representation of the index, but contains additional attributes. +- *LEB property tree (LPT)*: an on-flash B+ tree for free space accounting per + UBI LEB. + +In the remainder of this section we will cover the on-flash UBIFS data +structures in more detail. The TNC is of less importance here since it is never +persisted onto the flash directly. More details on UBIFS can also be found in +[UBIFS-WP]. + + +### UBIFS Index & Tree Node Cache + +Basic on-flash UBIFS entities are called *nodes*. UBIFS knows different types +of nodes. Eg. data nodes (`struct ubifs_data_node`) which store chunks of file +contents or inode nodes (`struct ubifs_ino_node`) which represent VFS inodes. +Almost all types of nodes share a common header (`ubifs_ch`) containing basic +information like node type, node length, a sequence number, etc. (see +`fs/ubifs/ubifs-media.h`in kernel source). Exceptions are entries of the LPT +and some less important node types like padding nodes which are used to pad +unusable content at the end of LEBs. + +To avoid re-writing the whole B+ tree on every single change, it is implemented +as *wandering tree*, where only the changed nodes are re-written and previous +versions of them are obsoleted without erasing them right away. As a result, +the index is not stored in a single place on the flash, but *wanders* around +and there are obsolete parts on the flash as long as the LEB containing them is +not reused by UBIFS. To find the most recent version of the index, UBIFS stores +a special node called *master node* into UBI LEB 1 which always points to the +most recent root node of the UBIFS index. For recoverability, the master node +is additionally duplicated to LEB 2. Mounting UBIFS is thus a simple read of +LEB 1 and 2 to get the current master node and from there get the location of +the most recent on-flash index. + +The TNC is the in-memory representation of the on-flash index. It contains some +additional runtime attributes per node which are not persisted. One of these is +a dirty-flag which marks nodes that have to be persisted the next time the +index is written onto the flash. The TNC acts as a write-back cache and all +modifications of the on-flash index are done through the TNC. Like other caches, +the TNC does not have to mirror the full index into memory, but reads parts of +it from flash whenever needed. A *commit* is the UBIFS operation of updating the +on-flash filesystem structures like the index. On every commit, the TNC nodes +marked as dirty are written to the flash to update the persisted index. + + +### Journal + +To avoid wearing out the flash, the index is only persisted (*commited*) when +certain conditions are met (eg. `fsync(2)`). The journal is used to record +any changes (in form of inode nodes, data nodes etc.) between commits +of the index. During mount, the journal is read from the flash and replayed +onto the TNC (which will be created on-demand from the on-flash index). + +UBIFS reserves a bunch of LEBs just for the journal called *log area*. The +amount of log area LEBs is configured on filesystem creation (using +`mkfs.ubifs`) and stored in the superblock node. The log area contains only +two types of nodes: *reference nodes* and *commit start nodes*. A commit start +node is written whenever an index commit is performed. Reference nodes are +written on every journal update. Each reference node points to the position of +other nodes (inode nodes, data nodes etc.) on the flash that are part of this +journal entry. These nodes are called *buds* and describe the actual filesystem +changes including their data. + +The log area is maintained as a ring. Whenever the journal is almost full, +a commit is initiated. This also writes a commit start node so that during +mount, UBIFS will seek for the most recent commit start node and just replay +every reference node after that. Every reference node before the commit start +node will be ignored as they are already part of the on-flash index. + +When writing a journal entry, UBIFS first ensures that enough space is +available to write the reference node and buds part of this entry. Then, the +reference node is written and afterwards the buds describing the file changes. +On replay, UBIFS will record every reference node and inspect the location of +the referenced LEBs to discover the buds. If these are corrupt or missing, +UBIFS will attempt to recover them by re-reading the LEB. This is however only +done for the last referenced LEB of the journal. Only this can become corrupt +because of a power cut. If the recovery fails, UBIFS will not mount. An error +for every other LEB will directly cause UBIFS to fail the mount operation. + + + | ---- LOG AREA ---- | ---------- MAIN AREA ------------ | + + -----+------+-----+--------+---- ------+-----+-----+--------------- + \ | | | | / / | | | \ + / CS | REF | REF | | \ \ DENT | INO | INO | / + \ | | | | / / | | | \ + ----+------+-----+--------+--- -------+-----+-----+---------------- + | | ^ ^ + | | | | + +------------------------+ | + | | + +-------------------------------+ + + + Figure 2: UBIFS flash layout of log area with commit start nodes + (CS) and reference nodes (REF) pointing to main area + containing their buds + + +### LEB Property Tree/Table + +The LEB property tree is used to store per-LEB information. This includes the +LEB type and amount of free and *dirty* (old, obsolete content) space [1] on +the LEB. The type is important, because UBIFS never mixes index nodes with data +nodes on a single LEB and thus each LEB has a specific purpose. This again is +useful for free space calculations. See [UBIFS-WP] for more details. + +The LEB property tree again is a B+ tree, but it is much smaller than the +index. Due to its smaller size it is always written as one chunk on every +commit. Thus, saving the LPT is an atomic operation. + + +[1] Since LEBs can only be appended and never overwritten, there is a +difference between free space ie. the remaining space left on the LEB to be +written to without erasing it and previously written content that is obsolete +but can't be overwritten without erasing the full LEB. + + +# UBIFS Authentication + +This chapter introduces UBIFS authentication which enables UBIFS to verify +the authenticity and integrity of metadata and file contents stored on flash. + + +## Threat Model + +UBIFS authentication enables detection of offline data modification. While it +does not prevent it, it enables (trusted) code to check the integrity and +authenticity of on-flash file contents and filesystem metadata. This covers +attacks where file contents are swapped. + +UBIFS authentication will not protect against rollback of full flash contents. +Ie. an attacker can still dump the flash and restore it at a later time without +detection. It will also not protect against partial rollback of individual +index commits. That means that an attacker is able to partially undo changes. +This is possible because UBIFS does not immediately overwrites obsolete +versions of the index tree or the journal, but instead marks them as obsolete +and garbage collection erases them at a later time. An attacker can use this by +erasing parts of the current tree and restoring old versions that are still on +the flash and have not yet been erased. This is possible, because every commit +will always write a new version of the index root node and the master node +without overwriting the previous version. This is further helped by the +wear-leveling operations of UBI which copies contents from one physical +eraseblock to another and does not atomically erase the first eraseblock. + +UBIFS authentication does not cover attacks where an attacker is able to +execute code on the device after the authentication key was provided. +Additional measures like secure boot and trusted boot have to be taken to +ensure that only trusted code is executed on a device. + + +## Authentication + +To be able to fully trust data read from flash, all UBIFS data structures +stored on flash are authenticated. That is: + +- The index which includes file contents, file metadata like extended + attributes, file length etc. +- The journal which also contains file contents and metadata by recording changes + to the filesystem +- The LPT which stores UBI LEB metadata which UBIFS uses for free space accounting + + +### Index Authentication + +Through UBIFS' concept of a wandering tree, it already takes care of only +updating and persisting changed parts from leaf node up to the root node +of the full B+ tree. This enables us to augment the index nodes of the tree +with a hash over each node's child nodes. As a result, the index basically also +a Merkle tree. Since the leaf nodes of the index contain the actual filesystem +data, the hashes of their parent index nodes thus cover all the file contents +and file metadata. When a file changes, the UBIFS index is updated accordingly +from the leaf nodes up to the root node including the master node. This process +can be hooked to recompute the hash only for each changed node at the same time. +Whenever a file is read, UBIFS can verify the hashes from each leaf node up to +the root node to ensure the node's integrity. + +To ensure the authenticity of the whole index, the UBIFS master node stores a +keyed hash (HMAC) over its own contents and a hash of the root node of the index +tree. As mentioned above, the master node is always written to the flash whenever +the index is persisted (ie. on index commit). + +Using this approach only UBIFS index nodes and the master node are changed to +include a hash. All other types of nodes will remain unchanged. This reduces +the storage overhead which is precious for users of UBIFS (ie. embedded +devices). + + + +---------------+ + | Master Node | + | (hash) | + +---------------+ + | + v + +-------------------+ + | Index Node #1 | + | | + | branch0 branchn | + | (hash) (hash) | + +-------------------+ + | ... | (fanout: 8) + | | + +-------+ +------+ + | | + v v + +-------------------+ +-------------------+ + | Index Node #2 | | Index Node #3 | + | | | | + | branch0 branchn | | branch0 branchn | + | (hash) (hash) | | (hash) (hash) | + +-------------------+ +-------------------+ + | ... | ... | + v v v + +-----------+ +----------+ +-----------+ + | Data Node | | INO Node | | DENT Node | + +-----------+ +----------+ +-----------+ + + + Figure 3: Coverage areas of index node hash and master node HMAC + + + +The most important part for robustness and power-cut safety is to atomically +persist the hash and file contents. Here the existing UBIFS logic for how +changed nodes are persisted is already designed for this purpose such that +UBIFS can safely recover if a power-cut occurs while persisting. Adding +hashes to index nodes does not change this since each hash will be persisted +atomically together with its respective node. + + +### Journal Authentication + +The journal is authenticated too. Since the journal is continuously written +it is necessary to also add authentication information frequently to the +journal so that in case of a powercut not too much data can't be authenticated. +This is done by creating a continuous hash beginning from the commit start node +over the previous reference nodes, the current reference node, and the bud +nodes. From time to time whenever it is suitable authentication nodes are added +between the bud nodes. This new node type contains a HMAC over the current state +of the hash chain. That way a journal can be authenticated up to the last +authentication node. The tail of the journal which may not have a authentication +node cannot be authenticated and is skipped during journal replay. + +We get this picture for journal authentication: + + ,,,,,,,, + ,......,........................................... + ,. CS , hash1.----. hash2.----. + ,. | , . |hmac . |hmac + ,. v , . v . v + ,.REF#0,-> bud -> bud -> bud.-> auth -> bud -> bud.-> auth ... + ,..|...,........................................... + , | , + , | ,,,,,,,,,,,,,,, + . | hash3,----. + , | , |hmac + , v , v + , REF#1 -> bud -> bud,-> auth ... + ,,,|,,,,,,,,,,,,,,,,,, + v + REF#2 -> ... + | + V + ... + +Since the hash also includes the reference nodes an attacker cannot reorder or +skip any journal heads for replay. An attacker can only remove bud nodes or +reference nodes from the end of the journal, effectively rewinding the +filesystem at maximum back to the last commit. + +The location of the log area is stored in the master node. Since the master +node is authenticated with a HMAC as described above, it is not possible to +tamper with that without detection. The size of the log area is specified when +the filesystem is created using `mkfs.ubifs` and stored in the superblock node. +To avoid tampering with this and other values stored there, a HMAC is added to +the superblock struct. The superblock node is stored in LEB 0 and is only +modified on feature flag or similar changes, but never on file changes. + + +### LPT Authentication + +The location of the LPT root node on the flash is stored in the UBIFS master +node. Since the LPT is written and read atomically on every commit, there is +no need to authenticate individual nodes of the tree. It suffices to +protect the integrity of the full LPT by a simple hash stored in the master +node. Since the master node itself is authenticated, the LPTs authenticity can +be verified by verifying the authenticity of the master node and comparing the +LTP hash stored there with the hash computed from the read on-flash LPT. + + +## Key Management + +For simplicity, UBIFS authentication uses a single key to compute the HMACs +of superblock, master, commit start and reference nodes. This key has to be +available on creation of the filesystem (`mkfs.ubifs`) to authenticate the +superblock node. Further, it has to be available on mount of the filesystem +to verify authenticated nodes and generate new HMACs for changes. + +UBIFS authentication is intended to operate side-by-side with UBIFS encryption +(fscrypt) to provide confidentiality and authenticity. Since UBIFS encryption +has a different approach of encryption policies per directory, there can be +multiple fscrypt master keys and there might be folders without encryption. +UBIFS authentication on the other hand has an all-or-nothing approach in the +sense that it either authenticates everything of the filesystem or nothing. +Because of this and because UBIFS authentication should also be usable without +encryption, it does not share the same master key with fscrypt, but manages +a dedicated authentication key. + +The API for providing the authentication key has yet to be defined, but the +key can eg. be provided by userspace through a keyring similar to the way it +is currently done in fscrypt. It should however be noted that the current +fscrypt approach has shown its flaws and the userspace API will eventually +change [FSCRYPT-POLICY2]. + +Nevertheless, it will be possible for a user to provide a single passphrase +or key in userspace that covers UBIFS authentication and encryption. This can +be solved by the corresponding userspace tools which derive a second key for +authentication in addition to the derived fscrypt master key used for +encryption. + +To be able to check if the proper key is available on mount, the UBIFS +superblock node will additionally store a hash of the authentication key. This +approach is similar to the approach proposed for fscrypt encryption policy v2 +[FSCRYPT-POLICY2]. + + +# Future Extensions + +In certain cases where a vendor wants to provide an authenticated filesystem +image to customers, it should be possible to do so without sharing the secret +UBIFS authentication key. Instead, in addition the each HMAC a digital +signature could be stored where the vendor shares the public key alongside the +filesystem image. In case this filesystem has to be modified afterwards, +UBIFS can exchange all digital signatures with HMACs on first mount similar +to the way the IMA/EVM subsystem deals with such situations. The HMAC key +will then have to be provided beforehand in the normal way. + + +# References + +[CRYPTSETUP2] http://www.saout.de/pipermail/dm-crypt/2017-November/005745.html + +[DMC-CBC-ATTACK] http://www.jakoblell.com/blog/2013/12/22/practical-malleability-attack-against-cbc-encrypted-luks-partitions/ + +[DM-INTEGRITY] https://www.kernel.org/doc/Documentation/device-mapper/dm-integrity.txt + +[DM-VERITY] https://www.kernel.org/doc/Documentation/device-mapper/verity.txt + +[FSCRYPT-POLICY2] https://www.spinics.net/lists/linux-ext4/msg58710.html + +[UBIFS-WP] http://www.linux-mtd.infradead.org/doc/ubifs_whitepaper.pdf diff --git a/Documentation/filesystems/ubifs.txt b/Documentation/filesystems/ubifs.txt index a0a61d2f389f..acc80442a3bb 100644 --- a/Documentation/filesystems/ubifs.txt +++ b/Documentation/filesystems/ubifs.txt @@ -91,6 +91,13 @@ chk_data_crc do not skip checking CRCs on data nodes compr=none override default compressor and set it to "none" compr=lzo override default compressor and set it to "lzo" compr=zlib override default compressor and set it to "zlib" +auth_key= specify the key used for authenticating the filesystem. + Passing this option makes authentication mandatory. + The passed key must be present in the kernel keyring + and must be of type 'logon' +auth_hash_name= The hash algorithm used for authentication. Used for + both hashing and for creating HMACs. Typical values + include "sha256" or "sha512" Quick usage instructions diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index a6c6a8af48a2..5f71a252e2e0 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -883,8 +883,9 @@ struct file_operations { unsigned (*mmap_capabilities)(struct file *); #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); - int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64); - int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64); + loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); }; @@ -960,11 +961,18 @@ otherwise noted. copy_file_range: called by the copy_file_range(2) system call. - clone_file_range: called by the ioctl(2) system call for FICLONERANGE and - FICLONE commands. - - dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE - command. + remap_file_range: called by the ioctl(2) system call for FICLONERANGE and + FICLONE and FIDEDUPERANGE commands to remap file ranges. An + implementation should remap len bytes at pos_in of the source file into + the dest file at pos_out. Implementations must handle callers passing + in len == 0; this means "remap to the end of the source file". The + return value should the number of bytes remapped, or the usual + negative error code if errors occurred before any bytes were remapped. + The remap_flags parameter accepts REMAP_FILE_* flags. If + REMAP_FILE_DEDUP is set then the implementation must only remap if the + requested file ranges have identical contents. If REMAP_CAN_SHORTEN is + set, the caller is ok with the implementation shortening the request + length to satisfy alignment or EOF requirements (or any other reason). fadvise: possibly called by the fadvise64() system call. diff --git a/Documentation/fmc/00-INDEX b/Documentation/fmc/00-INDEX deleted file mode 100644 index 431c69570f43..000000000000 --- a/Documentation/fmc/00-INDEX +++ /dev/null @@ -1,38 +0,0 @@ - -Documentation in this directory comes from sections of the manual we -wrote for the externally-developed fmc-bus package. The complete -manual as of today (2013-02) is available in PDF format at -http://www.ohwr.org/projects/fmc-bus/files - -00-INDEX - - this file. - -FMC-and-SDB.txt - - What are FMC and SDB, basic concepts for this framework - -API.txt - - The functions that are exported by the bus driver - -parameters.txt - - The module parameters - -carrier.txt - - writing a carrier (a device) - -mezzanine.txt - - writing code for your mezzanine (a driver) - -identifiers.txt - - how identification and matching works - -fmc-fakedev.txt - - about drivers/fmc/fmc-fakedev.ko - -fmc-trivial.txt - - about drivers/fmc/fmc-trivial.ko - -fmc-write-eeprom.txt - - about drivers/fmc/fmc-write-eeprom.ko - -fmc-chardev.txt - - about drivers/fmc/fmc-chardev.ko diff --git a/Documentation/gpio/00-INDEX b/Documentation/gpio/00-INDEX deleted file mode 100644 index 17e19a68058f..000000000000 --- a/Documentation/gpio/00-INDEX +++ /dev/null @@ -1,4 +0,0 @@ -00-INDEX - - This file -sysfs.txt - - Information about the GPIO sysfs interface diff --git a/Documentation/gpu/drivers.rst b/Documentation/gpu/drivers.rst index 65be325bf282..7d2d3875ff1a 100644 --- a/Documentation/gpu/drivers.rst +++ b/Documentation/gpu/drivers.rst @@ -13,6 +13,7 @@ GPU Driver Documentation tve200 v3d vc4 + vkms bridge/dw-hdmi xen-front diff --git a/Documentation/gpu/drm-kms.rst b/Documentation/gpu/drm-kms.rst index 5dee6b8a4c12..4b1501b4835b 100644 --- a/Documentation/gpu/drm-kms.rst +++ b/Documentation/gpu/drm-kms.rst @@ -287,8 +287,14 @@ Atomic Mode Setting Function Reference .. kernel-doc:: drivers/gpu/drm/drm_atomic.c :export: -.. kernel-doc:: drivers/gpu/drm/drm_atomic.c - :internal: +Atomic Mode Setting IOCTL and UAPI Functions +-------------------------------------------- + +.. kernel-doc:: drivers/gpu/drm/drm_atomic_uapi.c + :doc: overview + +.. kernel-doc:: drivers/gpu/drm/drm_atomic_uapi.c + :export: CRTC Abstraction ================ @@ -323,6 +329,12 @@ Frame Buffer Functions Reference DRM Format Handling =================== +.. kernel-doc:: include/uapi/drm/drm_fourcc.h + :doc: overview + +Format Functions Reference +-------------------------- + .. kernel-doc:: include/drm/drm_fourcc.h :internal: @@ -560,7 +572,7 @@ Tile Group Property Explicit Fencing Properties --------------------------- -.. kernel-doc:: drivers/gpu/drm/drm_atomic.c +.. kernel-doc:: drivers/gpu/drm/drm_atomic_uapi.c :doc: explicit fencing properties Existing KMS Properties diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst index 21b6b72a9ba8..e725e8449e72 100644 --- a/Documentation/gpu/drm-mm.rst +++ b/Documentation/gpu/drm-mm.rst @@ -297,7 +297,7 @@ made up of several fields, the more interesting ones being: struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); - int (*fault)(struct vm_fault *vmf); + vm_fault_t (*fault)(struct vm_fault *vmf); }; @@ -505,7 +505,7 @@ GPU Scheduler Overview -------- -.. kernel-doc:: drivers/gpu/drm/scheduler/gpu_scheduler.c +.. kernel-doc:: drivers/gpu/drm/scheduler/sched_main.c :doc: Overview Scheduler Function References @@ -514,5 +514,5 @@ Scheduler Function References .. kernel-doc:: include/drm/gpu_scheduler.h :internal: -.. kernel-doc:: drivers/gpu/drm/scheduler/gpu_scheduler.c +.. kernel-doc:: drivers/gpu/drm/scheduler/sched_main.c :export: diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst index a7c150d6b63f..77c2b3c25565 100644 --- a/Documentation/gpu/todo.rst +++ b/Documentation/gpu/todo.rst @@ -127,7 +127,8 @@ interfaces to fix these issues: the acquire context explicitly on stack and then also pass it down into drivers explicitly so that the legacy-on-atomic functions can use them. - Except for some driver code this is done. + Except for some driver code this is done. This task should be finished by + adding WARN_ON(!drm_drv_uses_atomic_modeset) in drm_modeset_lock_all(). * A bunch of the vtable hooks are now in the wrong place: DRM has a split between core vfunc tables (named ``drm_foo_funcs``), which are used to @@ -137,13 +138,6 @@ interfaces to fix these issues: ``_helper_funcs`` since they are not part of the core ABI. There's a ``FIXME`` comment in the kerneldoc for each such case in ``drm_crtc.h``. -* There's a new helper ``drm_atomic_helper_best_encoder()`` which could be - used by all atomic drivers which don't select the encoder for a given - connector at runtime. That's almost all of them, and would allow us to get - rid of a lot of ``best_encoder`` boilerplate in drivers. - - This was almost done, but new drivers added a few more cases again. - Contact: Daniel Vetter Get rid of dev->struct_mutex from GEM drivers @@ -164,9 +158,8 @@ private lock. The tricky part is the BO free functions, since those can't reliably take that lock any more. Instead state needs to be protected with suitable subordinate locks or some cleanup work pushed to a worker thread. For performance-critical drivers it might also be better to go with a more -fine-grained per-buffer object and per-context lockings scheme. Currently the -following drivers still use ``struct_mutex``: ``msm``, ``omapdrm`` and -``udl``. +fine-grained per-buffer object and per-context lockings scheme. Currently only the +``msm`` driver still use ``struct_mutex``. Contact: Daniel Vetter, respective driver maintainers @@ -190,7 +183,8 @@ Convert drivers to use simple modeset suspend/resume Most drivers (except i915 and nouveau) that use drm_atomic_helper_suspend/resume() can probably be converted to use -drm_mode_config_helper_suspend/resume(). +drm_mode_config_helper_suspend/resume(). Also there's still open-coded version +of the atomic suspend/resume code in older atomic modeset drivers. Contact: Maintainer of the driver you plan to convert @@ -246,20 +240,10 @@ Core refactorings Clean up the DRM header mess ---------------------------- -Currently the DRM subsystem has only one global header, ``drmP.h``. This is -used both for functions exported to helper libraries and drivers and functions -only used internally in the ``drm.ko`` module. The goal would be to move all -header declarations not needed outside of ``drm.ko`` into -``drivers/gpu/drm/drm_*_internal.h`` header files. ``EXPORT_SYMBOL`` also -needs to be dropped for these functions. - -This would nicely tie in with the below task to create kerneldoc after the API -is cleaned up. Or with the "hide legacy cruft better" task. - -Note that this is well in progress, but ``drmP.h`` is still huge. The updated -plan is to switch to per-file driver API headers, which will also structure -the kerneldoc better. This should also allow more fine-grained ``#include`` -directives. +The DRM subsystem originally had only one huge global header, ``drmP.h``. This +is now split up, but many source files still include it. The remaining part of +the cleanup work here is to replace any ``#include `` by only the +headers needed (and fixing up any missing pre-declarations in the headers). In the end no .c file should need to include ``drmP.h`` anymore. @@ -278,26 +262,6 @@ See https://dri.freedesktop.org/docs/drm/ for what's there already. Contact: Daniel Vetter -Hide legacy cruft better ------------------------- - -Way back DRM supported only drivers which shadow-attached to PCI devices with -userspace or fbdev drivers setting up outputs. Modern DRM drivers take charge -of the entire device, you can spot them with the DRIVER_MODESET flag. - -Unfortunately there's still large piles of legacy code around which needs to -be hidden so that driver writers don't accidentally end up using it. And to -prevent security issues in those legacy IOCTLs from being exploited on modern -drivers. This has multiple possible subtasks: - -* Extract support code for legacy features into a ``drm-legacy.ko`` kernel - module and compile it only when one of the legacy drivers is enabled. - -This is mostly done, the only thing left is to split up ``drm_irq.c`` into -legacy cruft and the parts needed by modern KMS drivers. - -Contact: Daniel Vetter - Make panic handling work ------------------------ @@ -396,17 +360,12 @@ converting things over. For modeset tests we also first need a bit of infrastructure to use dumb buffers for untiled buffers, to be able to run all the non-i915 specific modeset tests. -Contact: Daniel Vetter - -Create a virtual KMS driver for testing (vkms) ----------------------------------------------- - -With all the latest helpers it should be fairly simple to create a virtual KMS -driver useful for testing, or for running X or similar on headless machines -(to be able to still use the GPU). This would be similar to vgem, but aimed at -the modeset side. +Extend virtual test driver (VKMS) +--------------------------------- -Once the basics are there there's tons of possibilities to extend it. +See the documentation of :ref:`VKMS ` for more details. This is an ideal +internship task, since it only requires a virtual machine and can be sized to +fit the available time. Contact: Daniel Vetter diff --git a/Documentation/gpu/vkms.rst b/Documentation/gpu/vkms.rst new file mode 100644 index 000000000000..0a6ea6216e41 --- /dev/null +++ b/Documentation/gpu/vkms.rst @@ -0,0 +1,24 @@ +.. _vkms: + +========================================== + drm/vkms Virtual Kernel Modesetting +========================================== + +.. kernel-doc:: drivers/gpu/drm/vkms/vkms_drv.c + :doc: vkms (Virtual Kernel Modesetting) + +TODO +==== + +CRC API +------- + +- Optimize CRC computation ``compute_crc()`` and plane blending ``blend()`` + +- Use the alpha value to blend vaddr_src with vaddr_dst instead of + overwriting it in ``blend()``. + +- Add igt test to check cleared alpha value for XRGB plane format. + +- Add igt test to check extreme alpha values i.e. fully opaque and fully + transparent (intermediate values are affected by hw-specific rounding modes). diff --git a/Documentation/hwmon/ina3221 b/Documentation/hwmon/ina3221 index 0ff74854cb2e..4b82cbfb551c 100644 --- a/Documentation/hwmon/ina3221 +++ b/Documentation/hwmon/ina3221 @@ -21,6 +21,8 @@ and power are calculated host-side from these. Sysfs entries ------------- +in[123]_label Voltage channel labels +in[123]_enable Voltage channel enable controls in[123]_input Bus voltage(mV) channels curr[123]_input Current(mA) measurement channels shunt[123]_resistor Shunt resistance(uOhm) channels diff --git a/Documentation/hwmon/lm75 b/Documentation/hwmon/lm75 index ac95edfcd907..2f1120f88c16 100644 --- a/Documentation/hwmon/lm75 +++ b/Documentation/hwmon/lm75 @@ -17,8 +17,8 @@ Supported chips: Addresses scanned: none Datasheet: Publicly available at the Maxim website http://www.maximintegrated.com/ - * Maxim MAX6625, MAX6626 - Prefixes: 'max6625', 'max6626' + * Maxim MAX6625, MAX6626, MAX31725, MAX31726 + Prefixes: 'max6625', 'max6626', 'max31725', 'max31726' Addresses scanned: none Datasheet: Publicly available at the Maxim website http://www.maxim-ic.com/ @@ -86,7 +86,7 @@ The LM75 is essentially an industry standard; there may be other LM75 clones not listed here, with or without various enhancements, that are supported. The clones are not detected by the driver, unless they reproduce the exact register tricks of the original LM75, and must -therefore be instantiated explicitly. Higher resolution up to 12-bit +therefore be instantiated explicitly. Higher resolution up to 16-bit is supported by this driver, other specific enhancements are not. The LM77 is not supported, contrary to what we pretended for a long time. diff --git a/Documentation/hwmon/ltc2978 b/Documentation/hwmon/ltc2978 index 9a49d3c90cd1..dfb2caa401d9 100644 --- a/Documentation/hwmon/ltc2978 +++ b/Documentation/hwmon/ltc2978 @@ -55,6 +55,10 @@ Supported chips: Prefix: 'ltm4676' Addresses scanned: - Datasheet: http://www.linear.com/product/ltm4676 + * Analog Devices LTM4686 + Prefix: 'ltm4686' + Addresses scanned: - + Datasheet: http://www.analog.com/ltm4686 Author: Guenter Roeck @@ -76,6 +80,7 @@ additional components on a single die. The chip is instantiated and reported as two separate chips on two different I2C bus addresses. LTM4675 is a dual 9A or single 18A μModule regulator LTM4676 is a dual 13A or single 26A uModule regulator. +LTM4686 is a dual 10A or single 20A uModule regulator. Usage Notes diff --git a/Documentation/hwmon/mc13783-adc b/Documentation/hwmon/mc13783-adc index d0e7b3fa9e75..05ccc9f159f1 100644 --- a/Documentation/hwmon/mc13783-adc +++ b/Documentation/hwmon/mc13783-adc @@ -2,12 +2,12 @@ Kernel driver mc13783-adc ========================= Supported chips: - * Freescale Atlas MC13783 + * Freescale MC13783 Prefix: 'mc13783' - Datasheet: http://www.freescale.com/files/rf_if/doc/data_sheet/MC13783.pdf?fsrch=1 - * Freescale Atlas MC13892 + Datasheet: https://www.nxp.com/docs/en/data-sheet/MC13783.pdf + * Freescale MC13892 Prefix: 'mc13892' - Datasheet: http://cache.freescale.com/files/analog/doc/data_sheet/MC13892.pdf?fsrch=1&sr=1 + Datasheet: https://www.nxp.com/docs/en/data-sheet/MC13892.pdf Authors: Sascha Hauer diff --git a/Documentation/i2c/busses/i2c-nvidia-gpu b/Documentation/i2c/busses/i2c-nvidia-gpu new file mode 100644 index 000000000000..31884d2b2eb5 --- /dev/null +++ b/Documentation/i2c/busses/i2c-nvidia-gpu @@ -0,0 +1,18 @@ +Kernel driver i2c-nvidia-gpu + +Datasheet: not publicly available. + +Authors: + Ajay Gupta + +Description +----------- + +i2c-nvidia-gpu is a driver for I2C controller included in NVIDIA Turing +and later GPUs and it is used to communicate with Type-C controller on GPUs. + +If your 'lspci -v' listing shows something like the following, + +01:00.3 Serial bus controller [0c80]: NVIDIA Corporation Device 1ad9 (rev a1) + +then this driver should support the I2C controller of your GPU. diff --git a/Documentation/ide/00-INDEX b/Documentation/ide/00-INDEX deleted file mode 100644 index 22f98ca79539..000000000000 --- a/Documentation/ide/00-INDEX +++ /dev/null @@ -1,14 +0,0 @@ -00-INDEX - - this file -ChangeLog.ide-cd.1994-2004 - - ide-cd changelog -ChangeLog.ide-floppy.1996-2002 - - ide-floppy changelog -ChangeLog.ide-tape.1995-2002 - - ide-tape changelog -ide-tape.txt - - info on the IDE ATAPI streaming tape driver -ide.txt - - important info for users of ATA devices (IDE/EIDE disks and CD-ROMS). -warm-plug-howto.txt - - using sysfs to remove and add IDE devices. \ No newline at end of file diff --git a/Documentation/index.rst b/Documentation/index.rst index 5db7e87c7cb1..c858c2e66e36 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -22,10 +22,7 @@ The following describes the license of the Linux kernel source code (GPLv2), how to properly mark the license of individual files in the source tree, as well as links to the full license text. -.. toctree:: - :maxdepth: 2 - - process/license-rules.rst +* :ref:`kernel_licensing` User-oriented documentation --------------------------- diff --git a/Documentation/input/event-codes.rst b/Documentation/input/event-codes.rst index a8c0873beb95..cef220c176a4 100644 --- a/Documentation/input/event-codes.rst +++ b/Documentation/input/event-codes.rst @@ -190,7 +190,16 @@ A few EV_REL codes have special meanings: * REL_WHEEL, REL_HWHEEL: - These codes are used for vertical and horizontal scroll wheels, - respectively. + respectively. The value is the number of "notches" moved on the wheel, the + physical size of which varies by device. For high-resolution wheels (which + report multiple events for each notch of movement, or do not have notches) + this may be an approximation based on the high-resolution scroll events. + +* REL_WHEEL_HI_RES: + + - If a vertical scroll wheel supports high-resolution scrolling, this code + will be emitted in addition to REL_WHEEL. The value is the (approximate) + distance travelled by the user's finger, in microns. EV_ABS ------ diff --git a/Documentation/ioctl/00-INDEX b/Documentation/ioctl/00-INDEX deleted file mode 100644 index c1a925787950..000000000000 --- a/Documentation/ioctl/00-INDEX +++ /dev/null @@ -1,12 +0,0 @@ -00-INDEX - - this file -botching-up-ioctls.txt - - how to avoid botching up ioctls -cdrom.txt - - summary of CDROM ioctl calls -hdio.txt - - summary of HDIO_ ioctl calls -ioctl-decoding.txt - - how to decode the bits of an IOCTL code -ioctl-number.txt - - how to implement and register device/driver ioctl calls diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 13a7c999c04a..af6f6ba1fe80 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -201,7 +201,7 @@ Code Seq#(hex) Include File Comments 'X' 01 linux/pktcdvd.h conflict! 'Y' all linux/cyclades.h 'Z' 14-15 drivers/message/fusion/mptctl.h -'[' 00-07 linux/usb/tmc.h USB Test and Measurement Devices +'[' 00-3F linux/usb/tmc.h USB Test and Measurement Devices 'a' all linux/atm*.h, linux/sonet.h ATM on linux @@ -272,6 +272,7 @@ Code Seq#(hex) Include File Comments 't' 90-91 linux/toshiba.h toshiba and toshiba_acpi SMM 'u' 00-1F linux/smb_fs.h gone 'u' 20-3F linux/uvcvideo.h USB video class host driver +'u' 40-4f linux/udmabuf.h userspace dma-buf misc device 'v' 00-1F linux/ext2_fs.h conflict! 'v' 00-1F linux/fs.h conflict! 'v' 00-0F linux/sonypi.h conflict! diff --git a/Documentation/isdn/00-INDEX b/Documentation/isdn/00-INDEX deleted file mode 100644 index 2d1889b6c1fa..000000000000 --- a/Documentation/isdn/00-INDEX +++ /dev/null @@ -1,42 +0,0 @@ -00-INDEX - - this file (info on ISDN implementation for Linux) -CREDITS - - list of the kind folks that brought you this stuff. -HiSax.cert - - information about the ITU approval certification of the HiSax driver. -INTERFACE - - description of isdn4linux Link Level and Hardware Level interfaces. -INTERFACE.fax - - description of the fax subinterface of isdn4linux. -INTERFACE.CAPI - - description of kernel CAPI Link Level to Hardware Level interface. -README - - general info on what you need and what to do for Linux ISDN. -README.FAQ - - general info for FAQ. -README.HiSax - - info on the HiSax driver which replaces the old teles. -README.audio - - info for running audio over ISDN. -README.avmb1 - - info on driver for AVM-B1 ISDN card. -README.concap - - info on "CONCAP" encapsulation protocol interface used for X.25. -README.diversion - - info on module for isdn diversion services. -README.fax - - info for using Fax over ISDN. -README.gigaset - - info on the drivers for Siemens Gigaset ISDN adapters -README.hfc-pci - - info on hfc-pci based cards. -README.hysdn - - info on driver for Hypercope active HYSDN cards -README.mISDN - - info on the Modular ISDN subsystem (mISDN) -README.syncppp - - info on running Sync PPP over ISDN. -README.x25 - - info for running X.25 over ISDN. -syncPPP.FAQ - - frequently asked questions about running PPP over ISDN. diff --git a/Documentation/kbuild/00-INDEX b/Documentation/kbuild/00-INDEX deleted file mode 100644 index 8c5e6aa78004..000000000000 --- a/Documentation/kbuild/00-INDEX +++ /dev/null @@ -1,14 +0,0 @@ -00-INDEX - - this file: info on the kernel build process -headers_install.txt - - how to export Linux headers for use by userspace -kbuild.txt - - developer information on kbuild -kconfig.txt - - usage help for make *config -kconfig-language.txt - - specification of Config Language, the language in Kconfig files -makefiles.txt - - developer information for linux kernel makefiles -modules.txt - - how to build modules and to install them diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt index 7b6a2b2bdc98..8da26c6dd886 100644 --- a/Documentation/kbuild/makefiles.txt +++ b/Documentation/kbuild/makefiles.txt @@ -537,21 +537,6 @@ more details, with real examples. The third parameter may be a text as in this example, but it may also be an expanded variable or a macro. - cc-fullversion - cc-fullversion is useful when the exact version of gcc is needed. - One typical use-case is when a specific GCC version is broken. - cc-fullversion points out a more specific version than cc-version does. - - Example: - #arch/powerpc/Makefile - $(Q)if test "$(cc-fullversion)" = "040200" ; then \ - echo -n '*** GCC-4.2.0 cannot compile the 64-bit powerpc ' ; \ - false ; \ - fi - - In this example for a specific GCC version the build will error out - explaining to the user why it stops. - cc-cross-prefix cc-cross-prefix is used to check if there exists a $(CC) in path with one of the listed prefixes. The first prefix where there exist a diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt index 0f00f9c164ac..23b0c8b20cd1 100644 --- a/Documentation/kernel-per-CPU-kthreads.txt +++ b/Documentation/kernel-per-CPU-kthreads.txt @@ -321,7 +321,7 @@ To reduce its OS jitter, do at least one of the following: to do. Name: - rcuob/%d, rcuop/%d, and rcuos/%d + rcuop/%d and rcuos/%d Purpose: Offload RCU callbacks from the corresponding CPU. diff --git a/Documentation/laptops/00-INDEX b/Documentation/laptops/00-INDEX deleted file mode 100644 index 86169dc766f7..000000000000 --- a/Documentation/laptops/00-INDEX +++ /dev/null @@ -1,16 +0,0 @@ -00-INDEX - - This file -asus-laptop.txt - - information on the Asus Laptop Extras driver. -disk-shock-protection.txt - - information on hard disk shock protection. -laptop-mode.txt - - how to conserve battery power using laptop-mode. -sony-laptop.txt - - Sony Notebook Control Driver (SNC) Readme. -sonypi.txt - - info on Linux Sony Programmable I/O Device support. -thinkpad-acpi.txt - - information on the (IBM and Lenovo) ThinkPad ACPI Extras driver. -toshiba_haps.txt - - information on the Toshiba HDD Active Protection Sensor driver. diff --git a/Documentation/laptops/lg-laptop.rst b/Documentation/laptops/lg-laptop.rst new file mode 100644 index 000000000000..e486fe7ddc35 --- /dev/null +++ b/Documentation/laptops/lg-laptop.rst @@ -0,0 +1,81 @@ +.. SPDX-License-Identifier: GPL-2.0+ +LG Gram laptop extra features +============================= + +By Matan Ziv-Av + + +Hotkeys +------- + +The following FN keys are ignored by the kernel without this driver: +- FN-F1 (LG control panel) - Generates F15 +- FN-F5 (Touchpad toggle) - Generates F13 +- FN-F6 (Airplane mode) - Generates RFKILL +- FN-F8 (Keyboard backlight) - Generates F16. + This key also changes keyboard backlight mode. +- FN-F9 (Reader mode) - Generates F14 + +The rest of the FN key work without a need for a special driver. + + +Reader mode +----------- + +Writing 0/1 to /sys/devices/platform/lg-laptop/reader_mode disables/enables +reader mode. In this mode the screen colors change (blue color reduced), +and the reader mode indicator LED (on F9 key) turns on. + + +FN Lock +------- + +Writing 0/1 to /sys/devices/platform/lg-laptop/fn_lock disables/enables +FN lock. + + +Battery care limit +------------------ + +Writing 80/100 to /sys/devices/platform/lg-laptop/battery_care_limit +sets the maximum capacity to charge the battery. Limiting the charge +reduces battery capacity loss over time. + +This value is reset to 100 when the kernel boots. + + +Fan mode +-------- + +Writing 1/0 to /sys/devices/platform/lg-laptop/fan_mode disables/enables +the fan silent mode. + + +USB charge +---------- + +Writing 0/1 to /sys/devices/platform/lg-laptop/usb_charge disables/enables +charging another device from the USB port while the device is turned off. + +This value is reset to 0 when the kernel boots. + + +LEDs +~~~~ + +The are two LED devices supported by the driver: + +Keyboard backlight +------------------ + +A led device named kbd_led controls the keyboard backlight. There are three +lighting level: off (0), low (127) and high (255). + +The keyboard backlight is also controlled by the key combination FN-F8 +which cycles through those levels. + + +Touchpad indicator LED +---------------------- + +On the F5 key. Controlled by led device names tpad_led. diff --git a/Documentation/leds/00-INDEX b/Documentation/leds/00-INDEX deleted file mode 100644 index ae626b29a740..000000000000 --- a/Documentation/leds/00-INDEX +++ /dev/null @@ -1,32 +0,0 @@ -00-INDEX - - This file -leds-blinkm.txt - - Driver for BlinkM LED-devices. -leds-class.txt - - documents LED handling under Linux. -leds-class-flash.txt - - documents flash LED handling under Linux. -leds-lm3556.txt - - notes on how to use the leds-lm3556 driver. -leds-lp3944.txt - - notes on how to use the leds-lp3944 driver. -leds-lp5521.txt - - notes on how to use the leds-lp5521 driver. -leds-lp5523.txt - - notes on how to use the leds-lp5523 driver. -leds-lp5562.txt - - notes on how to use the leds-lp5562 driver. -leds-lp55xx.txt - - description about lp55xx common driver. -leds-lm3556.txt - - notes on how to use the leds-lm3556 driver. -leds-mlxcpld.txt - - notes on how to use the leds-mlxcpld driver. -ledtrig-oneshot.txt - - One-shot LED trigger for both sporadic and dense events. -ledtrig-transient.txt - - LED Transient Trigger, one shot timer activation. -ledtrig-usbport.txt - - notes on how to use the drivers/usb/core/ledtrig-usbport.c trigger. -uleds.txt - - notes on how to use the uleds driver. diff --git a/Documentation/locking/00-INDEX b/Documentation/locking/00-INDEX deleted file mode 100644 index c256c9bee2a4..000000000000 --- a/Documentation/locking/00-INDEX +++ /dev/null @@ -1,16 +0,0 @@ -00-INDEX - - this file. -lockdep-design.txt - - documentation on the runtime locking correctness validator. -lockstat.txt - - info on collecting statistics on locks (and contention). -mutex-design.txt - - info on the generic mutex subsystem. -rt-mutex-design.txt - - description of the RealTime mutex implementation design. -rt-mutex.txt - - desc. of RT-mutex subsystem with PI (Priority Inheritance) support. -spinlocks.txt - - info on using spinlocks to provide exclusive access in kernel. -ww-mutex-design.txt - - Intro to Mutex wait/would deadlock handling.s diff --git a/Documentation/locking/lockstat.txt b/Documentation/locking/lockstat.txt index 5786ad2cd5e6..fdbeb0c45ef3 100644 --- a/Documentation/locking/lockstat.txt +++ b/Documentation/locking/lockstat.txt @@ -91,7 +91,7 @@ Look at the current lock statistics: 07 &mm->mmap_sem-R: 37 100 1.31 299502.61 325629.52 3256.30 212344 34316685 0.10 7744.91 95016910.20 2.77 08 --------------- 09 &mm->mmap_sem 1 [] khugepaged_scan_mm_slot+0x57/0x280 -19 &mm->mmap_sem 96 [] __do_page_fault+0x1d4/0x510 +10 &mm->mmap_sem 96 [] __do_page_fault+0x1d4/0x510 11 &mm->mmap_sem 34 [] vm_mmap_pgoff+0x87/0xd0 12 &mm->mmap_sem 17 [] vm_munmap+0x41/0x80 13 --------------- diff --git a/Documentation/m68k/00-INDEX b/Documentation/m68k/00-INDEX deleted file mode 100644 index 2be8c6b00e74..000000000000 --- a/Documentation/m68k/00-INDEX +++ /dev/null @@ -1,7 +0,0 @@ -00-INDEX - - this file -README.buddha - - Amiga Buddha and Catweasel IDE Driver -kernel-options.txt - - command line options for Linux/m68k - diff --git a/Documentation/media/kapi/cec-core.rst b/Documentation/media/kapi/cec-core.rst index 1d989c544370..bca1d9d1d223 100644 --- a/Documentation/media/kapi/cec-core.rst +++ b/Documentation/media/kapi/cec-core.rst @@ -268,6 +268,10 @@ to 1, if the hardware does support retry then either set these counters to 0 if the hardware provides no feedback of which errors occurred and how many times, or fill in the correct values as reported by the hardware. +Be aware that calling these functions can immediately start a new transmit +if there is one pending in the queue. So make sure that the hardware is in +a state where new transmits can be started *before* calling these functions. + The cec_transmit_attempt_done() function is a helper for cases where the hardware never retries, so the transmit is always for just a single attempt. It will call cec_transmit_done() in turn, filling in 1 for the diff --git a/Documentation/media/kapi/mc-core.rst b/Documentation/media/kapi/mc-core.rst index 0c05503eaf1f..69362b3135c2 100644 --- a/Documentation/media/kapi/mc-core.rst +++ b/Documentation/media/kapi/mc-core.rst @@ -262,3 +262,5 @@ in the end provide a way to use driver-specific callbacks. .. kernel-doc:: include/media/media-devnode.h .. kernel-doc:: include/media/media-entity.h + +.. kernel-doc:: include/media/media-request.h diff --git a/Documentation/media/kapi/v4l2-subdev.rst b/Documentation/media/kapi/v4l2-subdev.rst index e1f0b726e438..1280e05b662b 100644 --- a/Documentation/media/kapi/v4l2-subdev.rst +++ b/Documentation/media/kapi/v4l2-subdev.rst @@ -247,20 +247,28 @@ performed using the :c:func:`v4l2_async_unregister_subdev` call. Subdevices registered this way are stored in a global list of subdevices, ready to be picked up by bridge drivers. -Bridge drivers in turn have to register a notifier object with an array of -subdevice descriptors that the bridge device needs for its operation. This is +Bridge drivers in turn have to register a notifier object. This is performed using the :c:func:`v4l2_async_notifier_register` call. To unregister the notifier the driver has to call :c:func:`v4l2_async_notifier_unregister`. The former of the two functions -takes two arguments: a pointer to struct :c:type:`v4l2_device` and a pointer to -struct :c:type:`v4l2_async_notifier`. The latter contains a pointer to an array -of pointers to subdevice descriptors of type struct :c:type:`v4l2_async_subdev` -type. The V4L2 core will then use these descriptors to match asynchronously -registered -subdevices to them. If a match is detected the ``.bound()`` notifier callback -is called. After all subdevices have been located the .complete() callback is -called. When a subdevice is removed from the system the .unbind() method is -called. All three callbacks are optional. +takes two arguments: a pointer to struct :c:type:`v4l2_device` and a +pointer to struct :c:type:`v4l2_async_notifier`. + +Before registering the notifier, bridge drivers must do two things: +first, the notifier must be initialized using the +:c:func:`v4l2_async_notifier_init`. Second, bridge drivers can then +begin to form a list of subdevice descriptors that the bridge device +needs for its operation. Subdevice descriptors are added to the notifier +using the :c:func:`v4l2_async_notifier_add_subdev` call. This function +takes two arguments: a pointer to struct :c:type:`v4l2_async_notifier`, +and a pointer to the subdevice descripter, which is of type struct +:c:type:`v4l2_async_subdev`. + +The V4L2 core will then use these descriptors to match asynchronously +registered subdevices to them. If a match is detected the ``.bound()`` +notifier callback is called. After all subdevices have been located the +.complete() callback is called. When a subdevice is removed from the +system the .unbind() method is called. All three callbacks are optional. V4L2 sub-device userspace API ----------------------------- diff --git a/Documentation/media/uapi/cec/cec-func-poll.rst b/Documentation/media/uapi/cec/cec-func-poll.rst index d49f1ee0742d..c698c969635c 100644 --- a/Documentation/media/uapi/cec/cec-func-poll.rst +++ b/Documentation/media/uapi/cec/cec-func-poll.rst @@ -74,4 +74,5 @@ is returned, and the ``errno`` variable is set appropriately: The call was interrupted by a signal. ``EINVAL`` - The ``nfds`` argument is greater than ``OPEN_MAX``. + The ``nfds`` value exceeds the ``RLIMIT_NOFILE`` value. Use + ``getrlimit()`` to obtain this value. diff --git a/Documentation/media/uapi/cec/cec-ioc-receive.rst b/Documentation/media/uapi/cec/cec-ioc-receive.rst index e964074cd15b..b25e48afaa08 100644 --- a/Documentation/media/uapi/cec/cec-ioc-receive.rst +++ b/Documentation/media/uapi/cec/cec-ioc-receive.rst @@ -16,10 +16,10 @@ CEC_RECEIVE, CEC_TRANSMIT - Receive or transmit a CEC message Synopsis ======== -.. c:function:: int ioctl( int fd, CEC_RECEIVE, struct cec_msg *argp ) +.. c:function:: int ioctl( int fd, CEC_RECEIVE, struct cec_msg \*argp ) :name: CEC_RECEIVE -.. c:function:: int ioctl( int fd, CEC_TRANSMIT, struct cec_msg *argp ) +.. c:function:: int ioctl( int fd, CEC_TRANSMIT, struct cec_msg \*argp ) :name: CEC_TRANSMIT Arguments @@ -272,6 +272,19 @@ View On' messages from initiator 0xf ('Unregistered') to destination 0 ('TV'). - The transmit failed after one or more retries. This status bit is mutually exclusive with :ref:`CEC_TX_STATUS_OK `. Other bits can still be set to explain which failures were seen. + * .. _`CEC-TX-STATUS-ABORTED`: + + - ``CEC_TX_STATUS_ABORTED`` + - 0x40 + - The transmit was aborted due to an HDMI disconnect, or the adapter + was unconfigured, or a transmit was interrupted, or the driver + returned an error when attempting to start a transmit. + * .. _`CEC-TX-STATUS-TIMEOUT`: + + - ``CEC_TX_STATUS_TIMEOUT`` + - 0x80 + - The transmit timed out. This should not normally happen and this + indicates a driver problem. .. tabularcolumns:: |p{5.6cm}|p{0.9cm}|p{11.0cm}| @@ -300,6 +313,14 @@ View On' messages from initiator 0xf ('Unregistered') to destination 0 ('TV'). - The message was received successfully but the reply was ``CEC_MSG_FEATURE_ABORT``. This status is only set if this message was the reply to an earlier transmitted message. + * .. _`CEC-RX-STATUS-ABORTED`: + + - ``CEC_RX_STATUS_ABORTED`` + - 0x08 + - The wait for a reply to an earlier transmitted message was aborted + because the HDMI cable was disconnected, the adapter was unconfigured + or the :ref:`CEC_TRANSMIT ` that waited for a + reply was interrupted. diff --git a/Documentation/media/uapi/mediactl/media-controller.rst b/Documentation/media/uapi/mediactl/media-controller.rst index 0eea4f9a07d5..66aff38cd499 100644 --- a/Documentation/media/uapi/mediactl/media-controller.rst +++ b/Documentation/media/uapi/mediactl/media-controller.rst @@ -21,6 +21,7 @@ Part IV - Media Controller API media-controller-intro media-controller-model media-types + request-api media-funcs media-header diff --git a/Documentation/media/uapi/mediactl/media-funcs.rst b/Documentation/media/uapi/mediactl/media-funcs.rst index 076856501cdb..260f9dcadcde 100644 --- a/Documentation/media/uapi/mediactl/media-funcs.rst +++ b/Documentation/media/uapi/mediactl/media-funcs.rst @@ -16,3 +16,9 @@ Function Reference media-ioc-enum-entities media-ioc-enum-links media-ioc-setup-link + media-ioc-request-alloc + request-func-close + request-func-ioctl + request-func-poll + media-request-ioc-queue + media-request-ioc-reinit diff --git a/Documentation/media/uapi/mediactl/media-ioc-device-info.rst b/Documentation/media/uapi/mediactl/media-ioc-device-info.rst index 649cb3d9e058..c6f224e404b7 100644 --- a/Documentation/media/uapi/mediactl/media-ioc-device-info.rst +++ b/Documentation/media/uapi/mediactl/media-ioc-device-info.rst @@ -26,6 +26,7 @@ Arguments File descriptor returned by :ref:`open() `. ``argp`` + Pointer to struct :c:type:`media_device_info`. Description diff --git a/Documentation/media/uapi/mediactl/media-ioc-enum-entities.rst b/Documentation/media/uapi/mediactl/media-ioc-enum-entities.rst index fc2e39c070c9..02738640e34e 100644 --- a/Documentation/media/uapi/mediactl/media-ioc-enum-entities.rst +++ b/Documentation/media/uapi/mediactl/media-ioc-enum-entities.rst @@ -26,6 +26,7 @@ Arguments File descriptor returned by :ref:`open() `. ``argp`` + Pointer to struct :c:type:`media_entity_desc`. Description diff --git a/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst b/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst index f158c134e9b0..b89aaae373df 100644 --- a/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst +++ b/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst @@ -26,6 +26,7 @@ Arguments File descriptor returned by :ref:`open() `. ``argp`` + Pointer to struct :c:type:`media_links_enum`. Description diff --git a/Documentation/media/uapi/mediactl/media-ioc-g-topology.rst b/Documentation/media/uapi/mediactl/media-ioc-g-topology.rst index bac128c7eda9..4e1c59238371 100644 --- a/Documentation/media/uapi/mediactl/media-ioc-g-topology.rst +++ b/Documentation/media/uapi/mediactl/media-ioc-g-topology.rst @@ -26,6 +26,7 @@ Arguments File descriptor returned by :ref:`open() `. ``argp`` + Pointer to struct :c:type:`media_v2_topology`. Description diff --git a/Documentation/media/uapi/mediactl/media-ioc-request-alloc.rst b/Documentation/media/uapi/mediactl/media-ioc-request-alloc.rst new file mode 100644 index 000000000000..0f8b31874002 --- /dev/null +++ b/Documentation/media/uapi/mediactl/media-ioc-request-alloc.rst @@ -0,0 +1,66 @@ +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.1-or-later WITH no-invariant-sections + +.. _media_ioc_request_alloc: + +***************************** +ioctl MEDIA_IOC_REQUEST_ALLOC +***************************** + +Name +==== + +MEDIA_IOC_REQUEST_ALLOC - Allocate a request + + +Synopsis +======== + +.. c:function:: int ioctl( int fd, MEDIA_IOC_REQUEST_ALLOC, int *argp ) + :name: MEDIA_IOC_REQUEST_ALLOC + + +Arguments +========= + +``fd`` + File descriptor returned by :ref:`open() `. + +``argp`` + Pointer to an integer. + + +Description +=========== + +If the media device supports :ref:`requests `, then +this ioctl can be used to allocate a request. If it is not supported, then +``errno`` is set to ``ENOTTY``. A request is accessed through a file descriptor +that is returned in ``*argp``. + +If the request was successfully allocated, then the request file descriptor +can be passed to the :ref:`VIDIOC_QBUF `, +:ref:`VIDIOC_G_EXT_CTRLS `, +:ref:`VIDIOC_S_EXT_CTRLS ` and +:ref:`VIDIOC_TRY_EXT_CTRLS ` ioctls. + +In addition, the request can be queued by calling +:ref:`MEDIA_REQUEST_IOC_QUEUE` and re-initialized by calling +:ref:`MEDIA_REQUEST_IOC_REINIT`. + +Finally, the file descriptor can be :ref:`polled ` to wait +for the request to complete. + +The request will remain allocated until all the file descriptors associated +with it are closed by :ref:`close() ` and the driver no +longer uses the request internally. See also +:ref:`here ` for more information. + +Return Value +============ + +On success 0 is returned, on error -1 and the ``errno`` variable is set +appropriately. The generic error codes are described at the +:ref:`Generic Error Codes ` chapter. + +ENOTTY + The driver has no support for requests. diff --git a/Documentation/media/uapi/mediactl/media-ioc-setup-link.rst b/Documentation/media/uapi/mediactl/media-ioc-setup-link.rst index ae5194940100..e345e7dc9ad7 100644 --- a/Documentation/media/uapi/mediactl/media-ioc-setup-link.rst +++ b/Documentation/media/uapi/mediactl/media-ioc-setup-link.rst @@ -26,6 +26,7 @@ Arguments File descriptor returned by :ref:`open() `. ``argp`` + Pointer to struct :c:type:`media_link_desc`. Description diff --git a/Documentation/media/uapi/mediactl/media-request-ioc-queue.rst b/Documentation/media/uapi/mediactl/media-request-ioc-queue.rst new file mode 100644 index 000000000000..6dd2d7fea714 --- /dev/null +++ b/Documentation/media/uapi/mediactl/media-request-ioc-queue.rst @@ -0,0 +1,78 @@ +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.1-or-later WITH no-invariant-sections + +.. _media_request_ioc_queue: + +***************************** +ioctl MEDIA_REQUEST_IOC_QUEUE +***************************** + +Name +==== + +MEDIA_REQUEST_IOC_QUEUE - Queue a request + + +Synopsis +======== + +.. c:function:: int ioctl( int request_fd, MEDIA_REQUEST_IOC_QUEUE ) + :name: MEDIA_REQUEST_IOC_QUEUE + + +Arguments +========= + +``request_fd`` + File descriptor returned by :ref:`MEDIA_IOC_REQUEST_ALLOC`. + + +Description +=========== + +If the media device supports :ref:`requests `, then +this request ioctl can be used to queue a previously allocated request. + +If the request was successfully queued, then the file descriptor can be +:ref:`polled ` to wait for the request to complete. + +If the request was already queued before, then ``EBUSY`` is returned. +Other errors can be returned if the contents of the request contained +invalid or inconsistent data, see the next section for a list of +common error codes. On error both the request and driver state are unchanged. + +Once a request is queued, then the driver is required to gracefully handle +errors that occur when the request is applied to the hardware. The +exception is the ``EIO`` error which signals a fatal error that requires +the application to stop streaming to reset the hardware state. + +It is not allowed to mix queuing requests with queuing buffers directly +(without a request). ``EBUSY`` will be returned if the first buffer was +queued directly and you next try to queue a request, or vice versa. + +A request must contain at least one buffer, otherwise this ioctl will +return an ``ENOENT`` error. + +Return Value +============ + +On success 0 is returned, on error -1 and the ``errno`` variable is set +appropriately. The generic error codes are described at the +:ref:`Generic Error Codes ` chapter. + +EBUSY + The request was already queued or the application queued the first + buffer directly, but later attempted to use a request. It is not permitted + to mix the two APIs. +ENOENT + The request did not contain any buffers. All requests are required + to have at least one buffer. This can also be returned if some required + configuration is missing in the request. +ENOMEM + Out of memory when allocating internal data structures for this + request. +EINVAL + The request has invalid data. +EIO + The hardware is in a bad state. To recover, the application needs to + stop streaming to reset the hardware state and then try to restart + streaming. diff --git a/Documentation/media/uapi/mediactl/media-request-ioc-reinit.rst b/Documentation/media/uapi/mediactl/media-request-ioc-reinit.rst new file mode 100644 index 000000000000..febe888494c8 --- /dev/null +++ b/Documentation/media/uapi/mediactl/media-request-ioc-reinit.rst @@ -0,0 +1,51 @@ +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.1-or-later WITH no-invariant-sections + +.. _media_request_ioc_reinit: + +****************************** +ioctl MEDIA_REQUEST_IOC_REINIT +****************************** + +Name +==== + +MEDIA_REQUEST_IOC_REINIT - Re-initialize a request + + +Synopsis +======== + +.. c:function:: int ioctl( int request_fd, MEDIA_REQUEST_IOC_REINIT ) + :name: MEDIA_REQUEST_IOC_REINIT + + +Arguments +========= + +``request_fd`` + File descriptor returned by :ref:`MEDIA_IOC_REQUEST_ALLOC`. + +Description +=========== + +If the media device supports :ref:`requests `, then +this request ioctl can be used to re-initialize a previously allocated +request. + +Re-initializing a request will clear any existing data from the request. +This avoids having to :ref:`close() ` a completed +request and allocate a new request. Instead the completed request can just +be re-initialized and it is ready to be used again. + +A request can only be re-initialized if it either has not been queued +yet, or if it was queued and completed. Otherwise it will set ``errno`` +to ``EBUSY``. No other error codes can be returned. + +Return Value +============ + +On success 0 is returned, on error -1 and the ``errno`` variable is set +appropriately. + +EBUSY + The request is queued but not yet completed. diff --git a/Documentation/media/uapi/mediactl/request-api.rst b/Documentation/media/uapi/mediactl/request-api.rst new file mode 100644 index 000000000000..5f4a23029c48 --- /dev/null +++ b/Documentation/media/uapi/mediactl/request-api.rst @@ -0,0 +1,252 @@ +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.1-or-later WITH no-invariant-sections + +.. _media-request-api: + +Request API +=========== + +The Request API has been designed to allow V4L2 to deal with requirements of +modern devices (stateless codecs, complex camera pipelines, ...) and APIs +(Android Codec v2). One such requirement is the ability for devices belonging to +the same pipeline to reconfigure and collaborate closely on a per-frame basis. +Another is support of stateless codecs, which require controls to be applied +to specific frames (aka 'per-frame controls') in order to be used efficiently. + +While the initial use-case was V4L2, it can be extended to other subsystems +as well, as long as they use the media controller. + +Supporting these features without the Request API is not always possible and if +it is, it is terribly inefficient: user-space would have to flush all activity +on the media pipeline, reconfigure it for the next frame, queue the buffers to +be processed with that configuration, and wait until they are all available for +dequeuing before considering the next frame. This defeats the purpose of having +buffer queues since in practice only one buffer would be queued at a time. + +The Request API allows a specific configuration of the pipeline (media +controller topology + configuration for each media entity) to be associated with +specific buffers. This allows user-space to schedule several tasks ("requests") +with different configurations in advance, knowing that the configuration will be +applied when needed to get the expected result. Configuration values at the time +of request completion are also available for reading. + +Usage +===== + +The Request API extends the Media Controller API and cooperates with +subsystem-specific APIs to support request usage. At the Media Controller +level, requests are allocated from the supporting Media Controller device +node. Their life cycle is then managed through the request file descriptors in +an opaque way. Configuration data, buffer handles and processing results +stored in requests are accessed through subsystem-specific APIs extended for +request support, such as V4L2 APIs that take an explicit ``request_fd`` +parameter. + +Request Allocation +------------------ + +User-space allocates requests using :ref:`MEDIA_IOC_REQUEST_ALLOC` +for the media device node. This returns a file descriptor representing the +request. Typically, several such requests will be allocated. + +Request Preparation +------------------- + +Standard V4L2 ioctls can then receive a request file descriptor to express the +fact that the ioctl is part of said request, and is not to be applied +immediately. See :ref:`MEDIA_IOC_REQUEST_ALLOC` for a list of ioctls that +support this. Configurations set with a ``request_fd`` parameter are stored +instead of being immediately applied, and buffers queued to a request do not +enter the regular buffer queue until the request itself is queued. + +Request Submission +------------------ + +Once the configuration and buffers of the request are specified, it can be +queued by calling :ref:`MEDIA_REQUEST_IOC_QUEUE` on the request file descriptor. +A request must contain at least one buffer, otherwise ``ENOENT`` is returned. +A queued request cannot be modified anymore. + +.. caution:: + For :ref:`memory-to-memory devices ` you can use requests only for + output buffers, not for capture buffers. Attempting to add a capture buffer + to a request will result in an ``EACCES`` error. + +If the request contains configurations for multiple entities, individual drivers +may synchronize so the requested pipeline's topology is applied before the +buffers are processed. Media controller drivers do a best effort implementation +since perfect atomicity may not be possible due to hardware limitations. + +.. caution:: + + It is not allowed to mix queuing requests with directly queuing buffers: + whichever method is used first locks this in place until + :ref:`VIDIOC_STREAMOFF ` is called or the device is + :ref:`closed `. Attempts to directly queue a buffer when earlier + a buffer was queued via a request or vice versa will result in an ``EBUSY`` + error. + +Controls can still be set without a request and are applied immediately, +regardless of whether a request is in use or not. + +.. caution:: + + Setting the same control through a request and also directly can lead to + undefined behavior! + +User-space can :ref:`poll() ` a request file descriptor in +order to wait until the request completes. A request is considered complete +once all its associated buffers are available for dequeuing and all the +associated controls have been updated with the values at the time of completion. +Note that user-space does not need to wait for the request to complete to +dequeue its buffers: buffers that are available halfway through a request can +be dequeued independently of the request's state. + +A completed request contains the state of the device after the request was +executed. User-space can query that state by calling +:ref:`ioctl VIDIOC_G_EXT_CTRLS ` with the request file +descriptor. Calling :ref:`ioctl VIDIOC_G_EXT_CTRLS ` for a +request that has been queued but not yet completed will return ``EBUSY`` +since the control values might be changed at any time by the driver while the +request is in flight. + +.. _media-request-life-time: + +Recycling and Destruction +------------------------- + +Finally, a completed request can either be discarded or be reused. Calling +:ref:`close() ` on a request file descriptor will make +that file descriptor unusable and the request will be freed once it is no +longer in use by the kernel. That is, if the request is queued and then the +file descriptor is closed, then it won't be freed until the driver completed +the request. + +The :ref:`MEDIA_REQUEST_IOC_REINIT` will clear a request's state and make it +available again. No state is retained by this operation: the request is as +if it had just been allocated. + +Example for a Codec Device +-------------------------- + +For use-cases such as :ref:`codecs `, the request API can be used +to associate specific controls to +be applied by the driver for the OUTPUT buffer, allowing user-space +to queue many such buffers in advance. It can also take advantage of requests' +ability to capture the state of controls when the request completes to read back +information that may be subject to change. + +Put into code, after obtaining a request, user-space can assign controls and one +OUTPUT buffer to it: + +.. code-block:: c + + struct v4l2_buffer buf; + struct v4l2_ext_controls ctrls; + int req_fd; + ... + if (ioctl(media_fd, MEDIA_IOC_REQUEST_ALLOC, &req_fd)) + return errno; + ... + ctrls.which = V4L2_CTRL_WHICH_REQUEST_VAL; + ctrls.request_fd = req_fd; + if (ioctl(codec_fd, VIDIOC_S_EXT_CTRLS, &ctrls)) + return errno; + ... + buf.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; + buf.flags |= V4L2_BUF_FLAG_REQUEST_FD; + buf.request_fd = req_fd; + if (ioctl(codec_fd, VIDIOC_QBUF, &buf)) + return errno; + +Note that it is not allowed to use the Request API for CAPTURE buffers +since there are no per-frame settings to report there. + +Once the request is fully prepared, it can be queued to the driver: + +.. code-block:: c + + if (ioctl(req_fd, MEDIA_REQUEST_IOC_QUEUE)) + return errno; + +User-space can then either wait for the request to complete by calling poll() on +its file descriptor, or start dequeuing CAPTURE buffers. Most likely, it will +want to get CAPTURE buffers as soon as possible and this can be done using a +regular :ref:`VIDIOC_DQBUF `: + +.. code-block:: c + + struct v4l2_buffer buf; + + memset(&buf, 0, sizeof(buf)); + buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + if (ioctl(codec_fd, VIDIOC_DQBUF, &buf)) + return errno; + +Note that this example assumes for simplicity that for every OUTPUT buffer +there will be one CAPTURE buffer, but this does not have to be the case. + +We can then, after ensuring that the request is completed via polling the +request file descriptor, query control values at the time of its completion via +a call to :ref:`VIDIOC_G_EXT_CTRLS `. +This is particularly useful for volatile controls for which we want to +query values as soon as the capture buffer is produced. + +.. code-block:: c + + struct pollfd pfd = { .events = POLLPRI, .fd = req_fd }; + poll(&pfd, 1, -1); + ... + ctrls.which = V4L2_CTRL_WHICH_REQUEST_VAL; + ctrls.request_fd = req_fd; + if (ioctl(codec_fd, VIDIOC_G_EXT_CTRLS, &ctrls)) + return errno; + +Once we don't need the request anymore, we can either recycle it for reuse with +:ref:`MEDIA_REQUEST_IOC_REINIT`... + +.. code-block:: c + + if (ioctl(req_fd, MEDIA_REQUEST_IOC_REINIT)) + return errno; + +... or close its file descriptor to completely dispose of it. + +.. code-block:: c + + close(req_fd); + +Example for a Simple Capture Device +----------------------------------- + +With a simple capture device, requests can be used to specify controls to apply +for a given CAPTURE buffer. + +.. code-block:: c + + struct v4l2_buffer buf; + struct v4l2_ext_controls ctrls; + int req_fd; + ... + if (ioctl(media_fd, MEDIA_IOC_REQUEST_ALLOC, &req_fd)) + return errno; + ... + ctrls.which = V4L2_CTRL_WHICH_REQUEST_VAL; + ctrls.request_fd = req_fd; + if (ioctl(camera_fd, VIDIOC_S_EXT_CTRLS, &ctrls)) + return errno; + ... + buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + buf.flags |= V4L2_BUF_FLAG_REQUEST_FD; + buf.request_fd = req_fd; + if (ioctl(camera_fd, VIDIOC_QBUF, &buf)) + return errno; + +Once the request is fully prepared, it can be queued to the driver: + +.. code-block:: c + + if (ioctl(req_fd, MEDIA_REQUEST_IOC_QUEUE)) + return errno; + +User-space can then dequeue buffers, wait for the request completion, query +controls and recycle the request as in the M2M example above. diff --git a/Documentation/media/uapi/mediactl/request-func-close.rst b/Documentation/media/uapi/mediactl/request-func-close.rst new file mode 100644 index 000000000000..098d7f2b9548 --- /dev/null +++ b/Documentation/media/uapi/mediactl/request-func-close.rst @@ -0,0 +1,49 @@ +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.1-or-later WITH no-invariant-sections + +.. _request-func-close: + +*************** +request close() +*************** + +Name +==== + +request-close - Close a request file descriptor + + +Synopsis +======== + +.. code-block:: c + + #include + + +.. c:function:: int close( int fd ) + :name: req-close + +Arguments +========= + +``fd`` + File descriptor returned by :ref:`MEDIA_IOC_REQUEST_ALLOC`. + + +Description +=========== + +Closes the request file descriptor. Resources associated with the request +are freed once all file descriptors associated with the request are closed +and the driver has completed the request. +See :ref:`here ` for more information. + + +Return Value +============ + +:ref:`close() ` returns 0 on success. On error, -1 is +returned, and ``errno`` is set appropriately. Possible error codes are: + +EBADF + ``fd`` is not a valid open file descriptor. diff --git a/Documentation/media/uapi/mediactl/request-func-ioctl.rst b/Documentation/media/uapi/mediactl/request-func-ioctl.rst new file mode 100644 index 000000000000..ff7b072a6999 --- /dev/null +++ b/Documentation/media/uapi/mediactl/request-func-ioctl.rst @@ -0,0 +1,67 @@ +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.1-or-later WITH no-invariant-sections + +.. _request-func-ioctl: + +*************** +request ioctl() +*************** + +Name +==== + +request-ioctl - Control a request file descriptor + + +Synopsis +======== + +.. code-block:: c + + #include + + +.. c:function:: int ioctl( int fd, int cmd, void *argp ) + :name: req-ioctl + +Arguments +========= + +``fd`` + File descriptor returned by :ref:`MEDIA_IOC_REQUEST_ALLOC`. + +``cmd`` + The request ioctl command code as defined in the media.h header file, for + example :ref:`MEDIA_REQUEST_IOC_QUEUE`. + +``argp`` + Pointer to a request-specific structure. + + +Description +=========== + +The :ref:`ioctl() ` function manipulates request +parameters. The argument ``fd`` must be an open file descriptor. + +The ioctl ``cmd`` code specifies the request function to be called. It +has encoded in it whether the argument is an input, output or read/write +parameter, and the size of the argument ``argp`` in bytes. + +Macros and structures definitions specifying request ioctl commands and +their parameters are located in the media.h header file. All request ioctl +commands, their respective function and parameters are specified in +:ref:`media-user-func`. + + +Return Value +============ + +On success 0 is returned, on error -1 and the ``errno`` variable is set +appropriately. The generic error codes are described at the +:ref:`Generic Error Codes ` chapter. + +Command-specific error codes are listed in the individual command +descriptions. + +When an ioctl that takes an output or read/write parameter fails, the +parameter remains unmodified. diff --git a/Documentation/media/uapi/mediactl/request-func-poll.rst b/Documentation/media/uapi/mediactl/request-func-poll.rst new file mode 100644 index 000000000000..85191254f381 --- /dev/null +++ b/Documentation/media/uapi/mediactl/request-func-poll.rst @@ -0,0 +1,77 @@ +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.1-or-later WITH no-invariant-sections + +.. _request-func-poll: + +************** +request poll() +************** + +Name +==== + +request-poll - Wait for some event on a file descriptor + + +Synopsis +======== + +.. code-block:: c + + #include + + +.. c:function:: int poll( struct pollfd *ufds, unsigned int nfds, int timeout ) + :name: request-poll + +Arguments +========= + +``ufds`` + List of file descriptor events to be watched + +``nfds`` + Number of file descriptor events at the \*ufds array + +``timeout`` + Timeout to wait for events + + +Description +=========== + +With the :c:func:`poll() ` function applications can wait +for a request to complete. + +On success :c:func:`poll() ` returns the number of file +descriptors that have been selected (that is, file descriptors for which the +``revents`` field of the respective struct :c:type:`pollfd` +is non-zero). Request file descriptor set the ``POLLPRI`` flag in ``revents`` +when the request was completed. When the function times out it returns +a value of zero, on failure it returns -1 and the ``errno`` variable is +set appropriately. + +Attempting to poll for a request that is not yet queued will +set the ``POLLERR`` flag in ``revents``. + + +Return Value +============ + +On success, :c:func:`poll() ` returns the number of +structures which have non-zero ``revents`` fields, or zero if the call +timed out. On error -1 is returned, and the ``errno`` variable is set +appropriately: + +``EBADF`` + One or more of the ``ufds`` members specify an invalid file + descriptor. + +``EFAULT`` + ``ufds`` references an inaccessible memory area. + +``EINTR`` + The call was interrupted by a signal. + +``EINVAL`` + The ``nfds`` value exceeds the ``RLIMIT_NOFILE`` value. Use + ``getrlimit()`` to obtain this value. diff --git a/Documentation/media/uapi/v4l/biblio.rst b/Documentation/media/uapi/v4l/biblio.rst index 1cedcfc04327..386d6cf83e9c 100644 --- a/Documentation/media/uapi/v4l/biblio.rst +++ b/Documentation/media/uapi/v4l/biblio.rst @@ -226,16 +226,6 @@ xvYCC :author: International Electrotechnical Commission (http://www.iec.ch) -.. _adobergb: - -AdobeRGB -======== - - -:title: Adobe© RGB (1998) Color Image Encoding Version 2005-05 - -:author: Adobe Systems Incorporated (http://www.adobe.com) - .. _oprgb: opRGB diff --git a/Documentation/media/uapi/v4l/buffer.rst b/Documentation/media/uapi/v4l/buffer.rst index e2c85ddc990b..2e266d32470a 100644 --- a/Documentation/media/uapi/v4l/buffer.rst +++ b/Documentation/media/uapi/v4l/buffer.rst @@ -306,10 +306,23 @@ struct v4l2_buffer - A place holder for future extensions. Drivers and applications must set this to 0. * - __u32 - - ``reserved`` + - ``request_fd`` - - - A place holder for future extensions. Drivers and applications - must set this to 0. + - The file descriptor of the request to queue the buffer to. If the flag + ``V4L2_BUF_FLAG_REQUEST_FD`` is set, then the buffer will be + queued to this request. If the flag is not set, then this field will + be ignored. + + The ``V4L2_BUF_FLAG_REQUEST_FD`` flag and this field are only used by + :ref:`ioctl VIDIOC_QBUF ` and ignored by other ioctls that + take a :c:type:`v4l2_buffer` as argument. + + Applications should not set ``V4L2_BUF_FLAG_REQUEST_FD`` for any ioctls + other than :ref:`VIDIOC_QBUF `. + + If the device does not support requests, then ``EACCES`` will be returned. + If requests are supported but an invalid request file descriptor is + given, then ``EINVAL`` will be returned. @@ -514,6 +527,11 @@ Buffer Flags streaming may continue as normal and the buffer may be reused normally. Drivers set this flag when the ``VIDIOC_DQBUF`` ioctl is called. + * .. _`V4L2-BUF-FLAG-IN-REQUEST`: + + - ``V4L2_BUF_FLAG_IN_REQUEST`` + - 0x00000080 + - This buffer is part of a request that hasn't been queued yet. * .. _`V4L2-BUF-FLAG-KEYFRAME`: - ``V4L2_BUF_FLAG_KEYFRAME`` @@ -589,6 +607,11 @@ Buffer Flags the format. Any Any subsequent call to the :ref:`VIDIOC_DQBUF ` ioctl will not block anymore, but return an ``EPIPE`` error code. + * .. _`V4L2-BUF-FLAG-REQUEST-FD`: + + - ``V4L2_BUF_FLAG_REQUEST_FD`` + - 0x00800000 + - The ``request_fd`` field contains a valid file descriptor. * .. _`V4L2-BUF-FLAG-TIMESTAMP-MASK`: - ``V4L2_BUF_FLAG_TIMESTAMP_MASK`` diff --git a/Documentation/media/uapi/v4l/colorspaces-defs.rst b/Documentation/media/uapi/v4l/colorspaces-defs.rst index 410907fe9415..f24615544792 100644 --- a/Documentation/media/uapi/v4l/colorspaces-defs.rst +++ b/Documentation/media/uapi/v4l/colorspaces-defs.rst @@ -51,8 +51,8 @@ whole range, 0-255, dividing the angular value by 1.41. The enum - See :ref:`col-rec709`. * - ``V4L2_COLORSPACE_SRGB`` - See :ref:`col-srgb`. - * - ``V4L2_COLORSPACE_ADOBERGB`` - - See :ref:`col-adobergb`. + * - ``V4L2_COLORSPACE_OPRGB`` + - See :ref:`col-oprgb`. * - ``V4L2_COLORSPACE_BT2020`` - See :ref:`col-bt2020`. * - ``V4L2_COLORSPACE_DCI_P3`` @@ -90,8 +90,8 @@ whole range, 0-255, dividing the angular value by 1.41. The enum - Use the Rec. 709 transfer function. * - ``V4L2_XFER_FUNC_SRGB`` - Use the sRGB transfer function. - * - ``V4L2_XFER_FUNC_ADOBERGB`` - - Use the AdobeRGB transfer function. + * - ``V4L2_XFER_FUNC_OPRGB`` + - Use the opRGB transfer function. * - ``V4L2_XFER_FUNC_SMPTE240M`` - Use the SMPTE 240M transfer function. * - ``V4L2_XFER_FUNC_NONE`` diff --git a/Documentation/media/uapi/v4l/colorspaces-details.rst b/Documentation/media/uapi/v4l/colorspaces-details.rst index b5d551b9cc8f..09fabf4cd412 100644 --- a/Documentation/media/uapi/v4l/colorspaces-details.rst +++ b/Documentation/media/uapi/v4l/colorspaces-details.rst @@ -290,15 +290,14 @@ Y' is clamped to the range [0…1] and Cb and Cr are clamped to the range 170M/BT.601. The Y'CbCr quantization is limited range. -.. _col-adobergb: +.. _col-oprgb: -Colorspace Adobe RGB (V4L2_COLORSPACE_ADOBERGB) +Colorspace opRGB (V4L2_COLORSPACE_OPRGB) =============================================== -The :ref:`adobergb` standard defines the colorspace used by computer -graphics that use the AdobeRGB colorspace. This is also known as the -:ref:`oprgb` standard. The default transfer function is -``V4L2_XFER_FUNC_ADOBERGB``. The default Y'CbCr encoding is +The :ref:`oprgb` standard defines the colorspace used by computer +graphics that use the opRGB colorspace. The default transfer function is +``V4L2_XFER_FUNC_OPRGB``. The default Y'CbCr encoding is ``V4L2_YCBCR_ENC_601``. The default Y'CbCr quantization is limited range. @@ -312,7 +311,7 @@ The chromaticities of the primary colors and the white reference are: .. tabularcolumns:: |p{4.4cm}|p{4.4cm}|p{8.7cm}| -.. flat-table:: Adobe RGB Chromaticities +.. flat-table:: opRGB Chromaticities :header-rows: 1 :stub-columns: 0 :widths: 1 1 2 diff --git a/Documentation/media/uapi/v4l/extended-controls.rst b/Documentation/media/uapi/v4l/extended-controls.rst index 9f7312bf3365..65a1d873196b 100644 --- a/Documentation/media/uapi/v4l/extended-controls.rst +++ b/Documentation/media/uapi/v4l/extended-controls.rst @@ -1497,6 +1497,182 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type - +.. _v4l2-mpeg-mpeg2: + +``V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS (struct)`` + Specifies the slice parameters (as extracted from the bitstream) for the + associated MPEG-2 slice data. This includes the necessary parameters for + configuring a stateless hardware decoding pipeline for MPEG-2. + The bitstream parameters are defined according to :ref:`mpeg2part2`. + +.. c:type:: v4l2_ctrl_mpeg2_slice_params + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_ctrl_mpeg2_slice_params + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u32 + - ``bit_size`` + - Size (in bits) of the current slice data. + * - __u32 + - ``data_bit_offset`` + - Offset (in bits) to the video data in the current slice data. + * - struct :c:type:`v4l2_mpeg2_sequence` + - ``sequence`` + - Structure with MPEG-2 sequence metadata, merging relevant fields from + the sequence header and sequence extension parts of the bitstream. + * - struct :c:type:`v4l2_mpeg2_picture` + - ``picture`` + - Structure with MPEG-2 picture metadata, merging relevant fields from + the picture header and picture coding extension parts of the bitstream. + * - __u8 + - ``quantiser_scale_code`` + - Code used to determine the quantization scale to use for the IDCT. + * - __u8 + - ``backward_ref_index`` + - Index for the V4L2 buffer to use as backward reference, used with + B-coded and P-coded frames. + * - __u8 + - ``forward_ref_index`` + - Index for the V4L2 buffer to use as forward reference, used with + B-coded frames. + +.. c:type:: v4l2_mpeg2_sequence + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_mpeg2_sequence + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u16 + - ``horizontal_size`` + - The width of the displayable part of the frame's luminance component. + * - __u16 + - ``vertical_size`` + - The height of the displayable part of the frame's luminance component. + * - __u32 + - ``vbv_buffer_size`` + - Used to calculate the required size of the video buffering verifier, + defined (in bits) as: 16 * 1024 * vbv_buffer_size. + * - __u8 + - ``profile_and_level_indication`` + - The current profile and level indication as extracted from the + bitstream. + * - __u8 + - ``progressive_sequence`` + - Indication that all the frames for the sequence are progressive instead + of interlaced. + * - __u8 + - ``chroma_format`` + - The chrominance sub-sampling format (1: 4:2:0, 2: 4:2:2, 3: 4:4:4). + +.. c:type:: v4l2_mpeg2_picture + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_mpeg2_picture + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``picture_coding_type`` + - Picture coding type for the frame covered by the current slice + (V4L2_MPEG2_PICTURE_CODING_TYPE_I, V4L2_MPEG2_PICTURE_CODING_TYPE_P or + V4L2_MPEG2_PICTURE_CODING_TYPE_B). + * - __u8 + - ``f_code[2][2]`` + - Motion vector codes. + * - __u8 + - ``intra_dc_precision`` + - Precision of Discrete Cosine transform (0: 8 bits precision, + 1: 9 bits precision, 2: 10 bits precision, 3: 11 bits precision). + * - __u8 + - ``picture_structure`` + - Picture structure (1: interlaced top field, 2: interlaced bottom field, + 3: progressive frame). + * - __u8 + - ``top_field_first`` + - If set to 1 and interlaced stream, top field is output first. + * - __u8 + - ``frame_pred_frame_dct`` + - If set to 1, only frame-DCT and frame prediction are used. + * - __u8 + - ``concealment_motion_vectors`` + - If set to 1, motion vectors are coded for intra macroblocks. + * - __u8 + - ``q_scale_type`` + - This flag affects the inverse quantization process. + * - __u8 + - ``intra_vlc_format`` + - This flag affects the decoding of transform coefficient data. + * - __u8 + - ``alternate_scan`` + - This flag affects the decoding of transform coefficient data. + * - __u8 + - ``repeat_first_field`` + - This flag affects the decoding process of progressive frames. + * - __u8 + - ``progressive_frame`` + - Indicates whether the current frame is progressive. + +``V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION (struct)`` + Specifies quantization matrices (as extracted from the bitstream) for the + associated MPEG-2 slice data. + +.. c:type:: v4l2_ctrl_mpeg2_quantization + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_ctrl_mpeg2_quantization + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``load_intra_quantiser_matrix`` + - One bit to indicate whether to load the ``intra_quantiser_matrix`` data. + * - __u8 + - ``load_non_intra_quantiser_matrix`` + - One bit to indicate whether to load the ``non_intra_quantiser_matrix`` + data. + * - __u8 + - ``load_chroma_intra_quantiser_matrix`` + - One bit to indicate whether to load the + ``chroma_intra_quantiser_matrix`` data, only relevant for non-4:2:0 YUV + formats. + * - __u8 + - ``load_chroma_non_intra_quantiser_matrix`` + - One bit to indicate whether to load the + ``chroma_non_intra_quantiser_matrix`` data, only relevant for non-4:2:0 + YUV formats. + * - __u8 + - ``intra_quantiser_matrix[64]`` + - The quantization matrix coefficients for intra-coded frames, in zigzag + scanning order. It is relevant for both luma and chroma components, + although it can be superseded by the chroma-specific matrix for + non-4:2:0 YUV formats. + * - __u8 + - ``non_intra_quantiser_matrix[64]`` + - The quantization matrix coefficients for non-intra-coded frames, in + zigzag scanning order. It is relevant for both luma and chroma + components, although it can be superseded by the chroma-specific matrix + for non-4:2:0 YUV formats. + * - __u8 + - ``chroma_intra_quantiser_matrix[64]`` + - The quantization matrix coefficients for the chominance component of + intra-coded frames, in zigzag scanning order. Only relevant for + non-4:2:0 YUV formats. + * - __u8 + - ``chroma_non_intra_quantiser_matrix[64]`` + - The quantization matrix coefficients for the chrominance component of + non-intra-coded frames, in zigzag scanning order. Only relevant for + non-4:2:0 YUV formats. MFC 5.1 MPEG Controls --------------------- diff --git a/Documentation/media/uapi/v4l/func-poll.rst b/Documentation/media/uapi/v4l/func-poll.rst index 360bc6523ae2..967fe8920729 100644 --- a/Documentation/media/uapi/v4l/func-poll.rst +++ b/Documentation/media/uapi/v4l/func-poll.rst @@ -113,4 +113,5 @@ EINTR The call was interrupted by a signal. EINVAL - The ``nfds`` argument is greater than ``OPEN_MAX``. + The ``nfds`` value exceeds the ``RLIMIT_NOFILE`` value. Use + ``getrlimit()`` to obtain this value. diff --git a/Documentation/media/uapi/v4l/meta-formats.rst b/Documentation/media/uapi/v4l/meta-formats.rst index 0c4e1ecf5879..cf971d5ad9ea 100644 --- a/Documentation/media/uapi/v4l/meta-formats.rst +++ b/Documentation/media/uapi/v4l/meta-formats.rst @@ -12,6 +12,7 @@ These formats are used for the :ref:`metadata` interface only. .. toctree:: :maxdepth: 1 + pixfmt-meta-d4xx pixfmt-meta-uvc pixfmt-meta-vsp1-hgo pixfmt-meta-vsp1-hgt diff --git a/Documentation/media/uapi/v4l/pixfmt-compressed.rst b/Documentation/media/uapi/v4l/pixfmt-compressed.rst index d382e7a5c38e..ba0f6c49d9bf 100644 --- a/Documentation/media/uapi/v4l/pixfmt-compressed.rst +++ b/Documentation/media/uapi/v4l/pixfmt-compressed.rst @@ -60,6 +60,22 @@ Compressed Formats - ``V4L2_PIX_FMT_MPEG2`` - 'MPG2' - MPEG2 video elementary stream. + * .. _V4L2-PIX-FMT-MPEG2-SLICE: + + - ``V4L2_PIX_FMT_MPEG2_SLICE`` + - 'MG2S' + - MPEG-2 parsed slice data, as extracted from the MPEG-2 bitstream. + This format is adapted for stateless video decoders that implement a + MPEG-2 pipeline (using the :ref:`codec` and :ref:`media-request-api`). + Metadata associated with the frame to decode is required to be passed + through the ``V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS`` control and + quantization matrices can optionally be specified through the + ``V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION`` control. + See the :ref:`associated Codec Control IDs `. + Exactly one output and one capture buffer must be provided for use with + this pixel format. The output buffer must contain the appropriate number + of macroblocks to decode a full corresponding frame to the matching + capture buffer. * .. _V4L2-PIX-FMT-MPEG4: - ``V4L2_PIX_FMT_MPEG4`` @@ -101,4 +117,4 @@ Compressed Formats - 'FWHT' - Video elementary stream using a codec based on the Fast Walsh Hadamard Transform. This codec is implemented by the vicodec ('Virtual Codec') - driver. See the vicodec-codec.h header for more details. + driver. See the codec-fwht.h header for more details. diff --git a/Documentation/media/uapi/v4l/pixfmt-meta-d4xx.rst b/Documentation/media/uapi/v4l/pixfmt-meta-d4xx.rst new file mode 100644 index 000000000000..63bf1a2c9116 --- /dev/null +++ b/Documentation/media/uapi/v4l/pixfmt-meta-d4xx.rst @@ -0,0 +1,210 @@ +.. -*- coding: utf-8; mode: rst -*- + +.. _v4l2-meta-fmt-d4xx: + +******************************* +V4L2_META_FMT_D4XX ('D4XX') +******************************* + +Intel D4xx UVC Cameras Metadata + + +Description +=========== + +Intel D4xx (D435 and other) cameras include per-frame metadata in their UVC +payload headers, following the Microsoft(R) UVC extension proposal [1_]. That +means, that the private D4XX metadata, following the standard UVC header, is +organised in blocks. D4XX cameras implement several standard block types, +proposed by Microsoft, and several proprietary ones. Supported standard metadata +types are MetadataId_CaptureStats (ID 3), MetadataId_CameraExtrinsics (ID 4), +and MetadataId_CameraIntrinsics (ID 5). For their description see [1_]. This +document describes proprietary metadata types, used by D4xx cameras. + +V4L2_META_FMT_D4XX buffers follow the metadata buffer layout of +V4L2_META_FMT_UVC with the only difference, that it also includes proprietary +payload header data. D4xx cameras use bulk transfers and only send one payload +per frame, therefore their headers cannot be larger than 255 bytes. + +Below are proprietary Microsoft style metadata types, used by D4xx cameras, +where all fields are in little endian order: + +.. flat-table:: D4xx metadata + :widths: 1 4 + :header-rows: 1 + :stub-columns: 0 + + * - Field + - Description + * - :cspan:`1` *Depth Control* + * - __u32 ID + - 0x80000000 + * - __u32 Size + - Size in bytes (currently 56) + * - __u32 Version + - Version of this structure. The documentation herein corresponds to + version xxx. The version number will be incremented when new fields are + added. + * - __u32 Flags + - A bitmask of flags: see [2_] below + * - __u32 Gain + - Gain value in internal units, same as the V4L2_CID_GAIN control, used to + capture the frame + * - __u32 Exposure + - Exposure time (in microseconds) used to capture the frame + * - __u32 Laser power + - Power of the laser LED 0-360, used for depth measurement + * - __u32 AE mode + - 0: manual; 1: automatic exposure + * - __u32 Exposure priority + - Exposure priority value: 0 - constant frame rate + * - __u32 AE ROI left + - Left border of the AE Region of Interest (all ROI values are in pixels + and lie between 0 and maximum width or height respectively) + * - __u32 AE ROI right + - Right border of the AE Region of Interest + * - __u32 AE ROI top + - Top border of the AE Region of Interest + * - __u32 AE ROI bottom + - Bottom border of the AE Region of Interest + * - __u32 Preset + - Preset selector value, default: 0, unless changed by the user + * - __u32 Laser mode + - 0: off, 1: on + * - :cspan:`1` *Capture Timing* + * - __u32 ID + - 0x80000001 + * - __u32 Size + - Size in bytes (currently 40) + * - __u32 Version + - Version of this structure. The documentation herein corresponds to + version xxx. The version number will be incremented when new fields are + added. + * - __u32 Flags + - A bitmask of flags: see [3_] below + * - __u32 Frame counter + - Monotonically increasing counter + * - __u32 Optical time + - Time in microseconds from the beginning of a frame till its middle + * - __u32 Readout time + - Time, used to read out a frame in microseconds + * - __u32 Exposure time + - Frame exposure time in microseconds + * - __u32 Frame interval + - In microseconds = 1000000 / framerate + * - __u32 Pipe latency + - Time in microseconds from start of frame to data in USB buffer + * - :cspan:`1` *Configuration* + * - __u32 ID + - 0x80000002 + * - __u32 Size + - Size in bytes (currently 40) + * - __u32 Version + - Version of this structure. The documentation herein corresponds to + version xxx. The version number will be incremented when new fields are + added. + * - __u32 Flags + - A bitmask of flags: see [4_] below + * - __u8 Hardware type + - Camera hardware version [5_] + * - __u8 SKU ID + - Camera hardware configuration [6_] + * - __u32 Cookie + - Internal synchronisation + * - __u16 Format + - Image format code [7_] + * - __u16 Width + - Width in pixels + * - __u16 Height + - Height in pixels + * - __u16 Framerate + - Requested frame rate per second + * - __u16 Trigger + - Byte 0: bit 0: depth and RGB are synchronised, bit 1: external trigger + +.. _1: + +[1] https://docs.microsoft.com/en-us/windows-hardware/drivers/stream/uvc-extensions-1-5 + +.. _2: + +[2] Depth Control flags specify which fields are valid: :: + + 0x00000001 Gain + 0x00000002 Exposure + 0x00000004 Laser power + 0x00000008 AE mode + 0x00000010 Exposure priority + 0x00000020 AE ROI + 0x00000040 Preset + +.. _3: + +[3] Capture Timing flags specify which fields are valid: :: + + 0x00000001 Frame counter + 0x00000002 Optical time + 0x00000004 Readout time + 0x00000008 Exposure time + 0x00000010 Frame interval + 0x00000020 Pipe latency + +.. _4: + +[4] Configuration flags specify which fields are valid: :: + + 0x00000001 Hardware type + 0x00000002 SKU ID + 0x00000004 Cookie + 0x00000008 Format + 0x00000010 Width + 0x00000020 Height + 0x00000040 Framerate + 0x00000080 Trigger + 0x00000100 Cal count + +.. _5: + +[5] Camera model: :: + + 0 DS5 + 1 IVCAM2 + +.. _6: + +[6] 8-bit camera hardware configuration bitfield: :: + + [1:0] depthCamera + 00: no depth + 01: standard depth + 10: wide depth + 11: reserved + [2] depthIsActive - has a laser projector + [3] RGB presence + [4] Inertial Measurement Unit (IMU) presence + [5] projectorType + 0: HPTG + 1: Princeton + [6] 0: a projector, 1: an LED + [7] reserved + +.. _7: + +[7] Image format codes per video streaming interface: + +Depth: :: + + 1 Z16 + 2 Z + +Left sensor: :: + + 1 Y8 + 2 UYVY + 3 R8L8 + 4 Calibration + 5 W10 + +Fish Eye sensor: :: + + 1 RAW8 diff --git a/Documentation/media/uapi/v4l/pixfmt-reserved.rst b/Documentation/media/uapi/v4l/pixfmt-reserved.rst index 38af1472a4b4..0c399858bda2 100644 --- a/Documentation/media/uapi/v4l/pixfmt-reserved.rst +++ b/Documentation/media/uapi/v4l/pixfmt-reserved.rst @@ -243,7 +243,20 @@ please make a proposal on the linux-media mailing list. It is an opaque intermediate format and the MDP hardware must be used to convert ``V4L2_PIX_FMT_MT21C`` to ``V4L2_PIX_FMT_NV12M``, ``V4L2_PIX_FMT_YUV420M`` or ``V4L2_PIX_FMT_YVU420``. - + * .. _V4L2-PIX-FMT-SUNXI-TILED-NV12: + + - ``V4L2_PIX_FMT_SUNXI_TILED_NV12`` + - 'ST12' + - Two-planar NV12-based format used by the video engine found on Allwinner + (codenamed sunxi) platforms, with 32x32 tiles for the luminance plane + and 32x64 tiles for the chrominance plane. The data in each tile is + stored in linear order, within the tile bounds. Each tile follows the + previous one linearly in memory (from left to right, top to bottom). + + The associated buffer dimensions are aligned to match an integer number + of tiles, resulting in 32-aligned resolutions for the luminance plane + and 16-aligned resolutions for the chrominance plane (with 2x2 + subsampling). .. tabularcolumns:: |p{6.6cm}|p{2.2cm}|p{8.7cm}| diff --git a/Documentation/media/uapi/v4l/vidioc-create-bufs.rst b/Documentation/media/uapi/v4l/vidioc-create-bufs.rst index a39e18d69511..eadf6f757fbf 100644 --- a/Documentation/media/uapi/v4l/vidioc-create-bufs.rst +++ b/Documentation/media/uapi/v4l/vidioc-create-bufs.rst @@ -102,7 +102,19 @@ than the number requested. - ``format`` - Filled in by the application, preserved by the driver. * - __u32 - - ``reserved``\ [8] + - ``capabilities`` + - Set by the driver. If 0, then the driver doesn't support + capabilities. In that case all you know is that the driver is + guaranteed to support ``V4L2_MEMORY_MMAP`` and *might* support + other :c:type:`v4l2_memory` types. It will not support any others + capabilities. See :ref:`here ` for a list of the + capabilities. + + If you want to just query the capabilities without making any + other changes, then set ``count`` to 0, ``memory`` to + ``V4L2_MEMORY_MMAP`` and ``format.type`` to the buffer type. + * - __u32 + - ``reserved``\ [7] - A place holder for future extensions. Drivers and applications must set the array to zero. diff --git a/Documentation/media/uapi/v4l/vidioc-cropcap.rst b/Documentation/media/uapi/v4l/vidioc-cropcap.rst index a65dbec6b20b..0a7b8287fd38 100644 --- a/Documentation/media/uapi/v4l/vidioc-cropcap.rst +++ b/Documentation/media/uapi/v4l/vidioc-cropcap.rst @@ -58,7 +58,7 @@ overlay devices. - Type of the data stream, set by the application. Only these types are valid here: ``V4L2_BUF_TYPE_VIDEO_CAPTURE``, ``V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE``, ``V4L2_BUF_TYPE_VIDEO_OUTPUT``, ``V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE`` and - ``V4L2_BUF_TYPE_VIDEO_OVERLAY``. See :c:type:`v4l2_buf_type` and the note above. + ``V4L2_BUF_TYPE_VIDEO_OVERLAY``. See :c:type:`v4l2_buf_type` and the note below. * - struct :ref:`v4l2_rect ` - ``bounds`` - Defines the window within capturing or output is possible, this diff --git a/Documentation/media/uapi/v4l/vidioc-dqevent.rst b/Documentation/media/uapi/v4l/vidioc-dqevent.rst index cb3565f36793..04416b6943c0 100644 --- a/Documentation/media/uapi/v4l/vidioc-dqevent.rst +++ b/Documentation/media/uapi/v4l/vidioc-dqevent.rst @@ -379,7 +379,17 @@ call. - 0x0001 - This event gets triggered when a resolution change is detected at an input. This can come from an input connector or from a video - decoder. + decoder. Applications will have to query the new resolution (if + any, the signal may also have been lost). + + *Important*: even if the new video timings appear identical to the old + ones, receiving this event indicates that there was an issue with the + video signal and you must stop and restart streaming + (:ref:`VIDIOC_STREAMOFF ` + followed by :ref:`VIDIOC_STREAMON `). The reason is + that many devices are not able to recover from a temporary loss of + signal and so restarting streaming I/O is required in order for the + hardware to synchronize to the video signal. Return Value diff --git a/Documentation/media/uapi/v4l/vidioc-g-crop.rst b/Documentation/media/uapi/v4l/vidioc-g-crop.rst index a6ed43ba9ca3..b95ba6743cbd 100644 --- a/Documentation/media/uapi/v4l/vidioc-g-crop.rst +++ b/Documentation/media/uapi/v4l/vidioc-g-crop.rst @@ -84,7 +84,7 @@ When cropping is not supported then no parameters are changed and - Type of the data stream, set by the application. Only these types are valid here: ``V4L2_BUF_TYPE_VIDEO_CAPTURE``, ``V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE``, ``V4L2_BUF_TYPE_VIDEO_OUTPUT``, ``V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE`` and - ``V4L2_BUF_TYPE_VIDEO_OVERLAY``. See :c:type:`v4l2_buf_type` and the note above. + ``V4L2_BUF_TYPE_VIDEO_OVERLAY``. See :c:type:`v4l2_buf_type` and the note below. * - struct :c:type:`v4l2_rect` - ``c`` - Cropping rectangle. The same co-ordinate system as for struct diff --git a/Documentation/media/uapi/v4l/vidioc-g-dv-timings.rst b/Documentation/media/uapi/v4l/vidioc-g-dv-timings.rst index 1a034e825161..35cba2c8d459 100644 --- a/Documentation/media/uapi/v4l/vidioc-g-dv-timings.rst +++ b/Documentation/media/uapi/v4l/vidioc-g-dv-timings.rst @@ -257,14 +257,19 @@ EBUSY will also be cleared. This is a read-only flag, applications must not set this. * - ``V4L2_DV_FL_REDUCED_FPS`` - - CEA-861 specific: only valid for video transmitters, the flag is - cleared by receivers. It is also only valid for formats with the - ``V4L2_DV_FL_CAN_REDUCE_FPS`` flag set, for other formats the - flag will be cleared by the driver. If the application sets this - flag, then the pixelclock used to set up the transmitter is - divided by 1.001 to make it compatible with NTSC framerates. If - the transmitter can't generate such frequencies, then the flag - will also be cleared. + - CEA-861 specific: only valid for video transmitters or video + receivers that have the ``V4L2_DV_FL_CAN_DETECT_REDUCED_FPS`` + set. This flag is cleared otherwise. It is also only valid for + formats with the ``V4L2_DV_FL_CAN_REDUCE_FPS`` flag set, for other + formats the flag will be cleared by the driver. + + If the application sets this flag for a transmitter, then the + pixelclock used to set up the transmitter is divided by 1.001 to + make it compatible with NTSC framerates. If the transmitter can't + generate such frequencies, then the flag will be cleared. + + If a video receiver detects that the format uses a reduced framerate, + then it will set this flag to signal this to the application. * - ``V4L2_DV_FL_HALF_LINE`` - Specific to interlaced formats: if set, then the vertical backporch of field 1 (aka the odd field) is really one half-line @@ -294,3 +299,9 @@ EBUSY - If set, then the hdmi_vic field is valid and contains the Video Identification Code as per the HDMI standard (HDMI Vendor Specific InfoFrame). + * - ``V4L2_DV_FL_CAN_DETECT_REDUCED_FPS`` + - CEA-861 specific: only valid for video receivers, the flag is + cleared by transmitters. + If set, then the hardware can detect the difference between + regular framerates and framerates reduced by 1000/1001. E.g.: + 60 vs 59.94 Hz, 30 vs 29.97 Hz or 24 vs 23.976 Hz. diff --git a/Documentation/media/uapi/v4l/vidioc-g-ext-ctrls.rst b/Documentation/media/uapi/v4l/vidioc-g-ext-ctrls.rst index 2011c2b2ee67..d9930fe776cf 100644 --- a/Documentation/media/uapi/v4l/vidioc-g-ext-ctrls.rst +++ b/Documentation/media/uapi/v4l/vidioc-g-ext-ctrls.rst @@ -95,6 +95,25 @@ appropriate. In the first case the new value is set in struct is inappropriate (e.g. the given menu index is not supported by the menu control), then this will also result in an ``EINVAL`` error code error. +If ``request_fd`` is set to a not-yet-queued :ref:`request ` +file descriptor and ``which`` is set to ``V4L2_CTRL_WHICH_REQUEST_VAL``, +then the controls are not applied immediately when calling +:ref:`VIDIOC_S_EXT_CTRLS `, but instead are applied by +the driver for the buffer associated with the same request. +If the device does not support requests, then ``EACCES`` will be returned. +If requests are supported but an invalid request file descriptor is given, +then ``EINVAL`` will be returned. + +An attempt to call :ref:`VIDIOC_S_EXT_CTRLS ` for a +request that has already been queued will result in an ``EBUSY`` error. + +If ``request_fd`` is specified and ``which`` is set to +``V4L2_CTRL_WHICH_REQUEST_VAL`` during a call to +:ref:`VIDIOC_G_EXT_CTRLS `, then it will return the +values of the controls at the time of request completion. +If the request is not yet completed, then this will result in an +``EACCES`` error. + The driver will only set/get these controls if all control values are correct. This prevents the situation where only some of the controls were set/get. Only low-level errors (e. g. a failed i2c command) can @@ -209,13 +228,17 @@ still cause this situation. - ``which`` - Which value of the control to get/set/try. ``V4L2_CTRL_WHICH_CUR_VAL`` will return the current value of the - control and ``V4L2_CTRL_WHICH_DEF_VAL`` will return the default - value of the control. + control, ``V4L2_CTRL_WHICH_DEF_VAL`` will return the default + value of the control and ``V4L2_CTRL_WHICH_REQUEST_VAL`` indicates that + these controls have to be retrieved from a request or tried/set for + a request. In the latter case the ``request_fd`` field contains the + file descriptor of the request that should be used. If the device + does not support requests, then ``EACCES`` will be returned. .. note:: - You can only get the default value of the control, - you cannot set or try it. + When using ``V4L2_CTRL_WHICH_DEF_VAL`` be aware that you can only + get the default value of the control, you cannot set or try it. For backwards compatibility you can also use a control class here (see :ref:`ctrl-class`). In that case all controls have to @@ -272,8 +295,15 @@ still cause this situation. then you can call :ref:`VIDIOC_TRY_EXT_CTRLS ` to try to discover the actual control that failed the validation step. Unfortunately, there is no ``TRY`` equivalent for :ref:`VIDIOC_G_EXT_CTRLS `. + * - __s32 + - ``request_fd`` + - File descriptor of the request to be used by this operation. Only + valid if ``which`` is set to ``V4L2_CTRL_WHICH_REQUEST_VAL``. + If the device does not support requests, then ``EACCES`` will be returned. + If requests are supported but an invalid request file descriptor is + given, then ``EINVAL`` will be returned. * - __u32 - - ``reserved``\ [2] + - ``reserved``\ [1] - Reserved for future extensions. Drivers and applications must set the array to zero. @@ -347,11 +377,14 @@ appropriately. The generic error codes are described at the EINVAL The struct :c:type:`v4l2_ext_control` ``id`` is - invalid, the struct :c:type:`v4l2_ext_controls` + invalid, or the struct :c:type:`v4l2_ext_controls` ``which`` is invalid, or the struct :c:type:`v4l2_ext_control` ``value`` was inappropriate (e.g. the given menu index is not supported by the - driver). This error code is also returned by the + driver), or the ``which`` field was set to ``V4L2_CTRL_WHICH_REQUEST_VAL`` + but the given ``request_fd`` was invalid or ``V4L2_CTRL_WHICH_REQUEST_VAL`` + is not supported by the kernel. + This error code is also returned by the :ref:`VIDIOC_S_EXT_CTRLS ` and :ref:`VIDIOC_TRY_EXT_CTRLS ` ioctls if two or more control values are in conflict. @@ -362,7 +395,9 @@ ERANGE EBUSY The control is temporarily not changeable, possibly because another applications took over control of the device function this control - belongs to. + belongs to, or (if the ``which`` field was set to + ``V4L2_CTRL_WHICH_REQUEST_VAL``) the request was queued but not yet + completed. ENOSPC The space reserved for the control's payload is insufficient. The @@ -370,5 +405,9 @@ ENOSPC and this error code is returned. EACCES - Attempt to try or set a read-only control or to get a write-only - control. + Attempt to try or set a read-only control, or to get a write-only + control, or to get a control from a request that has not yet been + completed. + + Or the ``which`` field was set to ``V4L2_CTRL_WHICH_REQUEST_VAL`` but the + device does not support requests. diff --git a/Documentation/media/uapi/v4l/vidioc-qbuf.rst b/Documentation/media/uapi/v4l/vidioc-qbuf.rst index 9e448a4aa3aa..753b3b5946b1 100644 --- a/Documentation/media/uapi/v4l/vidioc-qbuf.rst +++ b/Documentation/media/uapi/v4l/vidioc-qbuf.rst @@ -65,7 +65,7 @@ To enqueue a :ref:`memory mapped ` buffer applications set the with a pointer to this structure the driver sets the ``V4L2_BUF_FLAG_MAPPED`` and ``V4L2_BUF_FLAG_QUEUED`` flags and clears the ``V4L2_BUF_FLAG_DONE`` flag in the ``flags`` field, or it returns an -EINVAL error code. +``EINVAL`` error code. To enqueue a :ref:`user pointer ` buffer applications set the ``memory`` field to ``V4L2_MEMORY_USERPTR``, the ``m.userptr`` field to @@ -98,6 +98,28 @@ dequeued, until the :ref:`VIDIOC_STREAMOFF ` or :ref:`VIDIOC_REQBUFS` ioctl is called, or until the device is closed. +The ``request_fd`` field can be used with the ``VIDIOC_QBUF`` ioctl to specify +the file descriptor of a :ref:`request `, if requests are +in use. Setting it means that the buffer will not be passed to the driver +until the request itself is queued. Also, the driver will apply any +settings associated with the request for this buffer. This field will +be ignored unless the ``V4L2_BUF_FLAG_REQUEST_FD`` flag is set. +If the device does not support requests, then ``EACCES`` will be returned. +If requests are supported but an invalid request file descriptor is given, +then ``EINVAL`` will be returned. + +.. caution:: + It is not allowed to mix queuing requests with queuing buffers directly. + ``EBUSY`` will be returned if the first buffer was queued directly and + then the application tries to queue a request, or vice versa. After + closing the file descriptor, calling + :ref:`VIDIOC_STREAMOFF ` or calling :ref:`VIDIOC_REQBUFS` + the check for this will be reset. + + For :ref:`memory-to-memory devices ` you can specify the + ``request_fd`` only for output buffers, not for capture buffers. Attempting + to specify this for a capture buffer will result in an ``EACCES`` error. + Applications call the ``VIDIOC_DQBUF`` ioctl to dequeue a filled (capturing) or displayed (output) buffer from the driver's outgoing queue. They just set the ``type``, ``memory`` and ``reserved`` fields of @@ -133,7 +155,9 @@ EAGAIN EINVAL The buffer ``type`` is not supported, or the ``index`` is out of bounds, or no buffers have been allocated yet, or the ``userptr`` or - ``length`` are invalid. + ``length`` are invalid, or the ``V4L2_BUF_FLAG_REQUEST_FD`` flag was + set but the the given ``request_fd`` was invalid, or ``m.fd`` was + an invalid DMABUF file descriptor. EIO ``VIDIOC_DQBUF`` failed due to an internal error. Can also indicate @@ -153,3 +177,12 @@ EPIPE ``VIDIOC_DQBUF`` returns this on an empty capture queue for mem2mem codecs if a buffer with the ``V4L2_BUF_FLAG_LAST`` was already dequeued and no new buffers are expected to become available. + +EACCES + The ``V4L2_BUF_FLAG_REQUEST_FD`` flag was set but the device does not + support requests for the given buffer type. + +EBUSY + The first buffer was queued via a request, but the application now tries + to queue it directly, or vice versa (it is not permitted to mix the two + APIs). diff --git a/Documentation/media/uapi/v4l/vidioc-queryctrl.rst b/Documentation/media/uapi/v4l/vidioc-queryctrl.rst index 5bd26e8c9a1a..258f5813f281 100644 --- a/Documentation/media/uapi/v4l/vidioc-queryctrl.rst +++ b/Documentation/media/uapi/v4l/vidioc-queryctrl.rst @@ -424,8 +424,18 @@ See also the examples in :ref:`control`. - any - An unsigned 32-bit valued control ranging from minimum to maximum inclusive. The step value indicates the increment between values. - - + * - ``V4L2_CTRL_TYPE_MPEG2_SLICE_PARAMS`` + - n/a + - n/a + - n/a + - A struct :c:type:`v4l2_ctrl_mpeg2_slice_params`, containing MPEG-2 + slice parameters for stateless video decoders. + * - ``V4L2_CTRL_TYPE_MPEG2_QUANTIZATION`` + - n/a + - n/a + - n/a + - A struct :c:type:`v4l2_ctrl_mpeg2_quantization`, containing MPEG-2 + quantization matrices for stateless video decoders. .. tabularcolumns:: |p{6.6cm}|p{2.2cm}|p{8.7cm}| diff --git a/Documentation/media/uapi/v4l/vidioc-reqbufs.rst b/Documentation/media/uapi/v4l/vidioc-reqbufs.rst index 316f52c8a310..d4bbbb0c60e8 100644 --- a/Documentation/media/uapi/v4l/vidioc-reqbufs.rst +++ b/Documentation/media/uapi/v4l/vidioc-reqbufs.rst @@ -88,10 +88,50 @@ any DMA in progress, an implicit ``V4L2_MEMORY_DMABUF`` or ``V4L2_MEMORY_USERPTR``. See :c:type:`v4l2_memory`. * - __u32 - - ``reserved``\ [2] + - ``capabilities`` + - Set by the driver. If 0, then the driver doesn't support + capabilities. In that case all you know is that the driver is + guaranteed to support ``V4L2_MEMORY_MMAP`` and *might* support + other :c:type:`v4l2_memory` types. It will not support any others + capabilities. + + If you want to query the capabilities with a minimum of side-effects, + then this can be called with ``count`` set to 0, ``memory`` set to + ``V4L2_MEMORY_MMAP`` and ``type`` set to the buffer type. This will + free any previously allocated buffers, so this is typically something + that will be done at the start of the application. + * - __u32 + - ``reserved``\ [1] - A place holder for future extensions. Drivers and applications must set the array to zero. +.. tabularcolumns:: |p{6.1cm}|p{2.2cm}|p{8.7cm}| + +.. _v4l2-buf-capabilities: +.. _V4L2-BUF-CAP-SUPPORTS-MMAP: +.. _V4L2-BUF-CAP-SUPPORTS-USERPTR: +.. _V4L2-BUF-CAP-SUPPORTS-DMABUF: +.. _V4L2-BUF-CAP-SUPPORTS-REQUESTS: + +.. cssclass:: longtable + +.. flat-table:: V4L2 Buffer Capabilities Flags + :header-rows: 0 + :stub-columns: 0 + :widths: 3 1 4 + + * - ``V4L2_BUF_CAP_SUPPORTS_MMAP`` + - 0x00000001 + - This buffer type supports the ``V4L2_MEMORY_MMAP`` streaming mode. + * - ``V4L2_BUF_CAP_SUPPORTS_USERPTR`` + - 0x00000002 + - This buffer type supports the ``V4L2_MEMORY_USERPTR`` streaming mode. + * - ``V4L2_BUF_CAP_SUPPORTS_DMABUF`` + - 0x00000004 + - This buffer type supports the ``V4L2_MEMORY_DMABUF`` streaming mode. + * - ``V4L2_BUF_CAP_SUPPORTS_REQUESTS`` + - 0x00000008 + - This buffer type supports :ref:`requests `. Return Value ============ diff --git a/Documentation/media/videodev2.h.rst.exceptions b/Documentation/media/videodev2.h.rst.exceptions index ca9f0edc579e..1ec425a7c364 100644 --- a/Documentation/media/videodev2.h.rst.exceptions +++ b/Documentation/media/videodev2.h.rst.exceptions @@ -56,7 +56,8 @@ replace symbol V4L2_MEMORY_USERPTR :c:type:`v4l2_memory` # Documented enum v4l2_colorspace replace symbol V4L2_COLORSPACE_470_SYSTEM_BG :c:type:`v4l2_colorspace` replace symbol V4L2_COLORSPACE_470_SYSTEM_M :c:type:`v4l2_colorspace` -replace symbol V4L2_COLORSPACE_ADOBERGB :c:type:`v4l2_colorspace` +replace symbol V4L2_COLORSPACE_OPRGB :c:type:`v4l2_colorspace` +replace define V4L2_COLORSPACE_ADOBERGB :c:type:`v4l2_colorspace` replace symbol V4L2_COLORSPACE_BT2020 :c:type:`v4l2_colorspace` replace symbol V4L2_COLORSPACE_DCI_P3 :c:type:`v4l2_colorspace` replace symbol V4L2_COLORSPACE_DEFAULT :c:type:`v4l2_colorspace` @@ -69,7 +70,8 @@ replace symbol V4L2_COLORSPACE_SRGB :c:type:`v4l2_colorspace` # Documented enum v4l2_xfer_func replace symbol V4L2_XFER_FUNC_709 :c:type:`v4l2_xfer_func` -replace symbol V4L2_XFER_FUNC_ADOBERGB :c:type:`v4l2_xfer_func` +replace symbol V4L2_XFER_FUNC_OPRGB :c:type:`v4l2_xfer_func` +replace define V4L2_XFER_FUNC_ADOBERGB :c:type:`v4l2_xfer_func` replace symbol V4L2_XFER_FUNC_DCI_P3 :c:type:`v4l2_xfer_func` replace symbol V4L2_XFER_FUNC_DEFAULT :c:type:`v4l2_xfer_func` replace symbol V4L2_XFER_FUNC_NONE :c:type:`v4l2_xfer_func` @@ -129,6 +131,8 @@ replace symbol V4L2_CTRL_TYPE_STRING :c:type:`v4l2_ctrl_type` replace symbol V4L2_CTRL_TYPE_U16 :c:type:`v4l2_ctrl_type` replace symbol V4L2_CTRL_TYPE_U32 :c:type:`v4l2_ctrl_type` replace symbol V4L2_CTRL_TYPE_U8 :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_MPEG2_SLICE_PARAMS :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_MPEG2_QUANTIZATION :c:type:`v4l2_ctrl_type` # V4L2 capability defines replace define V4L2_CAP_VIDEO_CAPTURE device-capabilities @@ -278,6 +282,7 @@ replace define V4L2_DV_BT_STD_SDI dv-bt-standards replace define V4L2_DV_FL_REDUCED_BLANKING dv-bt-standards replace define V4L2_DV_FL_CAN_REDUCE_FPS dv-bt-standards +replace define V4L2_DV_FL_CAN_DETECT_REDUCED_FPS dv-bt-standards replace define V4L2_DV_FL_REDUCED_FPS dv-bt-standards replace define V4L2_DV_FL_HALF_LINE dv-bt-standards replace define V4L2_DV_FL_IS_CE_VIDEO dv-bt-standards @@ -514,6 +519,7 @@ ignore define V4L2_CTRL_DRIVER_PRIV ignore define V4L2_CTRL_MAX_DIMS ignore define V4L2_CTRL_WHICH_CUR_VAL ignore define V4L2_CTRL_WHICH_DEF_VAL +ignore define V4L2_CTRL_WHICH_REQUEST_VAL ignore define V4L2_OUT_CAP_CUSTOM_TIMINGS ignore define V4L2_CID_MAX_CTRLS diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 0d8d7ef131e9..c1d913944ad8 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -471,8 +471,7 @@ And a couple of implicit varieties: operations after the ACQUIRE operation will appear to happen after the ACQUIRE operation with respect to the other components of the system. ACQUIRE operations include LOCK operations and both smp_load_acquire() - and smp_cond_acquire() operations. The later builds the necessary ACQUIRE - semantics from relying on a control dependency and smp_rmb(). + and smp_cond_load_acquire() operations. Memory operations that occur before an ACQUIRE operation may appear to happen after it completes. diff --git a/Documentation/mips/00-INDEX b/Documentation/mips/00-INDEX deleted file mode 100644 index 8ae9cffc2262..000000000000 --- a/Documentation/mips/00-INDEX +++ /dev/null @@ -1,4 +0,0 @@ -00-INDEX - - this file. -AU1xxx_IDE.README - - README for MIPS AU1XXX IDE driver. diff --git a/Documentation/mmc/00-INDEX b/Documentation/mmc/00-INDEX deleted file mode 100644 index 4623bc0aa0bb..000000000000 --- a/Documentation/mmc/00-INDEX +++ /dev/null @@ -1,10 +0,0 @@ -00-INDEX - - this file -mmc-dev-attrs.txt - - info on SD and MMC device attributes -mmc-dev-parts.txt - - info on SD and MMC device partitions -mmc-async-req.txt - - info on mmc asynchronous requests -mmc-tools.txt - - info on mmc-utils tools diff --git a/Documentation/mtd/nand/pxa3xx-nand.txt b/Documentation/mtd/nand/pxa3xx-nand.txt deleted file mode 100644 index 1074cbc67ec6..000000000000 --- a/Documentation/mtd/nand/pxa3xx-nand.txt +++ /dev/null @@ -1,113 +0,0 @@ - -About this document -=================== - -Some notes about Marvell's NAND controller available in PXA and Armada 370/XP -SoC (aka NFCv1 and NFCv2), with an emphasis on the latter. - -NFCv2 controller background -=========================== - -The controller has a 2176 bytes FIFO buffer. Therefore, in order to support -larger pages, I/O operations on 4 KiB and 8 KiB pages is done with a set of -chunked transfers. - -For instance, if we choose a 2048 data chunk and set "BCH" ECC (see below) -we'll have this layout in the pages: - - ------------------------------------------------------------------------------ - | 2048B data | 32B spare | 30B ECC || 2048B data | 32B spare | 30B ECC | ... | - ------------------------------------------------------------------------------ - -The driver reads the data and spare portions independently and builds an internal -buffer with this layout (in the 4 KiB page case): - - ------------------------------------------ - | 4096B data | 64B spare | - ------------------------------------------ - -Also, for the READOOB command the driver disables the ECC and reads a 'spare + ECC' -OOB, one per chunk read. - - ------------------------------------------------------------------- - | 4096B data | 32B spare | 30B ECC | 32B spare | 30B ECC | - ------------------------------------------------------------------- - -So, in order to achieve reading (for instance), we issue several READ0 commands -(with some additional controller-specific magic) and read two chunks of 2080B -(2048 data + 32 spare) each. -The driver accommodates this data to expose the NAND core a contiguous buffer -(4096 data + spare) or (4096 + spare + ECC + spare + ECC). - -ECC -=== - -The controller has built-in hardware ECC capabilities. In addition it is -configurable between two modes: 1) Hamming, 2) BCH. - -Note that the actual BCH mode: BCH-4 or BCH-8 will depend on the way -the controller is configured to transfer the data. - -In the BCH mode the ECC code will be calculated for each transferred chunk -and expected to be located (when reading/programming) right after the spare -bytes as the figure above shows. - -So, repeating the above scheme, a 2048B data chunk will be followed by 32B -spare, and then the ECC controller will read/write the ECC code (30B in -this case): - - ------------------------------------ - | 2048B data | 32B spare | 30B ECC | - ------------------------------------ - -If the ECC mode is 'BCH' then the ECC is *always* 30 bytes long. -If the ECC mode is 'Hamming' the ECC is 6 bytes long, for each 512B block. -So in Hamming mode, a 2048B page will have a 24B ECC. - -Despite all of the above, the controller requires the driver to only read or -write in multiples of 8-bytes, because the data buffer is 64-bits. - -OOB -=== - -Because of the above scheme, and because the "spare" OOB is really located in -the middle of a page, spare OOB cannot be read or write independently of the -data area. In other words, in order to read the OOB (aka READOOB), the entire -page (aka READ0) has to be read. - -In the same sense, in order to write to the spare OOB the driver has to write -an *entire* page. - -Factory bad blocks handling -=========================== - -Given the ECC BCH requires to layout the device's pages in a split -data/OOB/data/OOB way, the controller has a view of the flash page that's -different from the specified (aka the manufacturer's) view. In other words, - -Factory view: - - ----------------------------------------------- - | Data |x OOB | - ----------------------------------------------- - -Driver's view: - - ----------------------------------------------- - | Data | OOB | Data x | OOB | - ----------------------------------------------- - -It can be seen from the above, that the factory bad block marker must be -searched within the 'data' region, and not in the usual OOB region. - -In addition, this means under regular usage the driver will write such -position (since it belongs to the data region) and every used block is -likely to be marked as bad. - -For this reason, marking the block as bad in the OOB is explicitly -disabled by using the NAND_BBT_NO_OOB_BBM option in the driver. The rationale -for this is that there's no point in marking a block as bad, because good -blocks are also 'marked as bad' (in the OOB BBM sense) under normal usage. - -Instead, the driver relies on the bad block table alone, and should only perform -the bad block scan on the very first time (when the device hasn't been used). diff --git a/Documentation/netlabel/00-INDEX b/Documentation/netlabel/00-INDEX deleted file mode 100644 index 837bf35990e2..000000000000 --- a/Documentation/netlabel/00-INDEX +++ /dev/null @@ -1,10 +0,0 @@ -00-INDEX - - this file. -cipso_ipv4.txt - - documentation on the IPv4 CIPSO protocol engine. -draft-ietf-cipso-ipsecurity-01.txt - - IETF draft of the CIPSO protocol, dated 16 July 1992. -introduction.txt - - NetLabel introduction, READ THIS FIRST. -lsm_interface.txt - - documentation on the NetLabel kernel security module API. diff --git a/Documentation/netlabel/cipso_ipv4.txt b/Documentation/netlabel/cipso_ipv4.txt index 93dacb132c3c..a6075481fd60 100644 --- a/Documentation/netlabel/cipso_ipv4.txt +++ b/Documentation/netlabel/cipso_ipv4.txt @@ -6,11 +6,12 @@ May 17, 2006 * Overview -The NetLabel CIPSO/IPv4 protocol engine is based on the IETF Commercial IP -Security Option (CIPSO) draft from July 16, 1992. A copy of this draft can be -found in this directory, consult '00-INDEX' for the filename. While the IETF -draft never made it to an RFC standard it has become a de-facto standard for -labeled networking and is used in many trusted operating systems. +The NetLabel CIPSO/IPv4 protocol engine is based on the IETF Commercial +IP Security Option (CIPSO) draft from July 16, 1992. A copy of this +draft can be found in this directory +(draft-ietf-cipso-ipsecurity-01.txt). While the IETF draft never made +it to an RFC standard it has become a de-facto standard for labeled +networking and is used in many trusted operating systems. * Outbound Packet Processing diff --git a/Documentation/netlabel/introduction.txt b/Documentation/netlabel/introduction.txt index 5ecd8d1dcf4e..3caf77bcff0f 100644 --- a/Documentation/netlabel/introduction.txt +++ b/Documentation/netlabel/introduction.txt @@ -22,7 +22,7 @@ refrain from calling the protocol engines directly, instead they should use the NetLabel kernel security module API described below. Detailed information about each NetLabel protocol engine can be found in this -directory, consult '00-INDEX' for filenames. +directory. * Communication Layer diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX deleted file mode 100644 index 02a323c43261..000000000000 --- a/Documentation/networking/00-INDEX +++ /dev/null @@ -1,234 +0,0 @@ -00-INDEX - - this file -3c509.txt - - information on the 3Com Etherlink III Series Ethernet cards. -6pack.txt - - info on the 6pack protocol, an alternative to KISS for AX.25 -LICENSE.qla3xxx - - GPLv2 for QLogic Linux Networking HBA Driver -LICENSE.qlge - - GPLv2 for QLogic Linux qlge NIC Driver -LICENSE.qlcnic - - GPLv2 for QLogic Linux qlcnic NIC Driver -PLIP.txt - - PLIP: The Parallel Line Internet Protocol device driver -README.ipw2100 - - README for the Intel PRO/Wireless 2100 driver. -README.ipw2200 - - README for the Intel PRO/Wireless 2915ABG and 2200BG driver. -README.sb1000 - - info on General Instrument/NextLevel SURFboard1000 cable modem. -altera_tse.txt - - Altera Triple-Speed Ethernet controller. -arcnet-hardware.txt - - tons of info on ARCnet, hubs, jumper settings for ARCnet cards, etc. -arcnet.txt - - info on the using the ARCnet driver itself. -atm.txt - - info on where to get ATM programs and support for Linux. -ax25.txt - - info on using AX.25 and NET/ROM code for Linux -baycom.txt - - info on the driver for Baycom style amateur radio modems -bonding.txt - - Linux Ethernet Bonding Driver HOWTO: link aggregation in Linux. -bridge.txt - - where to get user space programs for ethernet bridging with Linux. -cdc_mbim.txt - - 3G/LTE USB modem (Mobile Broadband Interface Model) -checksum-offloads.txt - - Explanation of checksum offloads; LCO, RCO -cops.txt - - info on the COPS LocalTalk Linux driver -cs89x0.txt - - the Crystal LAN (CS8900/20-based) Ethernet ISA adapter driver -cxacru.txt - - Conexant AccessRunner USB ADSL Modem -cxacru-cf.py - - Conexant AccessRunner USB ADSL Modem configuration file parser -cxgb.txt - - Release Notes for the Chelsio N210 Linux device driver. -dccp.txt - - the Datagram Congestion Control Protocol (DCCP) (RFC 4340..42). -dctcp.txt - - DataCenter TCP congestion control -de4x5.txt - - the Digital EtherWORKS DE4?? and DE5?? PCI Ethernet driver -decnet.txt - - info on using the DECnet networking layer in Linux. -dl2k.txt - - README for D-Link DL2000-based Gigabit Ethernet Adapters (dl2k.ko). -dm9000.txt - - README for the Simtec DM9000 Network driver. -dmfe.txt - - info on the Davicom DM9102(A)/DM9132/DM9801 fast ethernet driver. -dns_resolver.txt - - The DNS resolver module allows kernel servies to make DNS queries. -driver.txt - - Softnet driver issues. -ena.txt - - info on Amazon's Elastic Network Adapter (ENA) -e100.txt - - info on Intel's EtherExpress PRO/100 line of 10/100 boards -e1000.txt - - info on Intel's E1000 line of gigabit ethernet boards -e1000e.txt - - README for the Intel Gigabit Ethernet Driver (e1000e). -eql.txt - - serial IP load balancing -fib_trie.txt - - Level Compressed Trie (LC-trie) notes: a structure for routing. -filter.txt - - Linux Socket Filtering -fore200e.txt - - FORE Systems PCA-200E/SBA-200E ATM NIC driver info. -framerelay.txt - - info on using Frame Relay/Data Link Connection Identifier (DLCI). -gen_stats.txt - - Generic networking statistics for netlink users. -generic-hdlc.txt - - The generic High Level Data Link Control (HDLC) layer. -generic_netlink.txt - - info on Generic Netlink -gianfar.txt - - Gianfar Ethernet Driver. -i40e.txt - - README for the Intel Ethernet Controller XL710 Driver (i40e). -i40evf.txt - - Short note on the Driver for the Intel(R) XL710 X710 Virtual Function -ieee802154.txt - - Linux IEEE 802.15.4 implementation, API and drivers -igb.txt - - README for the Intel Gigabit Ethernet Driver (igb). -igbvf.txt - - README for the Intel Gigabit Ethernet Driver (igbvf). -ip-sysctl.txt - - /proc/sys/net/ipv4/* variables -ip_dynaddr.txt - - IP dynamic address hack e.g. for auto-dialup links -ipddp.txt - - AppleTalk-IP Decapsulation and AppleTalk-IP Encapsulation -iphase.txt - - Interphase PCI ATM (i)Chip IA Linux driver info. -ipsec.txt - - Note on not compressing IPSec payload and resulting failed policy check. -ipv6.txt - - Options to the ipv6 kernel module. -ipvs-sysctl.txt - - Per-inode explanation of the /proc/sys/net/ipv4/vs interface. -irda.txt - - where to get IrDA (infrared) utilities and info for Linux. -ixgb.txt - - README for the Intel 10 Gigabit Ethernet Driver (ixgb). -ixgbe.txt - - README for the Intel 10 Gigabit Ethernet Driver (ixgbe). -ixgbevf.txt - - README for the Intel Virtual Function (VF) Driver (ixgbevf). -l2tp.txt - - User guide to the L2TP tunnel protocol. -lapb-module.txt - - programming information of the LAPB module. -ltpc.txt - - the Apple or Farallon LocalTalk PC card driver -mac80211-auth-assoc-deauth.txt - - authentication and association / deauth-disassoc with max80211 -mac80211-injection.txt - - HOWTO use packet injection with mac80211 -multiqueue.txt - - HOWTO for multiqueue network device support. -netconsole.txt - - The network console module netconsole.ko: configuration and notes. -netdev-features.txt - - Network interface features API description. -netdevices.txt - - info on network device driver functions exported to the kernel. -netif-msg.txt - - Design of the network interface message level setting (NETIF_MSG_*). -netlink_mmap.txt - - memory mapped I/O with netlink -nf_conntrack-sysctl.txt - - list of netfilter-sysctl knobs. -nfc.txt - - The Linux Near Field Communication (NFS) subsystem. -openvswitch.txt - - Open vSwitch developer documentation. -operstates.txt - - Overview of network interface operational states. -packet_mmap.txt - - User guide to memory mapped packet socket rings (PACKET_[RT]X_RING). -phonet.txt - - The Phonet packet protocol used in Nokia cellular modems. -phy.txt - - The PHY abstraction layer. -pktgen.txt - - User guide to the kernel packet generator (pktgen.ko). -policy-routing.txt - - IP policy-based routing -ppp_generic.txt - - Information about the generic PPP driver. -proc_net_tcp.txt - - Per inode overview of the /proc/net/tcp and /proc/net/tcp6 interfaces. -radiotap-headers.txt - - Background on radiotap headers. -ray_cs.txt - - Raylink Wireless LAN card driver info. -rds.txt - - Background on the reliable, ordered datagram delivery method RDS. -regulatory.txt - - Overview of the Linux wireless regulatory infrastructure. -rxrpc.txt - - Guide to the RxRPC protocol. -s2io.txt - - Release notes for Neterion Xframe I/II 10GbE driver. -scaling.txt - - Explanation of network scaling techniques: RSS, RPS, RFS, aRFS, XPS. -sctp.txt - - Notes on the Linux kernel implementation of the SCTP protocol. -secid.txt - - Explanation of the secid member in flow structures. -skfp.txt - - SysKonnect FDDI (SK-5xxx, Compaq Netelligent) driver info. -smc9.txt - - the driver for SMC's 9000 series of Ethernet cards -spider_net.txt - - README for the Spidernet Driver (as found in PS3 / Cell BE). -stmmac.txt - - README for the STMicro Synopsys Ethernet driver. -tc-actions-env-rules.txt - - rules for traffic control (tc) actions. -timestamping.txt - - overview of network packet timestamping variants. -tcp.txt - - short blurb on how TCP output takes place. -tcp-thin.txt - - kernel tuning options for low rate 'thin' TCP streams. -team.txt - - pointer to information for ethernet teaming devices. -tlan.txt - - ThunderLAN (Compaq Netelligent 10/100, Olicom OC-2xxx) driver info. -tproxy.txt - - Transparent proxy support user guide. -tuntap.txt - - TUN/TAP device driver, allowing user space Rx/Tx of packets. -udplite.txt - - UDP-Lite protocol (RFC 3828) introduction. -vortex.txt - - info on using 3Com Vortex (3c590, 3c592, 3c595, 3c597) Ethernet cards. -vxge.txt - - README for the Neterion X3100 PCIe Server Adapter. -vxlan.txt - - Virtual extensible LAN overview -x25.txt - - general info on X.25 development. -x25-iface.txt - - description of the X.25 Packet Layer to LAPB device interface. -xfrm_device.txt - - description of XFRM offload API -xfrm_proc.txt - - description of the statistics package for XFRM. -xfrm_sync.txt - - sync patches for XFRM enable migration of an SA between hosts. -xfrm_sysctl.txt - - description of the XFRM configuration options. -z8530drv.txt - - info about Linux driver for Z8530 based HDLC cards for AX.25 diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst index ff929cfab4f4..4ae4f9d8f8fe 100644 --- a/Documentation/networking/af_xdp.rst +++ b/Documentation/networking/af_xdp.rst @@ -159,8 +159,8 @@ log2(2048) LSB of the addr will be masked off, meaning that 2048, 2050 and 3000 refers to the same chunk. -UMEM Completetion Ring -~~~~~~~~~~~~~~~~~~~~~~ +UMEM Completion Ring +~~~~~~~~~~~~~~~~~~~~ The Completion Ring is used transfer ownership of UMEM frames from kernel-space to user-space. Just like the Fill ring, UMEM indicies are diff --git a/Documentation/networking/defza.txt b/Documentation/networking/defza.txt new file mode 100644 index 000000000000..663e4a906751 --- /dev/null +++ b/Documentation/networking/defza.txt @@ -0,0 +1,57 @@ +Notes on the DEC FDDIcontroller 700 (DEFZA-xx) driver v.1.1.4. + + +DEC FDDIcontroller 700 is DEC's first-generation TURBOchannel FDDI +network card, designed in 1990 specifically for the DECstation 5000 +model 200 workstation. The board is a single attachment station and +it was manufactured in two variations, both of which are supported. + +First is the SAS MMF DEFZA-AA option, the original design implementing +the standard MMF-PMD, however with a pair of ST connectors rather than +the usual MIC connector. The other one is the SAS ThinWire/STP DEFZA-CA +option, denoted 700-C, with the network medium selectable by a switch +between the DEC proprietary ThinWire-PMD using a BNC connector and the +standard STP-PMD using a DE-9F connector. This option can interface to +a DECconcentrator 500 device and, in the case of the STP-PMD, also other +FDDI equipment and was designed to make it easier to transition from +existing IEEE 802.3 10BASE2 Ethernet and IEEE 802.5 Token Ring networks +by providing means to reuse existing cabling. + +This driver handles any number of cards installed in a single system. +They get fddi0, fddi1, etc. interface names assigned in the order of +increasing TURBOchannel slot numbers. + +The board only supports DMA on the receive side. Transmission involves +the use of PIO. As a result under a heavy transmission load there will +be a significant impact on system performance. + +The board supports a 64-entry CAM for matching destination addresses. +Two entries are preoccupied by the Directed Beacon and Ring Purger +multicast addresses and the rest is used as a multicast filter. An +all-multi mode is also supported for LLC frames and it is used if +requested explicitly or if the CAM overflows. The promiscuous mode +supports separate enables for LLC and SMT frames, but this driver +doesn't support changing them individually. + + +Known problems: + +None. + + +To do: + +5. MAC address change. The card does not support changing the Media + Access Controller's address registers but a similar effect can be + achieved by adding an alias to the CAM. There is no way to disable + matching against the original address though. + +7. Queueing incoming/outgoing SMT frames in the driver if the SMT + receive/RMC transmit ring is full. (?) + +8. Retrieving/reporting FDDI/SNMP stats. + + +Both success and failure reports are welcome. + +Maciej W. Rozycki diff --git a/Documentation/networking/devlink-params-bnxt.txt b/Documentation/networking/devlink-params-bnxt.txt new file mode 100644 index 000000000000..481aa303d5b4 --- /dev/null +++ b/Documentation/networking/devlink-params-bnxt.txt @@ -0,0 +1,18 @@ +enable_sriov [DEVICE, GENERIC] + Configuration mode: Permanent + +ignore_ari [DEVICE, GENERIC] + Configuration mode: Permanent + +msix_vec_per_pf_max [DEVICE, GENERIC] + Configuration mode: Permanent + +msix_vec_per_pf_min [DEVICE, GENERIC] + Configuration mode: Permanent + +gre_ver_check [DEVICE, DRIVER-SPECIFIC] + Generic Routing Encapsulation (GRE) version check will + be enabled in the device. If disabled, device skips + version checking for incoming packets. + Type: Boolean + Configuration mode: Permanent diff --git a/Documentation/networking/devlink-params.txt b/Documentation/networking/devlink-params.txt new file mode 100644 index 000000000000..ae444ffe73ac --- /dev/null +++ b/Documentation/networking/devlink-params.txt @@ -0,0 +1,42 @@ +Devlink configuration parameters +================================ +Following is the list of configuration parameters via devlink interface. +Each parameter can be generic or driver specific and are device level +parameters. + +Note that the driver-specific files should contain the generic params +they support to, with supported config modes. + +Each parameter can be set in different configuration modes: + runtime - set while driver is running, no reset required. + driverinit - applied while driver initializes, requires restart + driver by devlink reload command. + permanent - written to device's non-volatile memory, hard reset + required. + +Following is the list of parameters: +==================================== +enable_sriov [DEVICE, GENERIC] + Enable Single Root I/O Virtualisation (SRIOV) in + the device. + Type: Boolean + +ignore_ari [DEVICE, GENERIC] + Ignore Alternative Routing-ID Interpretation (ARI) + capability. If enabled, adapter will ignore ARI + capability even when platforms has the support + enabled and creates same number of partitions when + platform does not support ARI. + Type: Boolean + +msix_vec_per_pf_max [DEVICE, GENERIC] + Provides the maximum number of MSIX interrupts that + a device can create. Value is same across all + physical functions (PFs) in the device. + Type: u32 + +msix_vec_per_pf_min [DEVICE, GENERIC] + Provides the minimum number of MSIX interrupts required + for the device initialization. Value is same across all + physical functions (PFs) in the device. + Type: u32 diff --git a/drivers/staging/fsl-dpaa2/ethernet/ethernet-driver.rst b/Documentation/networking/dpaa2/ethernet-driver.rst similarity index 100% rename from drivers/staging/fsl-dpaa2/ethernet/ethernet-driver.rst rename to Documentation/networking/dpaa2/ethernet-driver.rst diff --git a/Documentation/networking/dpaa2/index.rst b/Documentation/networking/dpaa2/index.rst index 10bea113a7bc..67bd87fe6c53 100644 --- a/Documentation/networking/dpaa2/index.rst +++ b/Documentation/networking/dpaa2/index.rst @@ -7,3 +7,4 @@ DPAA2 Documentation overview dpio-driver + ethernet-driver diff --git a/Documentation/networking/e100.rst b/Documentation/networking/e100.rst index f81111eba9c5..5e2839b4ec92 100644 --- a/Documentation/networking/e100.rst +++ b/Documentation/networking/e100.rst @@ -1,4 +1,5 @@ -============================================================== +.. SPDX-License-Identifier: GPL-2.0+ + Linux* Base Driver for the Intel(R) PRO/100 Family of Adapters ============================================================== diff --git a/Documentation/networking/e1000.rst b/Documentation/networking/e1000.rst index f10dd4086921..6379d4d20771 100644 --- a/Documentation/networking/e1000.rst +++ b/Documentation/networking/e1000.rst @@ -1,4 +1,5 @@ -=========================================================== +.. SPDX-License-Identifier: GPL-2.0+ + Linux* Base Driver for Intel(R) Ethernet Network Connection =========================================================== diff --git a/Documentation/networking/e1000e.rst b/Documentation/networking/e1000e.rst new file mode 100644 index 000000000000..33554e5416c5 --- /dev/null +++ b/Documentation/networking/e1000e.rst @@ -0,0 +1,382 @@ +.. SPDX-License-Identifier: GPL-2.0+ + +Linux* Driver for Intel(R) Ethernet Network Connection +====================================================== + +Intel Gigabit Linux driver. +Copyright(c) 2008-2018 Intel Corporation. + +Contents +======== + +- Identifying Your Adapter +- Command Line Parameters +- Additional Configurations +- Support + + +Identifying Your Adapter +======================== +For information on how to identify your adapter, and for the latest Intel +network drivers, refer to the Intel Support website: +https://www.intel.com/support + + +Command Line Parameters +======================= +If the driver is built as a module, the following optional parameters are used +by entering them on the command line with the modprobe command using this +syntax:: + + modprobe e1000e [

    , @[+|-],\n" +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + "\t $stack, $stack, $retval, $comm, $arg\n" +#else "\t $stack, $stack, $retval, $comm\n" - "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n" - "\t b@/\n" +#endif + "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" + "\t b@/,\n" + "\t \\[\\]\n" #endif " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 69a3fe926e8c..76217bbef815 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -290,7 +290,8 @@ void perf_kprobe_destroy(struct perf_event *p_event) #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS -int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) +int perf_uprobe_init(struct perf_event *p_event, + unsigned long ref_ctr_offset, bool is_retprobe) { int ret; char *path = NULL; @@ -312,8 +313,8 @@ int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) goto out; } - tp_event = create_local_trace_uprobe( - path, p_event->attr.probe_offset, is_retprobe); + tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset, + ref_ctr_offset, is_retprobe); if (IS_ERR(tp_event)) { ret = PTR_ERR(tp_event); goto out; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 85f6b01431c7..eb908ef2ecec 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -738,16 +738,30 @@ static void free_synth_field(struct synth_field *field) kfree(field); } -static struct synth_field *parse_synth_field(char *field_type, - char *field_name) +static struct synth_field *parse_synth_field(int argc, char **argv, + int *consumed) { struct synth_field *field; + const char *prefix = NULL; + char *field_type = argv[0], *field_name; int len, ret = 0; char *array; if (field_type[0] == ';') field_type++; + if (!strcmp(field_type, "unsigned")) { + if (argc < 3) + return ERR_PTR(-EINVAL); + prefix = "unsigned "; + field_type = argv[1]; + field_name = argv[2]; + *consumed = 3; + } else { + field_name = argv[1]; + *consumed = 2; + } + len = strlen(field_name); if (field_name[len - 1] == ';') field_name[len - 1] = '\0'; @@ -760,11 +774,15 @@ static struct synth_field *parse_synth_field(char *field_type, array = strchr(field_name, '['); if (array) len += strlen(array); + if (prefix) + len += strlen(prefix); field->type = kzalloc(len, GFP_KERNEL); if (!field->type) { ret = -ENOMEM; goto free; } + if (prefix) + strcat(field->type, prefix); strcat(field->type, field_type); if (array) { strcat(field->type, array); @@ -1009,7 +1027,7 @@ static int create_synth_event(int argc, char **argv) struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; struct synth_event *event = NULL; bool delete_event = false; - int i, n_fields = 0, ret = 0; + int i, consumed = 0, n_fields = 0, ret = 0; char *name; mutex_lock(&synth_event_mutex); @@ -1045,8 +1063,10 @@ static int create_synth_event(int argc, char **argv) event = NULL; ret = -EEXIST; goto out; - } else if (delete_event) + } else if (delete_event) { + ret = -ENOENT; goto out; + } if (argc < 2) { ret = -EINVAL; @@ -1061,16 +1081,16 @@ static int create_synth_event(int argc, char **argv) goto err; } - field = parse_synth_field(argv[i], argv[i + 1]); + field = parse_synth_field(argc - i, &argv[i], &consumed); if (IS_ERR(field)) { ret = PTR_ERR(field); goto err; } - fields[n_fields] = field; - i++; n_fields++; + fields[n_fields++] = field; + i += consumed - 1; } - if (i < argc) { + if (i < argc && strcmp(argv[i], ";") != 0) { ret = -EINVAL; goto err; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c30032367aab..fec67188c4d2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -14,6 +14,7 @@ #include "trace_kprobe_selftest.h" #include "trace_probe.h" +#include "trace_probe_tmpl.h" #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 @@ -61,9 +62,23 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, return strncmp(mod->name, name, len) == 0 && name[len] == ':'; } -static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk) +static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) { - return !!strchr(trace_kprobe_symbol(tk), ':'); + char *p; + bool ret; + + if (!tk->symbol) + return false; + p = strchr(tk->symbol, ':'); + if (!p) + return true; + *p = '\0'; + mutex_lock(&module_mutex); + ret = !!find_module(tk->symbol); + mutex_unlock(&module_mutex); + *p = ':'; + + return ret; } static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) @@ -120,184 +135,6 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); -/* Memory fetching by symbol */ -struct symbol_cache { - char *symbol; - long offset; - unsigned long addr; -}; - -unsigned long update_symbol_cache(struct symbol_cache *sc) -{ - sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); - - if (sc->addr) - sc->addr += sc->offset; - - return sc->addr; -} - -void free_symbol_cache(struct symbol_cache *sc) -{ - kfree(sc->symbol); - kfree(sc); -} - -struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) -{ - struct symbol_cache *sc; - - if (!sym || strlen(sym) == 0) - return NULL; - - sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); - if (!sc) - return NULL; - - sc->symbol = kstrdup(sym, GFP_KERNEL); - if (!sc->symbol) { - kfree(sc); - return NULL; - } - sc->offset = offset; - update_symbol_cache(sc); - - return sc; -} - -/* - * Kprobes-specific fetch functions - */ -#define DEFINE_FETCH_stack(type) \ -static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ - (unsigned int)((unsigned long)offset)); \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type)); - -DEFINE_BASIC_FETCH_FUNCS(stack) -/* No string on the stack entry */ -#define fetch_stack_string NULL -#define fetch_stack_string_size NULL - -#define DEFINE_FETCH_memory(type) \ -static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ - void *addr, void *dest) \ -{ \ - type retval; \ - if (probe_kernel_address(addr, retval)) \ - *(type *)dest = 0; \ - else \ - *(type *)dest = retval; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type)); - -DEFINE_BASIC_FETCH_FUNCS(memory) -/* - * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max - * length and relative data location. - */ -static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) -{ - int maxlen = get_rloc_len(*(u32 *)dest); - u8 *dst = get_rloc_data(dest); - long ret; - - if (!maxlen) - return; - - /* - * Try to get string again, since the string can be changed while - * probing. - */ - ret = strncpy_from_unsafe(dst, addr, maxlen); - - if (ret < 0) { /* Failed to fetch string */ - dst[0] = '\0'; - *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); - } else { - *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); - } -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); - -/* Return the length of string -- including null terminal byte */ -static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) -{ - mm_segment_t old_fs; - int ret, len = 0; - u8 c; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - pagefault_disable(); - - do { - ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - - pagefault_enable(); - set_fs(old_fs); - - if (ret < 0) /* Failed to check the length */ - *(u32 *)dest = 0; - else - *(u32 *)dest = len; -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size)); - -#define DEFINE_FETCH_symbol(type) \ -void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\ -{ \ - struct symbol_cache *sc = data; \ - if (sc->addr) \ - fetch_memory_##type(regs, (void *)sc->addr, dest); \ - else \ - *(type *)dest = 0; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type)); - -DEFINE_BASIC_FETCH_FUNCS(symbol) -DEFINE_FETCH_symbol(string) -DEFINE_FETCH_symbol(string_size) - -/* kprobes don't support file_offset fetch methods */ -#define fetch_file_offset_u8 NULL -#define fetch_file_offset_u16 NULL -#define fetch_file_offset_u32 NULL -#define fetch_file_offset_u64 NULL -#define fetch_file_offset_string NULL -#define fetch_file_offset_string_size NULL - -/* Fetch type information table */ -static const struct fetch_type kprobes_fetch_type_table[] = { - /* Special types */ - [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, - sizeof(u32), 1, "__data_loc char[]"), - [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, - string_size, sizeof(u32), 0, "u32"), - /* Basic types */ - ASSIGN_FETCH_TYPE(u8, u8, 0), - ASSIGN_FETCH_TYPE(u16, u16, 0), - ASSIGN_FETCH_TYPE(u32, u32, 0), - ASSIGN_FETCH_TYPE(u64, u64, 0), - ASSIGN_FETCH_TYPE(s8, u8, 1), - ASSIGN_FETCH_TYPE(s16, u16, 1), - ASSIGN_FETCH_TYPE(s32, u32, 1), - ASSIGN_FETCH_TYPE(s64, u64, 1), - ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), - ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), - ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), - ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), - - ASSIGN_FETCH_TYPE_END -}; - /* * Allocate new trace_probe and initialize it (including kprobes). */ @@ -540,8 +377,11 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) return -EINVAL; } - for (i = 0; i < tk->tp.nr_args; i++) - traceprobe_update_arg(&tk->tp.args[i]); + for (i = 0; i < tk->tp.nr_args; i++) { + ret = traceprobe_update_arg(&tk->tp.args[i]); + if (ret) + return ret; + } /* Set/clear disabled flag according to tp->flag */ if (trace_probe_is_enabled(&tk->tp)) @@ -554,19 +394,13 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) else ret = register_kprobe(&tk->rp.kp); - if (ret == 0) + if (ret == 0) { tk->tp.flags |= TP_FLAG_REGISTERED; - else { - if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { - pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); - ret = 0; - } else if (ret == -EILSEQ) { - pr_warn("Probing address(0x%p) is not an instruction boundary.\n", - tk->rp.kp.addr); - ret = -EINVAL; - } + } else if (ret == -EILSEQ) { + pr_warn("Probing address(0x%p) is not an instruction boundary.\n", + tk->rp.kp.addr); + ret = -EINVAL; } - return ret; } @@ -629,6 +463,11 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register k*probe */ ret = __register_trace_kprobe(tk); + if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) { + pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); + ret = 0; + } + if (ret < 0) unregister_kprobe_event(tk); else @@ -713,13 +552,15 @@ static int create_trace_kprobe(int argc, char **argv) long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; + unsigned int flags = TPARG_FL_KERNEL; /* argc must be >= 1 */ if (argv[0][0] == 'p') is_return = false; - else if (argv[0][0] == 'r') + else if (argv[0][0] == 'r') { is_return = true; - else if (argv[0][0] == '-') + flags |= TPARG_FL_RETURN; + } else if (argv[0][0] == '-') is_delete = true; else { pr_info("Probe definition must be started with 'p', 'r' or" @@ -749,10 +590,13 @@ static int create_trace_kprobe(int argc, char **argv) } if (event) { - if (strchr(event, '/')) { + char *slash; + + slash = strchr(event, '/'); + if (slash) { group = event; - event = strchr(group, '/') + 1; - event[-1] = '\0'; + event = slash + 1; + slash[0] = '\0'; if (strlen(group) == 0) { pr_info("Group name is not specified\n"); return -EINVAL; @@ -802,8 +646,9 @@ static int create_trace_kprobe(int argc, char **argv) pr_info("Failed to parse either an address or a symbol.\n"); return ret; } - if (offset && is_return && - !kprobe_on_func_entry(NULL, symbol, offset)) { + if (kprobe_on_func_entry(NULL, symbol, offset)) + flags |= TPARG_FL_FENTRY; + if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { pr_info("Given offset is not valid for return probe.\n"); return -EINVAL; } @@ -873,8 +718,7 @@ static int create_trace_kprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, - is_return, true, - kprobes_fetch_type_table); + flags); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; @@ -1028,6 +872,106 @@ static const struct file_operations kprobe_profile_ops = { .release = seq_release, }; +/* Kprobe specific fetch functions */ + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen(unsigned long addr) +{ + mm_segment_t old_fs; + int ret, len = 0; + u8 c; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + pagefault_disable(); + + do { + ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + pagefault_enable(); + set_fs(old_fs); + + return (ret < 0) ? ret : len; +} + +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. + */ +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + u8 *dst = get_loc_data(dest, base); + long ret; + + if (unlikely(!maxlen)) + return -ENOMEM; + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_unsafe(dst, (void *)addr, maxlen); + + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, (void *)dst - base); + return ret; +} + +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ + return probe_kernel_read(dest, src, size); +} + +/* Note that we don't verify it, since the code does not come from user space */ +static int +process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, + void *base) +{ + unsigned long val; + +retry: + /* 1st stage: get value from context */ + switch (code->op) { + case FETCH_OP_REG: + val = regs_get_register(regs, code->param); + break; + case FETCH_OP_STACK: + val = regs_get_kernel_stack_nth(regs, code->param); + break; + case FETCH_OP_STACKP: + val = kernel_stack_pointer(regs); + break; + case FETCH_OP_RETVAL: + val = regs_return_value(regs); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + case FETCH_OP_ARG: + val = regs_get_kernel_argument(regs, code->param); + break; +#endif + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + return -EILSEQ; + } + code++; + + return process_fetch_insn_bottom(code, val, dest, base); +} +NOKPROBE_SYMBOL(process_fetch_insn) + /* Kprobe handler */ static nokprobe_inline void __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, @@ -1059,7 +1003,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, entry = ring_buffer_event_data(event); entry->ip = (unsigned long)tk->rp.kp.addr; - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); event_trigger_unlock_commit_regs(trace_file, buffer, event, entry, irq_flags, pc, regs); @@ -1108,7 +1052,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry = ring_buffer_event_data(event); entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); event_trigger_unlock_commit_regs(trace_file, buffer, event, entry, irq_flags, pc, regs); @@ -1133,8 +1077,6 @@ print_kprobe_event(struct trace_iterator *iter, int flags, struct kprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_probe *tp; - u8 *data; - int i; field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); @@ -1146,11 +1088,9 @@ print_kprobe_event(struct trace_iterator *iter, int flags, trace_seq_putc(s, ')'); - data = (u8 *)&field[1]; - for (i = 0; i < tp->nr_args; i++) - if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset, field)) - goto out; + if (print_probe_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; trace_seq_putc(s, '\n'); out: @@ -1164,8 +1104,6 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, struct kretprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_probe *tp; - u8 *data; - int i; field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); @@ -1182,11 +1120,9 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, trace_seq_putc(s, ')'); - data = (u8 *)&field[1]; - for (i = 0; i < tp->nr_args; i++) - if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset, field)) - goto out; + if (print_probe_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; trace_seq_putc(s, '\n'); @@ -1197,49 +1133,25 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, static int kprobe_event_define_fields(struct trace_event_call *event_call) { - int ret, i; + int ret; struct kprobe_trace_entry_head field; struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); - /* Set argument names as fields */ - for (i = 0; i < tk->tp.nr_args; i++) { - struct probe_arg *parg = &tk->tp.args[i]; - ret = trace_define_field(event_call, parg->type->fmttype, - parg->name, - sizeof(field) + parg->offset, - parg->type->size, - parg->type->is_signed, - FILTER_OTHER); - if (ret) - return ret; - } - return 0; + return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); } static int kretprobe_event_define_fields(struct trace_event_call *event_call) { - int ret, i; + int ret; struct kretprobe_trace_entry_head field; struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); - /* Set argument names as fields */ - for (i = 0; i < tk->tp.nr_args; i++) { - struct probe_arg *parg = &tk->tp.args[i]; - ret = trace_define_field(event_call, parg->type->fmttype, - parg->name, - sizeof(field) + parg->offset, - parg->type->size, - parg->type->is_signed, - FILTER_OTHER); - if (ret) - return ret; - } - return 0; + return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); } #ifdef CONFIG_PERF_EVENTS @@ -1286,7 +1198,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); return 0; @@ -1322,7 +1234,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); } @@ -1457,7 +1369,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) init_trace_event_call(tk, call); - if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) + if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) return -ENOMEM; ret = register_trace_event(&call->event); if (!ret) { @@ -1514,7 +1426,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, init_trace_event_call(tk, &tk->tp.call); - if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { + if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { ret = -ENOMEM; goto error; } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index b0875b327f5c..c3fd849d4a8f 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -115,7 +115,7 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self, * section, then we need to read the link list pointers. The trick is * we pass the address of the string to the seq function just like * we do for the kernel core formats. To get back the structure that - * holds the format, we simply use containerof() and then go to the + * holds the format, we simply use container_of() and then go to the * next format in the list. */ static const char ** diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e99c3ce7aa65..bd30e9398d2a 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -26,14 +26,12 @@ const char *reserved_field_names[] = { /* Printing in basic type function template */ #define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt) \ -int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \ - void *data, void *ent) \ +int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, void *data, void *ent)\ { \ - trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ + trace_seq_printf(s, fmt, *(type *)data); \ return !trace_seq_has_overflowed(s); \ } \ -const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \ -NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname)); +const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "%u") DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u") @@ -48,193 +46,52 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x") DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x") DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx") +int PRINT_TYPE_FUNC_NAME(symbol)(struct trace_seq *s, void *data, void *ent) +{ + trace_seq_printf(s, "%pS", (void *)*(unsigned long *)data); + return !trace_seq_has_overflowed(s); +} +const char PRINT_TYPE_FMT_NAME(symbol)[] = "%pS"; + /* Print type function for string type */ -int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, - void *data, void *ent) +int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent) { int len = *(u32 *)data >> 16; if (!len) - trace_seq_printf(s, " %s=(fault)", name); + trace_seq_puts(s, "(fault)"); else - trace_seq_printf(s, " %s=\"%s\"", name, + trace_seq_printf(s, "\"%s\"", (const char *)get_loc_data(data, ent)); return !trace_seq_has_overflowed(s); } -NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; -#define CHECK_FETCH_FUNCS(method, fn) \ - (((FETCH_FUNC_NAME(method, u8) == fn) || \ - (FETCH_FUNC_NAME(method, u16) == fn) || \ - (FETCH_FUNC_NAME(method, u32) == fn) || \ - (FETCH_FUNC_NAME(method, u64) == fn) || \ - (FETCH_FUNC_NAME(method, string) == fn) || \ - (FETCH_FUNC_NAME(method, string_size) == fn)) \ - && (fn != NULL)) - -/* Data fetch function templates */ -#define DEFINE_FETCH_reg(type) \ -void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_register(regs, \ - (unsigned int)((unsigned long)offset)); \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type)); -DEFINE_BASIC_FETCH_FUNCS(reg) -/* No string on the register */ -#define fetch_reg_string NULL -#define fetch_reg_string_size NULL - -#define DEFINE_FETCH_retval(type) \ -void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ - void *dummy, void *dest) \ -{ \ - *(type *)dest = (type)regs_return_value(regs); \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type)); -DEFINE_BASIC_FETCH_FUNCS(retval) -/* No string on the retval */ -#define fetch_retval_string NULL -#define fetch_retval_string_size NULL - -/* Dereference memory access function */ -struct deref_fetch_param { - struct fetch_param orig; - long offset; - fetch_func_t fetch; - fetch_func_t fetch_size; +/* Fetch type information table */ +static const struct fetch_type probe_fetch_types[] = { + /* Special types */ + __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, + "__data_loc char[]"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), + ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), + ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), + ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), + ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), + ASSIGN_FETCH_TYPE_ALIAS(symbol, ADDR_FETCH_TYPE, ADDR_FETCH_TYPE, 0), + + ASSIGN_FETCH_TYPE_END }; -#define DEFINE_FETCH_deref(type) \ -void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ - void *data, void *dest) \ -{ \ - struct deref_fetch_param *dprm = data; \ - unsigned long addr; \ - call_fetch(&dprm->orig, regs, &addr); \ - if (addr) { \ - addr += dprm->offset; \ - dprm->fetch(regs, (void *)addr, dest); \ - } else \ - *(type *)dest = 0; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type)); -DEFINE_BASIC_FETCH_FUNCS(deref) -DEFINE_FETCH_deref(string) - -void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, - void *data, void *dest) -{ - struct deref_fetch_param *dprm = data; - unsigned long addr; - - call_fetch(&dprm->orig, regs, &addr); - if (addr && dprm->fetch_size) { - addr += dprm->offset; - dprm->fetch_size(regs, (void *)addr, dest); - } else - *(string_size *)dest = 0; -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size)); - -static void update_deref_fetch_param(struct deref_fetch_param *data) -{ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - update_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - update_symbol_cache(data->orig.data); -} -NOKPROBE_SYMBOL(update_deref_fetch_param); - -static void free_deref_fetch_param(struct deref_fetch_param *data) -{ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - free_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - free_symbol_cache(data->orig.data); - kfree(data); -} -NOKPROBE_SYMBOL(free_deref_fetch_param); - -/* Bitfield fetch function */ -struct bitfield_fetch_param { - struct fetch_param orig; - unsigned char hi_shift; - unsigned char low_shift; -}; - -#define DEFINE_FETCH_bitfield(type) \ -void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ - void *data, void *dest) \ -{ \ - struct bitfield_fetch_param *bprm = data; \ - type buf = 0; \ - call_fetch(&bprm->orig, regs, &buf); \ - if (buf) { \ - buf <<= bprm->hi_shift; \ - buf >>= bprm->low_shift; \ - } \ - *(type *)dest = buf; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type)); -DEFINE_BASIC_FETCH_FUNCS(bitfield) -#define fetch_bitfield_string NULL -#define fetch_bitfield_string_size NULL - -static void -update_bitfield_fetch_param(struct bitfield_fetch_param *data) -{ - /* - * Don't check the bitfield itself, because this must be the - * last fetch function. - */ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - update_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - update_symbol_cache(data->orig.data); -} - -static void -free_bitfield_fetch_param(struct bitfield_fetch_param *data) -{ - /* - * Don't check the bitfield itself, because this must be the - * last fetch function. - */ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - free_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - free_symbol_cache(data->orig.data); - - kfree(data); -} - -void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs, - void *data, void *dest) -{ - int maxlen = get_rloc_len(*(u32 *)dest); - u8 *dst = get_rloc_data(dest); - long ret; - - if (!maxlen) - return; - - ret = strlcpy(dst, current->comm, maxlen); - *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string)); - -void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs, - void *data, void *dest) -{ - *(u32 *)dest = strlen(current->comm) + 1; -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size)); - -static const struct fetch_type *find_fetch_type(const char *type, - const struct fetch_type *ftbl) +static const struct fetch_type *find_fetch_type(const char *type) { int i; @@ -255,58 +112,27 @@ static const struct fetch_type *find_fetch_type(const char *type, switch (bs) { case 8: - return find_fetch_type("u8", ftbl); + return find_fetch_type("u8"); case 16: - return find_fetch_type("u16", ftbl); + return find_fetch_type("u16"); case 32: - return find_fetch_type("u32", ftbl); + return find_fetch_type("u32"); case 64: - return find_fetch_type("u64", ftbl); + return find_fetch_type("u64"); default: goto fail; } } - for (i = 0; ftbl[i].name; i++) { - if (strcmp(type, ftbl[i].name) == 0) - return &ftbl[i]; + for (i = 0; probe_fetch_types[i].name; i++) { + if (strcmp(type, probe_fetch_types[i].name) == 0) + return &probe_fetch_types[i]; } fail: return NULL; } -/* Special function : only accept unsigned long */ -static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest) -{ - *(unsigned long *)dest = kernel_stack_pointer(regs); -} -NOKPROBE_SYMBOL(fetch_kernel_stack_address); - -static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest) -{ - *(unsigned long *)dest = user_stack_pointer(regs); -} -NOKPROBE_SYMBOL(fetch_user_stack_address); - -static fetch_func_t get_fetch_size_function(const struct fetch_type *type, - fetch_func_t orig_fn, - const struct fetch_type *ftbl) -{ - int i; - - if (type != &ftbl[FETCH_TYPE_STRING]) - return NULL; /* Only string type needs size function */ - - for (i = 0; i < FETCH_MTD_END; i++) - if (type->fetch[i] == orig_fn) - return ftbl[FETCH_TYPE_STRSIZE].fetch[i]; - - WARN_ON(1); /* This should not happen */ - - return NULL; -} - /* Split symbol and offset. */ int traceprobe_split_symbol_offset(char *symbol, long *offset) { @@ -331,41 +157,44 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_param *f, bool is_return, - bool is_kprobe) + struct fetch_insn *code, unsigned int flags) { int ret = 0; unsigned long param; if (strcmp(arg, "retval") == 0) { - if (is_return) - f->fn = t->fetch[FETCH_MTD_retval]; + if (flags & TPARG_FL_RETURN) + code->op = FETCH_OP_RETVAL; else ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { if (arg[5] == '\0') { - if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR)) - return -EINVAL; - - if (is_kprobe) - f->fn = fetch_kernel_stack_address; - else - f->fn = fetch_user_stack_address; + code->op = FETCH_OP_STACKP; } else if (isdigit(arg[5])) { ret = kstrtoul(arg + 5, 10, ¶m); - if (ret || (is_kprobe && param > PARAM_MAX_STACK)) + if (ret || ((flags & TPARG_FL_KERNEL) && + param > PARAM_MAX_STACK)) ret = -EINVAL; else { - f->fn = t->fetch[FETCH_MTD_stack]; - f->data = (void *)param; + code->op = FETCH_OP_STACK; + code->param = (unsigned int)param; } } else ret = -EINVAL; } else if (strcmp(arg, "comm") == 0) { - if (strcmp(t->name, "string") != 0 && - strcmp(t->name, "string_size") != 0) + code->op = FETCH_OP_COMM; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + } else if (((flags & TPARG_FL_MASK) == + (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && + strncmp(arg, "arg", 3) == 0) { + if (!isdigit(arg[3])) + return -EINVAL; + ret = kstrtoul(arg + 3, 10, ¶m); + if (ret || !param || param > PARAM_MAX_STACK) return -EINVAL; - f->fn = t->fetch[FETCH_MTD_comm]; + code->op = FETCH_OP_ARG; + code->param = (unsigned int)param - 1; +#endif } else ret = -EINVAL; @@ -373,25 +202,27 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } /* Recursive argument parser */ -static int parse_probe_arg(char *arg, const struct fetch_type *t, - struct fetch_param *f, bool is_return, bool is_kprobe, - const struct fetch_type *ftbl) +static int +parse_probe_arg(char *arg, const struct fetch_type *type, + struct fetch_insn **pcode, struct fetch_insn *end, + unsigned int flags) { + struct fetch_insn *code = *pcode; unsigned long param; - long offset; + long offset = 0; char *tmp; int ret = 0; switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); + ret = parse_probe_vars(arg + 1, type, code, flags); break; case '%': /* named register */ ret = regs_query_register_offset(arg + 1); if (ret >= 0) { - f->fn = t->fetch[FETCH_MTD_reg]; - f->data = (void *)(unsigned long)ret; + code->op = FETCH_OP_REG; + code->param = (unsigned int)ret; ret = 0; } break; @@ -401,33 +232,42 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, ret = kstrtoul(arg + 1, 0, ¶m); if (ret) break; - - f->fn = t->fetch[FETCH_MTD_memory]; - f->data = (void *)param; + /* load address */ + code->op = FETCH_OP_IMM; + code->immediate = param; } else if (arg[1] == '+') { /* kprobes don't support file offsets */ - if (is_kprobe) + if (flags & TPARG_FL_KERNEL) return -EINVAL; ret = kstrtol(arg + 2, 0, &offset); if (ret) break; - f->fn = t->fetch[FETCH_MTD_file_offset]; - f->data = (void *)offset; + code->op = FETCH_OP_FOFFS; + code->immediate = (unsigned long)offset; // imm64? } else { /* uprobes don't support symbols */ - if (!is_kprobe) + if (!(flags & TPARG_FL_KERNEL)) return -EINVAL; - ret = traceprobe_split_symbol_offset(arg + 1, &offset); - if (ret) - break; + /* Preserve symbol for updating */ + code->op = FETCH_NOP_SYMBOL; + code->data = kstrdup(arg + 1, GFP_KERNEL); + if (!code->data) + return -ENOMEM; + if (++code == end) + return -E2BIG; - f->data = alloc_symbol_cache(arg + 1, offset); - if (f->data) - f->fn = t->fetch[FETCH_MTD_symbol]; + code->op = FETCH_OP_IMM; + code->immediate = 0; } + /* These are fetching from memory */ + if (++code == end) + return -E2BIG; + *pcode = code; + code->op = FETCH_OP_DEREF; + code->offset = offset; break; case '+': /* deref memory */ @@ -435,11 +275,10 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, case '-': tmp = strchr(arg, '('); if (!tmp) - break; + return -EINVAL; *tmp = '\0'; ret = kstrtol(arg, 0, &offset); - if (ret) break; @@ -447,36 +286,27 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, tmp = strrchr(arg, ')'); if (tmp) { - struct deref_fetch_param *dprm; - const struct fetch_type *t2; + const struct fetch_type *t2 = find_fetch_type(NULL); - t2 = find_fetch_type(NULL, ftbl); *tmp = '\0'; - dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); - - if (!dprm) - return -ENOMEM; - - dprm->offset = offset; - dprm->fetch = t->fetch[FETCH_MTD_memory]; - dprm->fetch_size = get_fetch_size_function(t, - dprm->fetch, ftbl); - ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, - is_kprobe, ftbl); + ret = parse_probe_arg(arg, t2, &code, end, flags); if (ret) - kfree(dprm); - else { - f->fn = t->fetch[FETCH_MTD_deref]; - f->data = (void *)dprm; - } + break; + if (code->op == FETCH_OP_COMM) + return -EINVAL; + if (++code == end) + return -E2BIG; + *pcode = code; + + code->op = FETCH_OP_DEREF; + code->offset = offset; } break; } - if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ - pr_info("%s type has no corresponding fetch method.\n", t->name); + if (!ret && code->op == FETCH_OP_NOP) { + /* Parsed, but do not find fetch method */ ret = -EINVAL; } - return ret; } @@ -485,22 +315,15 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, /* Bitfield type needs to be parsed into a fetch function */ static int __parse_bitfield_probe_arg(const char *bf, const struct fetch_type *t, - struct fetch_param *f) + struct fetch_insn **pcode) { - struct bitfield_fetch_param *bprm; + struct fetch_insn *code = *pcode; unsigned long bw, bo; char *tail; if (*bf != 'b') return 0; - bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); - if (!bprm) - return -ENOMEM; - - bprm->orig = *f; - f->fn = t->fetch[FETCH_MTD_bitfield]; - f->data = (void *)bprm; bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ if (bw == 0 || *tail != '@') @@ -511,20 +334,26 @@ static int __parse_bitfield_probe_arg(const char *bf, if (tail == bf || *tail != '/') return -EINVAL; + code++; + if (code->op != FETCH_OP_NOP) + return -E2BIG; + *pcode = code; - bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); - bprm->low_shift = bprm->hi_shift + bo; + code->op = FETCH_OP_MOD_BF; + code->lshift = BYTES_TO_BITS(t->size) - (bw + bo); + code->rshift = BYTES_TO_BITS(t->size) - bw; + code->basesize = t->size; return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; } /* String length checking wrapper */ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe, - const struct fetch_type *ftbl) + struct probe_arg *parg, unsigned int flags) { - const char *t; - int ret; + struct fetch_insn *code, *scode, *tmp = NULL; + char *t, *t2; + int ret, len; if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); @@ -535,36 +364,128 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, pr_info("Failed to allocate memory for command '%s'.\n", arg); return -ENOMEM; } - t = strchr(parg->comm, ':'); + t = strchr(arg, ':'); if (t) { - arg[t - parg->comm] = '\0'; - t++; + *t = '\0'; + t2 = strchr(++t, '['); + if (t2) { + *t2 = '\0'; + parg->count = simple_strtoul(t2 + 1, &t2, 0); + if (strcmp(t2, "]") || parg->count == 0) + return -EINVAL; + if (parg->count > MAX_ARRAY_LEN) + return -E2BIG; + } } /* * The default type of $comm should be "string", and it can't be * dereferenced. */ if (!t && strcmp(arg, "$comm") == 0) - t = "string"; - parg->type = find_fetch_type(t, ftbl); + parg->type = find_fetch_type("string"); + else + parg->type = find_fetch_type(t); if (!parg->type) { pr_info("Unsupported type: %s\n", t); return -EINVAL; } parg->offset = *size; - *size += parg->type->size; - ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, - is_kprobe, ftbl); - - if (ret >= 0 && t != NULL) - ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); - - if (ret >= 0) { - parg->fetch_size.fn = get_fetch_size_function(parg->type, - parg->fetch.fn, - ftbl); - parg->fetch_size.data = parg->fetch.data; + *size += parg->type->size * (parg->count ?: 1); + + if (parg->count) { + len = strlen(parg->type->fmttype) + 6; + parg->fmt = kmalloc(len, GFP_KERNEL); + if (!parg->fmt) + return -ENOMEM; + snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, + parg->count); + } + + code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL); + if (!code) + return -ENOMEM; + code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; + + ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], + flags); + if (ret) + goto fail; + + /* Store operation */ + if (!strcmp(parg->type->name, "string")) { + if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && + code->op != FETCH_OP_COMM) { + pr_info("string only accepts memory or address.\n"); + ret = -EINVAL; + goto fail; + } + if (code->op != FETCH_OP_DEREF || parg->count) { + /* + * IMM and COMM is pointing actual address, those must + * be kept, and if parg->count != 0, this is an array + * of string pointers instead of string address itself. + */ + code++; + if (code->op != FETCH_OP_NOP) { + ret = -E2BIG; + goto fail; + } + } + code->op = FETCH_OP_ST_STRING; /* In DEREF case, replace it */ + code->size = parg->type->size; + parg->dynamic = true; + } else if (code->op == FETCH_OP_DEREF) { + code->op = FETCH_OP_ST_MEM; + code->size = parg->type->size; + } else { + code++; + if (code->op != FETCH_OP_NOP) { + ret = -E2BIG; + goto fail; + } + code->op = FETCH_OP_ST_RAW; + code->size = parg->type->size; + } + scode = code; + /* Modify operation */ + if (t != NULL) { + ret = __parse_bitfield_probe_arg(t, parg->type, &code); + if (ret) + goto fail; } + /* Loop(Array) operation */ + if (parg->count) { + if (scode->op != FETCH_OP_ST_MEM && + scode->op != FETCH_OP_ST_STRING) { + pr_info("array only accepts memory or address\n"); + ret = -EINVAL; + goto fail; + } + code++; + if (code->op != FETCH_OP_NOP) { + ret = -E2BIG; + goto fail; + } + code->op = FETCH_OP_LP_ARRAY; + code->param = parg->count; + } + code++; + code->op = FETCH_OP_END; + + /* Shrink down the code buffer */ + parg->code = kzalloc(sizeof(*code) * (code - tmp + 1), GFP_KERNEL); + if (!parg->code) + ret = -ENOMEM; + else + memcpy(parg->code, tmp, sizeof(*code) * (code - tmp + 1)); + +fail: + if (ret) { + for (code = tmp; code < tmp + FETCH_INSN_MAX; code++) + if (code->op == FETCH_NOP_SYMBOL) + kfree(code->data); + } + kfree(tmp); return ret; } @@ -586,35 +507,63 @@ int traceprobe_conflict_field_name(const char *name, return 0; } -void traceprobe_update_arg(struct probe_arg *arg) -{ - if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) - update_bitfield_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) - update_deref_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) - update_symbol_cache(arg->fetch.data); -} - void traceprobe_free_probe_arg(struct probe_arg *arg) { - if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) - free_bitfield_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) - free_deref_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) - free_symbol_cache(arg->fetch.data); + struct fetch_insn *code = arg->code; + while (code && code->op != FETCH_OP_END) { + if (code->op == FETCH_NOP_SYMBOL) + kfree(code->data); + code++; + } + kfree(arg->code); kfree(arg->name); kfree(arg->comm); + kfree(arg->fmt); } +int traceprobe_update_arg(struct probe_arg *arg) +{ + struct fetch_insn *code = arg->code; + long offset; + char *tmp; + char c; + int ret = 0; + + while (code && code->op != FETCH_OP_END) { + if (code->op == FETCH_NOP_SYMBOL) { + if (code[1].op != FETCH_OP_IMM) + return -EINVAL; + + tmp = strpbrk(code->data, "+-"); + if (tmp) + c = *tmp; + ret = traceprobe_split_symbol_offset(code->data, + &offset); + if (ret) + return ret; + + code[1].immediate = + (unsigned long)kallsyms_lookup_name(code->data); + if (tmp) + *tmp = c; + if (!code[1].immediate) + return -ENOENT; + code[1].immediate += offset; + } + code++; + } + return 0; +} + +/* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, bool is_return) { - int i; + struct probe_arg *parg; + int i, j; int pos = 0; - const char *fmt, *arg; if (!is_return) { @@ -625,35 +574,51 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; } - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) - pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", - tp->args[i].name, tp->args[i].type->fmt); + parg = tp->args + i; + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=", parg->name); + if (parg->count) { + pos += snprintf(buf + pos, LEN_OR_ZERO, "{%s", + parg->type->fmt); + for (j = 1; j < parg->count; j++) + pos += snprintf(buf + pos, LEN_OR_ZERO, ",%s", + parg->type->fmt); + pos += snprintf(buf + pos, LEN_OR_ZERO, "}"); + } else + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", + parg->type->fmt); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); for (i = 0; i < tp->nr_args; i++) { - if (strcmp(tp->args[i].type->name, "string") == 0) + parg = tp->args + i; + if (parg->count) { + if (strcmp(parg->type->name, "string") == 0) + fmt = ", __get_str(%s[%d])"; + else + fmt = ", REC->%s[%d]"; + for (j = 0; j < parg->count; j++) + pos += snprintf(buf + pos, LEN_OR_ZERO, + fmt, parg->name, j); + } else { + if (strcmp(parg->type->name, "string") == 0) + fmt = ", __get_str(%s)"; + else + fmt = ", REC->%s"; pos += snprintf(buf + pos, LEN_OR_ZERO, - ", __get_str(%s)", - tp->args[i].name); - else - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tp->args[i].name); + fmt, parg->name); + } } -#undef LEN_OR_ZERO - /* return the length of print_fmt */ return pos; } +#undef LEN_OR_ZERO -int set_print_fmt(struct trace_probe *tp, bool is_return) +int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return) { int len; char *print_fmt; @@ -670,3 +635,28 @@ int set_print_fmt(struct trace_probe *tp, bool is_return) return 0; } + +int traceprobe_define_arg_fields(struct trace_event_call *event_call, + size_t offset, struct trace_probe *tp) +{ + int ret, i; + + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) { + struct probe_arg *parg = &tp->args[i]; + const char *fmt = parg->type->fmttype; + int size = parg->type->size; + + if (parg->fmt) + fmt = parg->fmt; + if (parg->count) + size *= parg->count; + ret = trace_define_field(event_call, fmt, parg->name, + offset + parg->offset, size, + parg->type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5f52668e165d..974afc1a3e73 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "trace.h" @@ -30,6 +31,7 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 +#define MAX_ARRAY_LEN 64 #define MAX_STRING_SIZE PATH_MAX /* Reserved field names */ @@ -54,50 +56,74 @@ #define TP_FLAG_PROFILE 2 #define TP_FLAG_REGISTERED 4 +/* data_loc: data location, compatible with u32 */ +#define make_data_loc(len, offs) \ + (((u32)(len) << 16) | ((u32)(offs) & 0xffff)) +#define get_loc_len(dl) ((u32)(dl) >> 16) +#define get_loc_offs(dl) ((u32)(dl) & 0xffff) -/* data_rloc: data relative location, compatible with u32 */ -#define make_data_rloc(len, roffs) \ - (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) -#define get_rloc_len(dl) ((u32)(dl) >> 16) -#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) - -/* - * Convert data_rloc to data_loc: - * data_rloc stores the offset from data_rloc itself, but data_loc - * stores the offset from event entry. - */ -#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) - -static nokprobe_inline void *get_rloc_data(u32 *dl) +static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) { - return (u8 *)dl + get_rloc_offs(*dl); + return (u8 *)ent + get_loc_offs(*dl); } -/* For data_loc conversion */ -static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) +static nokprobe_inline u32 update_data_loc(u32 loc, int consumed) { - return (u8 *)ent + get_rloc_offs(*dl); + u32 maxlen = get_loc_len(loc); + u32 offset = get_loc_offs(loc); + + return make_data_loc(maxlen - consumed, offset + consumed); } -/* Data fetch function type */ -typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); /* Printing function type */ -typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *); - -/* Fetch types */ -enum { - FETCH_MTD_reg = 0, - FETCH_MTD_stack, - FETCH_MTD_retval, - FETCH_MTD_comm, - FETCH_MTD_memory, - FETCH_MTD_symbol, - FETCH_MTD_deref, - FETCH_MTD_bitfield, - FETCH_MTD_file_offset, - FETCH_MTD_END, +typedef int (*print_type_func_t)(struct trace_seq *, void *, void *); + +enum fetch_op { + FETCH_OP_NOP = 0, + // Stage 1 (load) ops + FETCH_OP_REG, /* Register : .param = offset */ + FETCH_OP_STACK, /* Stack : .param = index */ + FETCH_OP_STACKP, /* Stack pointer */ + FETCH_OP_RETVAL, /* Return value */ + FETCH_OP_IMM, /* Immediate : .immediate */ + FETCH_OP_COMM, /* Current comm */ + FETCH_OP_ARG, /* Function argument : .param */ + FETCH_OP_FOFFS, /* File offset: .immediate */ + // Stage 2 (dereference) op + FETCH_OP_DEREF, /* Dereference: .offset */ + // Stage 3 (store) ops + FETCH_OP_ST_RAW, /* Raw: .size */ + FETCH_OP_ST_MEM, /* Mem: .offset, .size */ + FETCH_OP_ST_STRING, /* String: .offset, .size */ + // Stage 4 (modify) op + FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ + // Stage 5 (loop) op + FETCH_OP_LP_ARRAY, /* Array: .param = loop count */ + FETCH_OP_END, + FETCH_NOP_SYMBOL, /* Unresolved Symbol holder */ }; +struct fetch_insn { + enum fetch_op op; + union { + unsigned int param; + struct { + unsigned int size; + int offset; + }; + struct { + unsigned char basesize; + unsigned char lshift; + unsigned char rshift; + }; + unsigned long immediate; + void *data; + }; +}; + +/* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */ +#define FETCH_INSN_MAX 16 + /* Fetch type information table */ struct fetch_type { const char *name; /* Name of type */ @@ -106,13 +132,6 @@ struct fetch_type { print_type_func_t print; /* Print functions */ const char *fmt; /* Fromat string */ const char *fmttype; /* Name in format file */ - /* Fetch functions */ - fetch_func_t fetch[FETCH_MTD_END]; -}; - -struct fetch_param { - fetch_func_t fn; - void *data; }; /* For defining macros, define string/string_size types */ @@ -124,8 +143,7 @@ typedef u32 string_size; /* Printing in basic type function template */ #define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ -int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ - void *data, void *ent); \ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, void *data, void *ent);\ extern const char PRINT_TYPE_FMT_NAME(type)[] DECLARE_BASIC_PRINT_TYPE_FUNC(u8); @@ -142,57 +160,7 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(x32); DECLARE_BASIC_PRINT_TYPE_FUNC(x64); DECLARE_BASIC_PRINT_TYPE_FUNC(string); - -#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type - -/* Declare macro for basic types */ -#define DECLARE_FETCH_FUNC(method, type) \ -extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \ - void *data, void *dest) - -#define DECLARE_BASIC_FETCH_FUNCS(method) \ -DECLARE_FETCH_FUNC(method, u8); \ -DECLARE_FETCH_FUNC(method, u16); \ -DECLARE_FETCH_FUNC(method, u32); \ -DECLARE_FETCH_FUNC(method, u64) - -DECLARE_BASIC_FETCH_FUNCS(reg); -#define fetch_reg_string NULL -#define fetch_reg_string_size NULL - -DECLARE_BASIC_FETCH_FUNCS(retval); -#define fetch_retval_string NULL -#define fetch_retval_string_size NULL - -DECLARE_BASIC_FETCH_FUNCS(symbol); -DECLARE_FETCH_FUNC(symbol, string); -DECLARE_FETCH_FUNC(symbol, string_size); - -DECLARE_BASIC_FETCH_FUNCS(deref); -DECLARE_FETCH_FUNC(deref, string); -DECLARE_FETCH_FUNC(deref, string_size); - -DECLARE_BASIC_FETCH_FUNCS(bitfield); -#define fetch_bitfield_string NULL -#define fetch_bitfield_string_size NULL - -/* comm only makes sense as a string */ -#define fetch_comm_u8 NULL -#define fetch_comm_u16 NULL -#define fetch_comm_u32 NULL -#define fetch_comm_u64 NULL -DECLARE_FETCH_FUNC(comm, string); -DECLARE_FETCH_FUNC(comm, string_size); - -/* - * Define macro for basic types - we don't need to define s* types, because - * we have to care only about bitwidth at recording time. - */ -#define DEFINE_BASIC_FETCH_FUNCS(method) \ -DEFINE_FETCH_##method(u8) \ -DEFINE_FETCH_##method(u16) \ -DEFINE_FETCH_##method(u32) \ -DEFINE_FETCH_##method(u64) +DECLARE_BASIC_PRINT_TYPE_FUNC(symbol); /* Default (unsigned long) fetch type */ #define __DEFAULT_FETCH_TYPE(t) x##t @@ -200,8 +168,9 @@ DEFINE_FETCH_##method(u64) #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) -#define ASSIGN_FETCH_FUNC(method, type) \ - [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) +#define __ADDR_FETCH_TYPE(t) u##t +#define _ADDR_FETCH_TYPE(t) __ADDR_FETCH_TYPE(t) +#define ADDR_FETCH_TYPE _ADDR_FETCH_TYPE(BITS_PER_LONG) #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ {.name = _name, \ @@ -210,64 +179,23 @@ DEFINE_FETCH_##method(u64) .print = PRINT_TYPE_FUNC_NAME(ptype), \ .fmt = PRINT_TYPE_FMT_NAME(ptype), \ .fmttype = _fmttype, \ - .fetch = { \ -ASSIGN_FETCH_FUNC(reg, ftype), \ -ASSIGN_FETCH_FUNC(stack, ftype), \ -ASSIGN_FETCH_FUNC(retval, ftype), \ -ASSIGN_FETCH_FUNC(comm, ftype), \ -ASSIGN_FETCH_FUNC(memory, ftype), \ -ASSIGN_FETCH_FUNC(symbol, ftype), \ -ASSIGN_FETCH_FUNC(deref, ftype), \ -ASSIGN_FETCH_FUNC(bitfield, ftype), \ -ASSIGN_FETCH_FUNC(file_offset, ftype), \ - } \ } - +#define _ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, #_fmttype) #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ - __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, ptype) /* If ptype is an alias of atype, use this macro (show atype in format) */ #define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign) \ - __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype) + _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, atype) #define ASSIGN_FETCH_TYPE_END {} - -#define FETCH_TYPE_STRING 0 -#define FETCH_TYPE_STRSIZE 1 +#define MAX_ARRAY_LEN 64 #ifdef CONFIG_KPROBE_EVENTS -struct symbol_cache; -unsigned long update_symbol_cache(struct symbol_cache *sc); -void free_symbol_cache(struct symbol_cache *sc); -struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); bool trace_kprobe_on_func_entry(struct trace_event_call *call); bool trace_kprobe_error_injectable(struct trace_event_call *call); #else -/* uprobes do not support symbol fetch methods */ -#define fetch_symbol_u8 NULL -#define fetch_symbol_u16 NULL -#define fetch_symbol_u32 NULL -#define fetch_symbol_u64 NULL -#define fetch_symbol_string NULL -#define fetch_symbol_string_size NULL - -struct symbol_cache { -}; -static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc) -{ - return 0; -} - -static inline void __used free_symbol_cache(struct symbol_cache *sc) -{ -} - -static inline struct symbol_cache * __used -alloc_symbol_cache(const char *sym, long offset) -{ - return NULL; -} - static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call) { return false; @@ -280,11 +208,13 @@ static inline bool trace_kprobe_error_injectable(struct trace_event_call *call) #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { - struct fetch_param fetch; - struct fetch_param fetch_size; + struct fetch_insn *code; + bool dynamic;/* Dynamic array (string) is used */ unsigned int offset; /* Offset from argument entry */ + unsigned int count; /* Array count */ const char *name; /* Name of this argument */ const char *comm; /* Command of this argument */ + char *fmt; /* Format string if needed */ const struct fetch_type *type; /* Type of this argument */ }; @@ -313,12 +243,6 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp) return !!(tp->flags & TP_FLAG_REGISTERED); } -static nokprobe_inline void call_fetch(struct fetch_param *fprm, - struct pt_regs *regs, void *dest) -{ - return fprm->fn(regs, fprm->data, dest); -} - /* Check the name is good for event/group/fields */ static inline bool is_good_name(const char *name) { @@ -343,67 +267,23 @@ find_event_file_link(struct trace_probe *tp, struct trace_event_file *file) return NULL; } +#define TPARG_FL_RETURN BIT(0) +#define TPARG_FL_KERNEL BIT(1) +#define TPARG_FL_FENTRY BIT(2) +#define TPARG_FL_MASK GENMASK(2, 0) + extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe, - const struct fetch_type *ftbl); + struct probe_arg *parg, unsigned int flags); extern int traceprobe_conflict_field_name(const char *name, struct probe_arg *args, int narg); -extern void traceprobe_update_arg(struct probe_arg *arg); +extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); extern int traceprobe_split_symbol_offset(char *symbol, long *offset); -/* Sum up total data length for dynamic arraies (strings) */ -static nokprobe_inline int -__get_data_size(struct trace_probe *tp, struct pt_regs *regs) -{ - int i, ret = 0; - u32 len; - - for (i = 0; i < tp->nr_args; i++) - if (unlikely(tp->args[i].fetch_size.fn)) { - call_fetch(&tp->args[i].fetch_size, regs, &len); - ret += len; - } - - return ret; -} - -/* Store the value of each argument */ -static nokprobe_inline void -store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, - u8 *data, int maxlen) -{ - int i; - u32 end = tp->size; - u32 *dl; /* Data (relative) location */ - - for (i = 0; i < tp->nr_args; i++) { - if (unlikely(tp->args[i].fetch_size.fn)) { - /* - * First, we set the relative location and - * maximum data length to *dl - */ - dl = (u32 *)(data + tp->args[i].offset); - *dl = make_data_rloc(maxlen, end - tp->args[i].offset); - /* Then try to fetch string or dynamic array data */ - call_fetch(&tp->args[i].fetch, regs, dl); - /* Reduce maximum length */ - end += get_rloc_len(*dl); - maxlen -= get_rloc_len(*dl); - /* Trick here, convert data_rloc to data_loc */ - *dl = convert_rloc_to_loc(*dl, - ent_size + tp->args[i].offset); - } else - /* Just fetching data normally */ - call_fetch(&tp->args[i].fetch, regs, - data + tp->args[i].offset); - } -} - -extern int set_print_fmt(struct trace_probe *tp, bool is_return); +extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); #ifdef CONFIG_PERF_EVENTS extern struct trace_event_call * @@ -412,6 +292,9 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, extern void destroy_local_trace_kprobe(struct trace_event_call *event_call); extern struct trace_event_call * -create_local_trace_uprobe(char *name, unsigned long offs, bool is_return); +create_local_trace_uprobe(char *name, unsigned long offs, + unsigned long ref_ctr_offset, bool is_return); extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); #endif +extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, + size_t offset, struct trace_probe *tp); diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h new file mode 100644 index 000000000000..5c56afc17cf8 --- /dev/null +++ b/kernel/trace/trace_probe_tmpl.h @@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Traceprobe fetch helper inlines + */ + +static nokprobe_inline void +fetch_store_raw(unsigned long val, struct fetch_insn *code, void *buf) +{ + switch (code->size) { + case 1: + *(u8 *)buf = (u8)val; + break; + case 2: + *(u16 *)buf = (u16)val; + break; + case 4: + *(u32 *)buf = (u32)val; + break; + case 8: + //TBD: 32bit signed + *(u64 *)buf = (u64)val; + break; + default: + *(unsigned long *)buf = val; + } +} + +static nokprobe_inline void +fetch_apply_bitfield(struct fetch_insn *code, void *buf) +{ + switch (code->basesize) { + case 1: + *(u8 *)buf <<= code->lshift; + *(u8 *)buf >>= code->rshift; + break; + case 2: + *(u16 *)buf <<= code->lshift; + *(u16 *)buf >>= code->rshift; + break; + case 4: + *(u32 *)buf <<= code->lshift; + *(u32 *)buf >>= code->rshift; + break; + case 8: + *(u64 *)buf <<= code->lshift; + *(u64 *)buf >>= code->rshift; + break; + } +} + +/* + * These functions must be defined for each callsite. + * Return consumed dynamic data size (>= 0), or error (< 0). + * If dest is NULL, don't store result and return required dynamic data size. + */ +static int +process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, + void *dest, void *base); +static nokprobe_inline int fetch_store_strlen(unsigned long addr); +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base); +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size); + +/* From the 2nd stage, routine is same */ +static nokprobe_inline int +process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, + void *dest, void *base) +{ + struct fetch_insn *s3 = NULL; + int total = 0, ret = 0, i = 0; + u32 loc = 0; + unsigned long lval = val; + +stage2: + /* 2nd stage: dereference memory if needed */ + while (code->op == FETCH_OP_DEREF) { + lval = val; + ret = probe_mem_read(&val, (void *)val + code->offset, + sizeof(val)); + if (ret) + return ret; + code++; + } + + s3 = code; +stage3: + /* 3rd stage: store value to buffer */ + if (unlikely(!dest)) { + if (code->op == FETCH_OP_ST_STRING) { + ret += fetch_store_strlen(val + code->offset); + code++; + goto array; + } else + return -EILSEQ; + } + + switch (code->op) { + case FETCH_OP_ST_RAW: + fetch_store_raw(val, code, dest); + break; + case FETCH_OP_ST_MEM: + probe_mem_read(dest, (void *)val + code->offset, code->size); + break; + case FETCH_OP_ST_STRING: + loc = *(u32 *)dest; + ret = fetch_store_string(val + code->offset, dest, base); + break; + default: + return -EILSEQ; + } + code++; + + /* 4th stage: modify stored value if needed */ + if (code->op == FETCH_OP_MOD_BF) { + fetch_apply_bitfield(code, dest); + code++; + } + +array: + /* the last stage: Loop on array */ + if (code->op == FETCH_OP_LP_ARRAY) { + total += ret; + if (++i < code->param) { + code = s3; + if (s3->op != FETCH_OP_ST_STRING) { + dest += s3->size; + val += s3->size; + goto stage3; + } + code--; + val = lval + sizeof(char *); + if (dest) { + dest += sizeof(u32); + *(u32 *)dest = update_data_loc(loc, ret); + } + goto stage2; + } + code++; + ret = total; + } + + return code->op == FETCH_OP_END ? ret : -EILSEQ; +} + +/* Sum up total data length for dynamic arraies (strings) */ +static nokprobe_inline int +__get_data_size(struct trace_probe *tp, struct pt_regs *regs) +{ + struct probe_arg *arg; + int i, len, ret = 0; + + for (i = 0; i < tp->nr_args; i++) { + arg = tp->args + i; + if (unlikely(arg->dynamic)) { + len = process_fetch_insn(arg->code, regs, NULL, NULL); + if (len > 0) + ret += len; + } + } + + return ret; +} + +/* Store the value of each argument */ +static nokprobe_inline void +store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, + int header_size, int maxlen) +{ + struct probe_arg *arg; + void *base = data - header_size; + void *dyndata = data + tp->size; + u32 *dl; /* Data location */ + int ret, i; + + for (i = 0; i < tp->nr_args; i++) { + arg = tp->args + i; + dl = data + arg->offset; + /* Point the dynamic data area if needed */ + if (unlikely(arg->dynamic)) + *dl = make_data_loc(maxlen, dyndata - base); + ret = process_fetch_insn(arg->code, regs, dl, base); + if (unlikely(ret < 0 && arg->dynamic)) + *dl = make_data_loc(0, dyndata - base); + else + dyndata += ret; + } +} + +static inline int +print_probe_args(struct trace_seq *s, struct probe_arg *args, int nr_args, + u8 *data, void *field) +{ + void *p; + int i, j; + + for (i = 0; i < nr_args; i++) { + struct probe_arg *a = args + i; + + trace_seq_printf(s, " %s=", a->name); + if (likely(!a->count)) { + if (!a->type->print(s, data + a->offset, field)) + return -ENOMEM; + continue; + } + trace_seq_putc(s, '{'); + p = data + a->offset; + for (j = 0; j < a->count; j++) { + if (!a->type->print(s, p, field)) + return -ENOMEM; + trace_seq_putc(s, j == a->count - 1 ? '}' : ','); + p += a->type->size; + } + } + return 0; +} diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4237eba4ef20..2b0d1ee3241c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -111,7 +111,7 @@ check_stack(unsigned long ip, unsigned long *stack) stack_trace_max_size = this_size; stack_trace_max.nr_entries = 0; - stack_trace_max.skip = 3; + stack_trace_max.skip = 0; save_stack_trace(&stack_trace_max); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e696667da29a..31ea48eceda1 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -15,6 +15,7 @@ #include #include "trace_probe.h" +#include "trace_probe_tmpl.h" #define UPROBE_EVENT_SYSTEM "uprobes" @@ -47,6 +48,7 @@ struct trace_uprobe { struct inode *inode; char *filename; unsigned long offset; + unsigned long ref_ctr_offset; unsigned long nhit; struct trace_probe tp; }; @@ -98,74 +100,52 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) /* * Uprobes-specific fetch functions */ -#define DEFINE_FETCH_stack(type) \ -static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)get_user_stack_nth(regs, \ - ((unsigned long)offset)); \ -} -DEFINE_BASIC_FETCH_FUNCS(stack) -/* No string on the stack entry */ -#define fetch_stack_string NULL -#define fetch_stack_string_size NULL - -#define DEFINE_FETCH_memory(type) \ -static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ - void *addr, void *dest) \ -{ \ - type retval; \ - void __user *vaddr = (void __force __user *) addr; \ - \ - if (copy_from_user(&retval, vaddr, sizeof(type))) \ - *(type *)dest = 0; \ - else \ - *(type *) dest = retval; \ +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ + void __user *vaddr = (void __force __user *)src; + + return copy_from_user(dest, vaddr, size) ? -EFAULT : 0; } -DEFINE_BASIC_FETCH_FUNCS(memory) /* * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max * length and relative data location. */ -static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base) { long ret; - u32 rloc = *(u32 *)dest; - int maxlen = get_rloc_len(rloc); - u8 *dst = get_rloc_data(dest); + u32 loc = *(u32 *)dest; + int maxlen = get_loc_len(loc); + u8 *dst = get_loc_data(dest, base); void __user *src = (void __force __user *) addr; - if (!maxlen) - return; + if (unlikely(!maxlen)) + return -ENOMEM; ret = strncpy_from_user(dst, src, maxlen); - if (ret == maxlen) - dst[--ret] = '\0'; - - if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; - *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc)); - } else { - *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc)); + if (ret >= 0) { + if (ret == maxlen) + dst[ret - 1] = '\0'; + *(u32 *)dest = make_data_loc(ret, (void *)dst - base); } + + return ret; } -static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen(unsigned long addr) { int len; void __user *vaddr = (void __force __user *) addr; len = strnlen_user(vaddr, MAX_STRING_SIZE); - if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */ - *(u32 *)dest = 0; - else - *(u32 *)dest = len; + return (len > MAX_STRING_SIZE) ? 0 : len; } -static unsigned long translate_user_vaddr(void *file_offset) +static unsigned long translate_user_vaddr(unsigned long file_offset) { unsigned long base_addr; struct uprobe_dispatch_data *udd; @@ -173,44 +153,44 @@ static unsigned long translate_user_vaddr(void *file_offset) udd = (void *) current->utask->vaddr; base_addr = udd->bp_addr - udd->tu->offset; - return base_addr + (unsigned long)file_offset; + return base_addr + file_offset; } -#define DEFINE_FETCH_file_offset(type) \ -static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \ - void *offset, void *dest)\ -{ \ - void *vaddr = (void *)translate_user_vaddr(offset); \ - \ - FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \ +/* Note that we don't verify it, since the code does not come from user space */ +static int +process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, + void *base) +{ + unsigned long val; + + /* 1st stage: get value from context */ + switch (code->op) { + case FETCH_OP_REG: + val = regs_get_register(regs, code->param); + break; + case FETCH_OP_STACK: + val = get_user_stack_nth(regs, code->param); + break; + case FETCH_OP_STACKP: + val = user_stack_pointer(regs); + break; + case FETCH_OP_RETVAL: + val = regs_return_value(regs); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_FOFFS: + val = translate_user_vaddr(code->immediate); + break; + default: + return -EILSEQ; + } + code++; + + return process_fetch_insn_bottom(code, val, dest, base); } -DEFINE_BASIC_FETCH_FUNCS(file_offset) -DEFINE_FETCH_file_offset(string) -DEFINE_FETCH_file_offset(string_size) - -/* Fetch type information table */ -static const struct fetch_type uprobes_fetch_type_table[] = { - /* Special types */ - [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, - sizeof(u32), 1, "__data_loc char[]"), - [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, - string_size, sizeof(u32), 0, "u32"), - /* Basic types */ - ASSIGN_FETCH_TYPE(u8, u8, 0), - ASSIGN_FETCH_TYPE(u16, u16, 0), - ASSIGN_FETCH_TYPE(u32, u32, 0), - ASSIGN_FETCH_TYPE(u64, u64, 0), - ASSIGN_FETCH_TYPE(s8, u8, 1), - ASSIGN_FETCH_TYPE(s16, u16, 1), - ASSIGN_FETCH_TYPE(s32, u32, 1), - ASSIGN_FETCH_TYPE(s64, u64, 1), - ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), - ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), - ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), - ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), - - ASSIGN_FETCH_TYPE_END -}; +NOKPROBE_SYMBOL(process_fetch_insn) static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) { @@ -311,6 +291,35 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) return 0; } +/* + * Uprobe with multiple reference counter is not allowed. i.e. + * If inode and offset matches, reference counter offset *must* + * match as well. Though, there is one exception: If user is + * replacing old trace_uprobe with new one(same group/event), + * then we allow same uprobe with new reference counter as far + * as the new one does not conflict with any other existing + * ones. + */ +static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new) +{ + struct trace_uprobe *tmp, *old = NULL; + struct inode *new_inode = d_real_inode(new->path.dentry); + + old = find_probe_event(trace_event_name(&new->tp.call), + new->tp.call.class->system); + + list_for_each_entry(tmp, &uprobe_list, list) { + if ((old ? old != tmp : true) && + new_inode == d_real_inode(tmp->path.dentry) && + new->offset == tmp->offset && + new->ref_ctr_offset != tmp->ref_ctr_offset) { + pr_warn("Reference counter offset mismatch."); + return ERR_PTR(-EINVAL); + } + } + return old; +} + /* Register a trace_uprobe and probe_event */ static int register_trace_uprobe(struct trace_uprobe *tu) { @@ -320,8 +329,12 @@ static int register_trace_uprobe(struct trace_uprobe *tu) mutex_lock(&uprobe_lock); /* register as an event */ - old_tu = find_probe_event(trace_event_name(&tu->tp.call), - tu->tp.call.class->system); + old_tu = find_old_trace_uprobe(tu); + if (IS_ERR(old_tu)) { + ret = PTR_ERR(old_tu); + goto end; + } + if (old_tu) { /* delete old event */ ret = unregister_trace_uprobe(old_tu); @@ -352,10 +365,10 @@ end: static int create_trace_uprobe(int argc, char **argv) { struct trace_uprobe *tu; - char *arg, *event, *group, *filename; + char *arg, *event, *group, *filename, *rctr, *rctr_end; char buf[MAX_EVENT_NAME_LEN]; struct path path; - unsigned long offset; + unsigned long offset, ref_ctr_offset; bool is_delete, is_return; int i, ret; @@ -364,6 +377,7 @@ static int create_trace_uprobe(int argc, char **argv) is_return = false; event = NULL; group = NULL; + ref_ctr_offset = 0; /* argc must be >= 1 */ if (argv[0][0] == '-') @@ -438,6 +452,26 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; } + /* Parse reference counter offset if specified. */ + rctr = strchr(arg, '('); + if (rctr) { + rctr_end = strchr(rctr, ')'); + if (rctr > rctr_end || *(rctr_end + 1) != 0) { + ret = -EINVAL; + pr_info("Invalid reference counter offset.\n"); + goto fail_address_parse; + } + + *rctr++ = '\0'; + *rctr_end = '\0'; + ret = kstrtoul(rctr, 0, &ref_ctr_offset); + if (ret) { + pr_info("Invalid reference counter offset.\n"); + goto fail_address_parse; + } + } + + /* Parse uprobe offset. */ ret = kstrtoul(arg, 0, &offset); if (ret) goto fail_address_parse; @@ -472,6 +506,7 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; } tu->offset = offset; + tu->ref_ctr_offset = ref_ctr_offset; tu->path = path; tu->filename = kstrdup(filename, GFP_KERNEL); @@ -522,8 +557,7 @@ static int create_trace_uprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, - is_return, false, - uprobes_fetch_type_table); + is_return ? TPARG_FL_RETURN : 0); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; @@ -590,6 +624,9 @@ static int probes_seq_show(struct seq_file *m, void *v) trace_event_name(&tu->tp.call), tu->filename, (int)(sizeof(void *) * 2), tu->offset); + if (tu->ref_ctr_offset) + seq_printf(m, "(0x%lx)", tu->ref_ctr_offset); + for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); @@ -833,7 +870,6 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e struct trace_seq *s = &iter->seq; struct trace_uprobe *tu; u8 *data; - int i; entry = (struct uprobe_trace_entry_head *)iter->ent; tu = container_of(event, struct trace_uprobe, tp.call.event); @@ -850,12 +886,8 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e data = DATAOF_TRACE_ENTRY(entry, false); } - for (i = 0; i < tu->tp.nr_args; i++) { - struct probe_arg *parg = &tu->tp.args[i]; - - if (!parg->type->print(s, parg->name, data + parg->offset, entry)) - goto out; - } + if (print_probe_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0) + goto out; trace_seq_putc(s, '\n'); @@ -905,7 +937,13 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, tu->consumer.filter = filter; tu->inode = d_real_inode(tu->path.dentry); - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + if (tu->ref_ctr_offset) { + ret = uprobe_register_refctr(tu->inode, tu->offset, + tu->ref_ctr_offset, &tu->consumer); + } else { + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + } + if (ret) goto err_buffer; @@ -958,7 +996,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) static int uprobe_event_define_fields(struct trace_event_call *event_call) { - int ret, i, size; + int ret, size; struct uprobe_trace_entry_head field; struct trace_uprobe *tu = event_call->data; @@ -970,19 +1008,8 @@ static int uprobe_event_define_fields(struct trace_event_call *event_call) DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); size = SIZEOF_TRACE_ENTRY(false); } - /* Set argument names as fields */ - for (i = 0; i < tu->tp.nr_args; i++) { - struct probe_arg *parg = &tu->tp.args[i]; - - ret = trace_define_field(event_call, parg->type->fmttype, - parg->name, size + parg->offset, - parg->type->size, parg->type->is_signed, - FILTER_OTHER); - if (ret) - return ret; - } - return 0; + return traceprobe_define_arg_fields(event_call, size, &tu->tp); } #ifdef CONFIG_PERF_EVENTS @@ -1233,7 +1260,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); ucb = uprobe_buffer_get(); - store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); if (tu->tp.flags & TP_FLAG_TRACE) ret |= uprobe_trace_func(tu, regs, ucb, dsize); @@ -1268,7 +1295,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); ucb = uprobe_buffer_get(); - store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); if (tu->tp.flags & TP_FLAG_TRACE) uretprobe_trace_func(tu, func, regs, ucb, dsize); @@ -1304,7 +1331,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) init_trace_event_call(tu, call); - if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) + if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) return -ENOMEM; ret = register_trace_event(&call->event); @@ -1340,7 +1367,8 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) #ifdef CONFIG_PERF_EVENTS struct trace_event_call * -create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) +create_local_trace_uprobe(char *name, unsigned long offs, + unsigned long ref_ctr_offset, bool is_return) { struct trace_uprobe *tu; struct path path; @@ -1372,10 +1400,11 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) tu->offset = offs; tu->path = path; + tu->ref_ctr_offset = ref_ctr_offset; tu->filename = kstrdup(name, GFP_KERNEL); init_trace_event_call(tu, &tu->tp.call); - if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { + if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { ret = -ENOMEM; goto error; } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index bf2c06ef9afc..a3be42304485 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -28,8 +28,8 @@ #include #include -extern struct tracepoint * const __start___tracepoints_ptrs[]; -extern struct tracepoint * const __stop___tracepoints_ptrs[]; +extern tracepoint_ptr_t __start___tracepoints_ptrs[]; +extern tracepoint_ptr_t __stop___tracepoints_ptrs[]; DEFINE_SRCU(tracepoint_srcu); EXPORT_SYMBOL_GPL(tracepoint_srcu); @@ -371,25 +371,17 @@ int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data) } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); -static void for_each_tracepoint_range(struct tracepoint * const *begin, - struct tracepoint * const *end, +static void for_each_tracepoint_range( + tracepoint_ptr_t *begin, tracepoint_ptr_t *end, void (*fct)(struct tracepoint *tp, void *priv), void *priv) { + tracepoint_ptr_t *iter; + if (!begin) return; - - if (IS_ENABLED(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS)) { - const int *iter; - - for (iter = (const int *)begin; iter < (const int *)end; iter++) - fct(offset_to_ptr(iter), priv); - } else { - struct tracepoint * const *iter; - - for (iter = begin; iter < end; iter++) - fct(*iter, priv); - } + for (iter = begin; iter < end; iter++) + fct(tracepoint_ptr_deref(iter), priv); } #ifdef CONFIG_MODULES diff --git a/kernel/umh.c b/kernel/umh.c index c449858946af..0baa672e023c 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -405,11 +405,19 @@ struct subprocess_info *call_usermodehelper_setup_file(struct file *file, void (*cleanup)(struct subprocess_info *info), void *data) { struct subprocess_info *sub_info; + struct umh_info *info = data; + const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL); if (!sub_info) return NULL; + sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL); + if (!sub_info->argv) { + kfree(sub_info); + return NULL; + } + INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); sub_info->path = "none"; sub_info->file = file; @@ -458,10 +466,11 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) return 0; } -static void umh_save_pid(struct subprocess_info *info) +static void umh_clean_and_save_pid(struct subprocess_info *info) { struct umh_info *umh_info = info->data; + argv_free(info->argv); umh_info->pid = info->pid; } @@ -471,6 +480,9 @@ static void umh_save_pid(struct subprocess_info *info) * @len: length of the blob * @info: information about usermode process (shouldn't be NULL) * + * If info->cmdline is set it will be used as command line for the + * user process, else "usermodehelper" is used. + * * Returns either negative error or zero which indicates success * in executing a blob of bytes as a usermode process. In such * case 'struct umh_info *info' is populated with two pipes @@ -500,7 +512,7 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) err = -ENOMEM; sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup, - umh_save_pid, info); + umh_clean_and_save_pid, info); if (!sub_info) goto out; diff --git a/kernel/up.c b/kernel/up.c index 42c46bf3e0a5..ff536f9cc8a2 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask); * Preemption is disabled here to make sure the cond_func is called under the * same condtions in UP and SMP. */ -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags) +void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags, const struct cpumask *mask) { unsigned long flags; @@ -82,6 +82,14 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), } preempt_enable(); } +EXPORT_SYMBOL(on_each_cpu_cond_mask); + +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL); +} EXPORT_SYMBOL(on_each_cpu_cond); int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e5222b5fb4fe..923414a246e9 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -974,10 +974,6 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) goto out; - ret = sort_idmaps(&new_map); - if (ret < 0) - goto out; - ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. @@ -1004,6 +1000,14 @@ static ssize_t map_write(struct file *file, const char __user *buf, e->lower_first = lower_first; } + /* + * If we want to use binary search for lookup, this clones the extent + * array and sorts both copies. + */ + ret = sort_idmaps(&new_map); + if (ret < 0) + goto out; + /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, diff --git a/lib/Kconfig b/lib/Kconfig index a3928d4438b5..a9965f4af4dd 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -399,8 +399,11 @@ config INTERVAL_TREE for more information. -config RADIX_TREE_MULTIORDER +config XARRAY_MULTI bool + help + Support entries which occupy multiple consecutive indices in the + XArray. config ASSOCIATIVE_ARRAY bool diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4966c4fbe7f7..1af29b8224fd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1179,7 +1179,7 @@ config LOCKDEP bool depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT select STACKTRACE - select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !X86 + select FRAME_POINTER if !MIPS && !PPC && !ARM && !S390 && !MICROBLAZE && !ARC && !X86 select KALLSYMS select KALLSYMS_ALL @@ -1292,7 +1292,7 @@ config DEBUG_KOBJECT depends on DEBUG_KERNEL help If you say Y here, some extra kobject debugging messages will be sent - to the syslog. + to the syslog. config DEBUG_KOBJECT_RELEASE bool "kobject release debugging" @@ -1590,7 +1590,7 @@ config FAULT_INJECTION_STACKTRACE_FILTER depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT depends on !X86_64 select STACKTRACE - select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !X86 + select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM && !ARC && !X86 help Provide stacktrace filter for fault-injection capabilities @@ -1599,7 +1599,7 @@ config LATENCYTOP depends on DEBUG_KERNEL depends on STACKTRACE_SUPPORT depends on PROC_FS - select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !X86 + select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM && !ARC && !X86 select KALLSYMS select KALLSYMS_ALL select STACKTRACE @@ -1813,6 +1813,9 @@ config TEST_BITFIELD config TEST_UUID tristate "Test functions located in the uuid module at runtime" +config TEST_XARRAY + tristate "Test the XArray code at runtime" + config TEST_OVERFLOW tristate "Test check_*_overflow() functions at runtime" @@ -1965,11 +1968,18 @@ config TEST_DEBUG_VIRTUAL If unsure, say N. +config TEST_MEMCAT_P + tristate "Test memcat_p() helper function" + help + Test the memcat_p() helper for correctly merging two + pointer arrays together. + + If unsure, say N. + endif # RUNTIME_TESTING_MENU config MEMTEST bool "Memtest" - depends on HAVE_MEMBLOCK ---help--- This option adds a kernel parameter 'memtest', which allows memtest to be set. diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index befb127507c0..d0bad1bd9a2b 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -57,6 +57,15 @@ config KASAN_INLINE endchoice +config KASAN_S390_4_LEVEL_PAGING + bool "KASan: use 4-level paging" + depends on KASAN && S390 + help + Compiling the kernel with KASan disables automatic 3-level vs + 4-level paging selection. 3-level paging is used by default (up + to 3TB of RAM with KASan enabled). This options allows to force + 4-level paging instead. + config TEST_KASAN tristate "Module for testing kasan for bug detection" depends on m && KASAN diff --git a/lib/Makefile b/lib/Makefile index ca3f7ebb900d..db06d1237898 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -18,13 +18,13 @@ KCOV_INSTRUMENT_debugobjects.o := n KCOV_INSTRUMENT_dynamic_debug.o := n lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o timerqueue.o\ + rbtree.o radix-tree.o timerqueue.o xarray.o \ idr.o int_sqrt.o extable.o \ sha1.o chacha20.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ - nmi_backtrace.o nodemask.o win_minmax.o + nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o lib-$(CONFIG_PRINTK) += dump_stack.o lib-$(CONFIG_MMU) += ioremap.o @@ -53,7 +53,9 @@ obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_IDA) += test_ida.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o CFLAGS_test_kasan.o += -fno-builtin +CFLAGS_test_kasan.o += $(call cc-disable-warning, vla) obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o +CFLAGS_test_ubsan.o += $(call cc-disable-warning, vla) UBSAN_SANITIZE_test_ubsan.o := y obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o @@ -68,9 +70,11 @@ obj-$(CONFIG_TEST_PRINTF) += test_printf.o obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o obj-$(CONFIG_TEST_BITFIELD) += test_bitfield.o obj-$(CONFIG_TEST_UUID) += test_uuid.o +obj-$(CONFIG_TEST_XARRAY) += test_xarray.o obj-$(CONFIG_TEST_PARMAN) += test_parman.o obj-$(CONFIG_TEST_KMOD) += test_kmod.o obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o +obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG @@ -119,7 +123,6 @@ obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/ obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_BCH) += bch.o -CFLAGS_bch.o := $(call cc-option,-Wframe-larger-than=4500) obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ obj-$(CONFIG_LZ4_COMPRESS) += lz4/ diff --git a/lib/bch.c b/lib/bch.c index 7b0f2006698b..5db6d3a4c8a6 100644 --- a/lib/bch.c +++ b/lib/bch.c @@ -79,20 +79,19 @@ #define GF_T(_p) (CONFIG_BCH_CONST_T) #define GF_N(_p) ((1 << (CONFIG_BCH_CONST_M))-1) #define BCH_MAX_M (CONFIG_BCH_CONST_M) +#define BCH_MAX_T (CONFIG_BCH_CONST_T) #else #define GF_M(_p) ((_p)->m) #define GF_T(_p) ((_p)->t) #define GF_N(_p) ((_p)->n) -#define BCH_MAX_M 15 +#define BCH_MAX_M 15 /* 2KB */ +#define BCH_MAX_T 64 /* 64 bit correction */ #endif -#define BCH_MAX_T (((1 << BCH_MAX_M) - 1) / BCH_MAX_M) - #define BCH_ECC_WORDS(_p) DIV_ROUND_UP(GF_M(_p)*GF_T(_p), 32) #define BCH_ECC_BYTES(_p) DIV_ROUND_UP(GF_M(_p)*GF_T(_p), 8) #define BCH_ECC_MAX_WORDS DIV_ROUND_UP(BCH_MAX_M * BCH_MAX_T, 32) -#define BCH_ECC_MAX_BYTES DIV_ROUND_UP(BCH_MAX_M * BCH_MAX_T, 8) #ifndef dbg #define dbg(_fmt, args...) do {} while (0) @@ -202,6 +201,9 @@ void encode_bch(struct bch_control *bch, const uint8_t *data, const uint32_t * const tab3 = tab2 + 256*(l+1); const uint32_t *pdata, *p0, *p1, *p2, *p3; + if (WARN_ON(r_bytes > sizeof(r))) + return; + if (ecc) { /* load ecc parity bytes into internal 32-bit buffer */ load_ecc8(bch, bch->ecc_buf, ecc); @@ -1285,6 +1287,13 @@ struct bch_control *init_bch(int m, int t, unsigned int prim_poly) */ goto fail; + if (t > BCH_MAX_T) + /* + * we can support larger than 64 bits if necessary, at the + * cost of higher stack usage. + */ + goto fail; + /* sanity checks */ if ((t < 1) || (m*t >= ((1 << m)-1))) /* invalid t value */ diff --git a/lib/bitmap.c b/lib/bitmap.c index 2fd07f6df0b8..eead55aa7170 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -36,11 +37,6 @@ * carefully filter out these unused bits from impacting their * results. * - * These operations actually hold to a slightly stronger rule: - * if you don't input any bitmaps to these ops that have some - * unused bits set, then they won't output any set unused bits - * in output bitmaps. - * * The byte ordering of bitmaps is more natural on little * endian architectures. See the big-endian headers * include/asm-ppc64/bitops.h and include/asm-s390/bitops.h @@ -466,20 +462,18 @@ EXPORT_SYMBOL(bitmap_parse_user); * ranges if list is specified or hex digits grouped into comma-separated * sets of 8 digits/set. Returns the number of characters written to buf. * - * It is assumed that @buf is a pointer into a PAGE_SIZE area and that - * sufficient storage remains at @buf to accommodate the - * bitmap_print_to_pagebuf() output. + * It is assumed that @buf is a pointer into a PAGE_SIZE, page-aligned + * area and that sufficient storage remains at @buf to accommodate the + * bitmap_print_to_pagebuf() output. Returns the number of characters + * actually printed to @buf, excluding terminating '\0'. */ int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits) { - ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; - int n = 0; + ptrdiff_t len = PAGE_SIZE - offset_in_page(buf); - if (len > 1) - n = list ? scnprintf(buf, len, "%*pbl\n", nmaskbits, maskp) : - scnprintf(buf, len, "%*pb\n", nmaskbits, maskp); - return n; + return list ? scnprintf(buf, len, "%*pbl\n", nmaskbits, maskp) : + scnprintf(buf, len, "%*pb\n", nmaskbits, maskp); } EXPORT_SYMBOL(bitmap_print_to_pagebuf); diff --git a/lib/chacha20.c b/lib/chacha20.c index c1cc50fb68c9..d907fec6a9ed 100644 --- a/lib/chacha20.c +++ b/lib/chacha20.c @@ -16,9 +16,9 @@ #include #include -void chacha20_block(u32 *state, u32 *stream) +void chacha20_block(u32 *state, u8 *stream) { - u32 x[16], *out = stream; + u32 x[16]; int i; for (i = 0; i < ARRAY_SIZE(x); i++) @@ -67,7 +67,7 @@ void chacha20_block(u32 *state, u32 *stream) } for (i = 0; i < ARRAY_SIZE(x); i++) - out[i] = cpu_to_le32(x[i] + state[i]); + put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]); state[12]++; } diff --git a/lib/cpumask.c b/lib/cpumask.c index beca6244671a..8d666ab84b5c 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -4,7 +4,7 @@ #include #include #include -#include +#include /** * cpumask_next - get the next cpu in a cpumask @@ -163,7 +163,7 @@ EXPORT_SYMBOL(zalloc_cpumask_var); */ void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) { - *mask = memblock_virt_alloc(cpumask_size(), 0); + *mask = memblock_alloc(cpumask_size(), SMP_CACHE_BYTES); } /** diff --git a/lib/crc-t10dif.c b/lib/crc-t10dif.c index 1ad33e555805..4d0d47c1ffbd 100644 --- a/lib/crc-t10dif.c +++ b/lib/crc-t10dif.c @@ -14,10 +14,47 @@ #include #include #include +#include #include +#include -static struct crypto_shash *crct10dif_tfm; +static struct crypto_shash __rcu *crct10dif_tfm; static struct static_key crct10dif_fallback __read_mostly; +static DEFINE_MUTEX(crc_t10dif_mutex); + +static int crc_t10dif_rehash(struct notifier_block *self, unsigned long val, void *data) +{ + struct crypto_alg *alg = data; + struct crypto_shash *new, *old; + + if (val != CRYPTO_MSG_ALG_LOADED || + static_key_false(&crct10dif_fallback) || + strncmp(alg->cra_name, CRC_T10DIF_STRING, strlen(CRC_T10DIF_STRING))) + return 0; + + mutex_lock(&crc_t10dif_mutex); + old = rcu_dereference_protected(crct10dif_tfm, + lockdep_is_held(&crc_t10dif_mutex)); + if (!old) { + mutex_unlock(&crc_t10dif_mutex); + return 0; + } + new = crypto_alloc_shash("crct10dif", 0, 0); + if (IS_ERR(new)) { + mutex_unlock(&crc_t10dif_mutex); + return 0; + } + rcu_assign_pointer(crct10dif_tfm, new); + mutex_unlock(&crc_t10dif_mutex); + + synchronize_rcu(); + crypto_free_shash(old); + return 0; +} + +static struct notifier_block crc_t10dif_nb = { + .notifier_call = crc_t10dif_rehash, +}; __u16 crc_t10dif_update(__u16 crc, const unsigned char *buffer, size_t len) { @@ -30,11 +67,14 @@ __u16 crc_t10dif_update(__u16 crc, const unsigned char *buffer, size_t len) if (static_key_false(&crct10dif_fallback)) return crc_t10dif_generic(crc, buffer, len); - desc.shash.tfm = crct10dif_tfm; + rcu_read_lock(); + desc.shash.tfm = rcu_dereference(crct10dif_tfm); desc.shash.flags = 0; *(__u16 *)desc.ctx = crc; err = crypto_shash_update(&desc.shash, buffer, len); + rcu_read_unlock(); + BUG_ON(err); return *(__u16 *)desc.ctx; @@ -49,6 +89,7 @@ EXPORT_SYMBOL(crc_t10dif); static int __init crc_t10dif_mod_init(void) { + crypto_register_notifier(&crc_t10dif_nb); crct10dif_tfm = crypto_alloc_shash("crct10dif", 0, 0); if (IS_ERR(crct10dif_tfm)) { static_key_slow_inc(&crct10dif_fallback); @@ -59,12 +100,24 @@ static int __init crc_t10dif_mod_init(void) static void __exit crc_t10dif_mod_fini(void) { + crypto_unregister_notifier(&crc_t10dif_nb); crypto_free_shash(crct10dif_tfm); } module_init(crc_t10dif_mod_init); module_exit(crc_t10dif_mod_fini); +static int crc_t10dif_transform_show(char *buffer, const struct kernel_param *kp) +{ + if (static_key_false(&crct10dif_fallback)) + return sprintf(buffer, "fallback\n"); + + return sprintf(buffer, "%s\n", + crypto_tfm_alg_driver_name(crypto_shash_tfm(crct10dif_tfm))); +} + +module_param_call(transform, NULL, crc_t10dif_transform_show, NULL, 0644); + MODULE_DESCRIPTION("T10 DIF CRC calculation"); MODULE_LICENSE("GPL"); MODULE_SOFTDEP("pre: crct10dif"); diff --git a/lib/crc32.c b/lib/crc32.c index a6c9afafc8c8..45b1d67a1767 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -183,21 +183,21 @@ static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, } #if CRC_LE_BITS == 1 -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, NULL, CRC32_POLY_LE); } -u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) +u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, NULL, CRC32C_POLY_LE); } #else -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, (const u32 (*)[256])crc32table_le, CRC32_POLY_LE); } -u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) +u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, (const u32 (*)[256])crc32ctable_le, CRC32C_POLY_LE); @@ -206,6 +206,9 @@ u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) EXPORT_SYMBOL(crc32_le); EXPORT_SYMBOL(__crc32c_le); +u32 crc32_le_base(u32, unsigned char const *, size_t) __alias(crc32_le); +u32 __crc32c_le_base(u32, unsigned char const *, size_t) __alias(__crc32c_le); + /* * This multiplies the polynomials x and y modulo the given modulus. * This follows the "little-endian" CRC convention that the lsbit diff --git a/lib/debug_locks.c b/lib/debug_locks.c index 96c4c633d95e..ce51749cc145 100644 --- a/lib/debug_locks.c +++ b/lib/debug_locks.c @@ -21,7 +21,7 @@ * that would just muddy the log. So we report the first one and * shut up after that. */ -int debug_locks = 1; +int debug_locks __read_mostly = 1; EXPORT_SYMBOL_GPL(debug_locks); /* @@ -29,7 +29,7 @@ EXPORT_SYMBOL_GPL(debug_locks); * 'silent failure': nothing is printed to the console when * a locking bug is detected. */ -int debug_locks_silent; +int debug_locks_silent __read_mostly; EXPORT_SYMBOL_GPL(debug_locks_silent); /* @@ -37,7 +37,7 @@ EXPORT_SYMBOL_GPL(debug_locks_silent); */ int debug_locks_off(void) { - if (__debug_locks_off()) { + if (debug_locks && __debug_locks_off()) { if (!debug_locks_silent) { console_verbose(); return 1; diff --git a/lib/idr.c b/lib/idr.c index fab2fd5bc326..cb1db9b8d3f6 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -6,8 +6,6 @@ #include #include -DEFINE_PER_CPU(struct ida_bitmap *, ida_bitmap); - /** * idr_alloc_u32() - Allocate an ID. * @idr: IDR handle. @@ -39,10 +37,8 @@ int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid, unsigned int base = idr->idr_base; unsigned int id = *nextid; - if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr))) - return -EINVAL; - if (WARN_ON_ONCE(!(idr->idr_rt.gfp_mask & ROOT_IS_IDR))) - idr->idr_rt.gfp_mask |= IDR_RT_MARKER; + if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR))) + idr->idr_rt.xa_flags |= IDR_RT_MARKER; id = (id < base) ? 0 : id - base; radix_tree_iter_init(&iter, id); @@ -295,15 +291,13 @@ void *idr_replace(struct idr *idr, void *ptr, unsigned long id) void __rcu **slot = NULL; void *entry; - if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr))) - return ERR_PTR(-EINVAL); id -= idr->idr_base; entry = __radix_tree_lookup(&idr->idr_rt, id, &node, &slot); if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE)) return ERR_PTR(-ENOENT); - __radix_tree_replace(&idr->idr_rt, node, slot, ptr, NULL); + __radix_tree_replace(&idr->idr_rt, node, slot, ptr); return entry; } @@ -324,6 +318,9 @@ EXPORT_SYMBOL(idr_replace); * free the individual IDs in it. You can use ida_is_empty() to find * out whether the IDA has any IDs currently allocated. * + * The IDA handles its own locking. It is safe to call any of the IDA + * functions without synchronisation in your code. + * * IDs are currently limited to the range [0-INT_MAX]. If this is an awkward * limitation, it should be quite straightforward to raise the maximum. */ @@ -331,161 +328,197 @@ EXPORT_SYMBOL(idr_replace); /* * Developer's notes: * - * The IDA uses the functionality provided by the IDR & radix tree to store - * bitmaps in each entry. The IDR_FREE tag means there is at least one bit - * free, unlike the IDR where it means at least one entry is free. + * The IDA uses the functionality provided by the XArray to store bitmaps in + * each entry. The XA_FREE_MARK is only cleared when all bits in the bitmap + * have been set. * - * I considered telling the radix tree that each slot is an order-10 node - * and storing the bit numbers in the radix tree, but the radix tree can't - * allow a single multiorder entry at index 0, which would significantly - * increase memory consumption for the IDA. So instead we divide the index - * by the number of bits in the leaf bitmap before doing a radix tree lookup. + * I considered telling the XArray that each slot is an order-10 node + * and indexing by bit number, but the XArray can't allow a single multi-index + * entry in the head, which would significantly increase memory consumption + * for the IDA. So instead we divide the index by the number of bits in the + * leaf bitmap before doing a radix tree lookup. * * As an optimisation, if there are only a few low bits set in any given - * leaf, instead of allocating a 128-byte bitmap, we use the 'exceptional - * entry' functionality of the radix tree to store BITS_PER_LONG - 2 bits - * directly in the entry. By being really tricksy, we could store - * BITS_PER_LONG - 1 bits, but there're diminishing returns after optimising - * for 0-3 allocated IDs. - * - * We allow the radix tree 'exceptional' count to get out of date. Nothing - * in the IDA nor the radix tree code checks it. If it becomes important - * to maintain an accurate exceptional count, switch the rcu_assign_pointer() - * calls to radix_tree_iter_replace() which will correct the exceptional - * count. - * - * The IDA always requires a lock to alloc/free. If we add a 'test_bit' + * leaf, instead of allocating a 128-byte bitmap, we store the bits + * as a value entry. Value entries never have the XA_FREE_MARK cleared + * because we can always convert them into a bitmap entry. + * + * It would be possible to optimise further; once we've run out of a + * single 128-byte bitmap, we currently switch to a 576-byte node, put + * the 128-byte bitmap in the first entry and then start allocating extra + * 128-byte entries. We could instead use the 512 bytes of the node's + * data as a bitmap before moving to that scheme. I do not believe this + * is a worthwhile optimisation; Rasmus Villemoes surveyed the current + * users of the IDA and almost none of them use more than 1024 entries. + * Those that do use more than the 8192 IDs that the 512 bytes would + * provide. + * + * The IDA always uses a lock to alloc/free. If we add a 'test_bit' * equivalent, it will still need locking. Going to RCU lookup would require * using RCU to free bitmaps, and that's not trivial without embedding an * RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte * bitmap, which is excessive. */ -#define IDA_MAX (0x80000000U / IDA_BITMAP_BITS - 1) - -static int ida_get_new_above(struct ida *ida, int start) +/** + * ida_alloc_range() - Allocate an unused ID. + * @ida: IDA handle. + * @min: Lowest ID to allocate. + * @max: Highest ID to allocate. + * @gfp: Memory allocation flags. + * + * Allocate an ID between @min and @max, inclusive. The allocated ID will + * not exceed %INT_MAX, even if @max is larger. + * + * Context: Any context. + * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, + * or %-ENOSPC if there are no free IDs. + */ +int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max, + gfp_t gfp) { - struct radix_tree_root *root = &ida->ida_rt; - void __rcu **slot; - struct radix_tree_iter iter; - struct ida_bitmap *bitmap; - unsigned long index; - unsigned bit, ebit; - int new; - - index = start / IDA_BITMAP_BITS; - bit = start % IDA_BITMAP_BITS; - ebit = bit + RADIX_TREE_EXCEPTIONAL_SHIFT; - - slot = radix_tree_iter_init(&iter, index); - for (;;) { - if (slot) - slot = radix_tree_next_slot(slot, &iter, - RADIX_TREE_ITER_TAGGED); - if (!slot) { - slot = idr_get_free(root, &iter, GFP_NOWAIT, IDA_MAX); - if (IS_ERR(slot)) { - if (slot == ERR_PTR(-ENOMEM)) - return -EAGAIN; - return PTR_ERR(slot); + XA_STATE(xas, &ida->xa, min / IDA_BITMAP_BITS); + unsigned bit = min % IDA_BITMAP_BITS; + unsigned long flags; + struct ida_bitmap *bitmap, *alloc = NULL; + + if ((int)min < 0) + return -ENOSPC; + + if ((int)max < 0) + max = INT_MAX; + +retry: + xas_lock_irqsave(&xas, flags); +next: + bitmap = xas_find_marked(&xas, max / IDA_BITMAP_BITS, XA_FREE_MARK); + if (xas.xa_index > min / IDA_BITMAP_BITS) + bit = 0; + if (xas.xa_index * IDA_BITMAP_BITS + bit > max) + goto nospc; + + if (xa_is_value(bitmap)) { + unsigned long tmp = xa_to_value(bitmap); + + if (bit < BITS_PER_XA_VALUE) { + bit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, bit); + if (xas.xa_index * IDA_BITMAP_BITS + bit > max) + goto nospc; + if (bit < BITS_PER_XA_VALUE) { + tmp |= 1UL << bit; + xas_store(&xas, xa_mk_value(tmp)); + goto out; } } - if (iter.index > index) { - bit = 0; - ebit = RADIX_TREE_EXCEPTIONAL_SHIFT; - } - new = iter.index * IDA_BITMAP_BITS; - bitmap = rcu_dereference_raw(*slot); - if (radix_tree_exception(bitmap)) { - unsigned long tmp = (unsigned long)bitmap; - ebit = find_next_zero_bit(&tmp, BITS_PER_LONG, ebit); - if (ebit < BITS_PER_LONG) { - tmp |= 1UL << ebit; - rcu_assign_pointer(*slot, (void *)tmp); - return new + ebit - - RADIX_TREE_EXCEPTIONAL_SHIFT; - } - bitmap = this_cpu_xchg(ida_bitmap, NULL); - if (!bitmap) - return -EAGAIN; - bitmap->bitmap[0] = tmp >> RADIX_TREE_EXCEPTIONAL_SHIFT; - rcu_assign_pointer(*slot, bitmap); + bitmap = alloc; + if (!bitmap) + bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT); + if (!bitmap) + goto alloc; + bitmap->bitmap[0] = tmp; + xas_store(&xas, bitmap); + if (xas_error(&xas)) { + bitmap->bitmap[0] = 0; + goto out; } + } - if (bitmap) { - bit = find_next_zero_bit(bitmap->bitmap, - IDA_BITMAP_BITS, bit); - new += bit; - if (new < 0) - return -ENOSPC; - if (bit == IDA_BITMAP_BITS) - continue; + if (bitmap) { + bit = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, bit); + if (xas.xa_index * IDA_BITMAP_BITS + bit > max) + goto nospc; + if (bit == IDA_BITMAP_BITS) + goto next; - __set_bit(bit, bitmap->bitmap); - if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS)) - radix_tree_iter_tag_clear(root, &iter, - IDR_FREE); + __set_bit(bit, bitmap->bitmap); + if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS)) + xas_clear_mark(&xas, XA_FREE_MARK); + } else { + if (bit < BITS_PER_XA_VALUE) { + bitmap = xa_mk_value(1UL << bit); } else { - new += bit; - if (new < 0) - return -ENOSPC; - if (ebit < BITS_PER_LONG) { - bitmap = (void *)((1UL << ebit) | - RADIX_TREE_EXCEPTIONAL_ENTRY); - radix_tree_iter_replace(root, &iter, slot, - bitmap); - return new; - } - bitmap = this_cpu_xchg(ida_bitmap, NULL); + bitmap = alloc; if (!bitmap) - return -EAGAIN; + bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT); + if (!bitmap) + goto alloc; __set_bit(bit, bitmap->bitmap); - radix_tree_iter_replace(root, &iter, slot, bitmap); } - - return new; + xas_store(&xas, bitmap); + } +out: + xas_unlock_irqrestore(&xas, flags); + if (xas_nomem(&xas, gfp)) { + xas.xa_index = min / IDA_BITMAP_BITS; + bit = min % IDA_BITMAP_BITS; + goto retry; } + if (bitmap != alloc) + kfree(alloc); + if (xas_error(&xas)) + return xas_error(&xas); + return xas.xa_index * IDA_BITMAP_BITS + bit; +alloc: + xas_unlock_irqrestore(&xas, flags); + alloc = kzalloc(sizeof(*bitmap), gfp); + if (!alloc) + return -ENOMEM; + xas_set(&xas, min / IDA_BITMAP_BITS); + bit = min % IDA_BITMAP_BITS; + goto retry; +nospc: + xas_unlock_irqrestore(&xas, flags); + return -ENOSPC; } +EXPORT_SYMBOL(ida_alloc_range); -static void ida_remove(struct ida *ida, int id) +/** + * ida_free() - Release an allocated ID. + * @ida: IDA handle. + * @id: Previously allocated ID. + * + * Context: Any context. + */ +void ida_free(struct ida *ida, unsigned int id) { - unsigned long index = id / IDA_BITMAP_BITS; - unsigned offset = id % IDA_BITMAP_BITS; + XA_STATE(xas, &ida->xa, id / IDA_BITMAP_BITS); + unsigned bit = id % IDA_BITMAP_BITS; struct ida_bitmap *bitmap; - unsigned long *btmp; - struct radix_tree_iter iter; - void __rcu **slot; + unsigned long flags; - slot = radix_tree_iter_lookup(&ida->ida_rt, &iter, index); - if (!slot) - goto err; + BUG_ON((int)id < 0); + + xas_lock_irqsave(&xas, flags); + bitmap = xas_load(&xas); - bitmap = rcu_dereference_raw(*slot); - if (radix_tree_exception(bitmap)) { - btmp = (unsigned long *)slot; - offset += RADIX_TREE_EXCEPTIONAL_SHIFT; - if (offset >= BITS_PER_LONG) + if (xa_is_value(bitmap)) { + unsigned long v = xa_to_value(bitmap); + if (bit >= BITS_PER_XA_VALUE) goto err; + if (!(v & (1UL << bit))) + goto err; + v &= ~(1UL << bit); + if (!v) + goto delete; + xas_store(&xas, xa_mk_value(v)); } else { - btmp = bitmap->bitmap; - } - if (!test_bit(offset, btmp)) - goto err; - - __clear_bit(offset, btmp); - radix_tree_iter_tag_set(&ida->ida_rt, &iter, IDR_FREE); - if (radix_tree_exception(bitmap)) { - if (rcu_dereference_raw(*slot) == - (void *)RADIX_TREE_EXCEPTIONAL_ENTRY) - radix_tree_iter_delete(&ida->ida_rt, &iter, slot); - } else if (bitmap_empty(btmp, IDA_BITMAP_BITS)) { - kfree(bitmap); - radix_tree_iter_delete(&ida->ida_rt, &iter, slot); + if (!test_bit(bit, bitmap->bitmap)) + goto err; + __clear_bit(bit, bitmap->bitmap); + xas_set_mark(&xas, XA_FREE_MARK); + if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) { + kfree(bitmap); +delete: + xas_store(&xas, NULL); + } } + xas_unlock_irqrestore(&xas, flags); return; err: + xas_unlock_irqrestore(&xas, flags); WARN(1, "ida_free called for id=%d which is not allocated.\n", id); } +EXPORT_SYMBOL(ida_free); /** * ida_destroy() - Free all IDs. @@ -500,80 +533,60 @@ static void ida_remove(struct ida *ida, int id) */ void ida_destroy(struct ida *ida) { + XA_STATE(xas, &ida->xa, 0); + struct ida_bitmap *bitmap; unsigned long flags; - struct radix_tree_iter iter; - void __rcu **slot; - xa_lock_irqsave(&ida->ida_rt, flags); - radix_tree_for_each_slot(slot, &ida->ida_rt, &iter, 0) { - struct ida_bitmap *bitmap = rcu_dereference_raw(*slot); - if (!radix_tree_exception(bitmap)) + xas_lock_irqsave(&xas, flags); + xas_for_each(&xas, bitmap, ULONG_MAX) { + if (!xa_is_value(bitmap)) kfree(bitmap); - radix_tree_iter_delete(&ida->ida_rt, &iter, slot); + xas_store(&xas, NULL); } - xa_unlock_irqrestore(&ida->ida_rt, flags); + xas_unlock_irqrestore(&xas, flags); } EXPORT_SYMBOL(ida_destroy); -/** - * ida_alloc_range() - Allocate an unused ID. - * @ida: IDA handle. - * @min: Lowest ID to allocate. - * @max: Highest ID to allocate. - * @gfp: Memory allocation flags. - * - * Allocate an ID between @min and @max, inclusive. The allocated ID will - * not exceed %INT_MAX, even if @max is larger. - * - * Context: Any context. - * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, - * or %-ENOSPC if there are no free IDs. - */ -int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max, - gfp_t gfp) -{ - int id = 0; - unsigned long flags; +#ifndef __KERNEL__ +extern void xa_dump_index(unsigned long index, unsigned int shift); +#define IDA_CHUNK_SHIFT ilog2(IDA_BITMAP_BITS) - if ((int)min < 0) - return -ENOSPC; - - if ((int)max < 0) - max = INT_MAX; - -again: - xa_lock_irqsave(&ida->ida_rt, flags); - id = ida_get_new_above(ida, min); - if (id > (int)max) { - ida_remove(ida, id); - id = -ENOSPC; - } - xa_unlock_irqrestore(&ida->ida_rt, flags); +static void ida_dump_entry(void *entry, unsigned long index) +{ + unsigned long i; + + if (!entry) + return; + + if (xa_is_node(entry)) { + struct xa_node *node = xa_to_node(entry); + unsigned int shift = node->shift + IDA_CHUNK_SHIFT + + XA_CHUNK_SHIFT; + + xa_dump_index(index * IDA_BITMAP_BITS, shift); + xa_dump_node(node); + for (i = 0; i < XA_CHUNK_SIZE; i++) + ida_dump_entry(node->slots[i], + index | (i << node->shift)); + } else if (xa_is_value(entry)) { + xa_dump_index(index * IDA_BITMAP_BITS, ilog2(BITS_PER_LONG)); + pr_cont("value: data %lx [%px]\n", xa_to_value(entry), entry); + } else { + struct ida_bitmap *bitmap = entry; - if (unlikely(id == -EAGAIN)) { - if (!ida_pre_get(ida, gfp)) - return -ENOMEM; - goto again; + xa_dump_index(index * IDA_BITMAP_BITS, IDA_CHUNK_SHIFT); + pr_cont("bitmap: %p data", bitmap); + for (i = 0; i < IDA_BITMAP_LONGS; i++) + pr_cont(" %lx", bitmap->bitmap[i]); + pr_cont("\n"); } - - return id; } -EXPORT_SYMBOL(ida_alloc_range); -/** - * ida_free() - Release an allocated ID. - * @ida: IDA handle. - * @id: Previously allocated ID. - * - * Context: Any context. - */ -void ida_free(struct ida *ida, unsigned int id) +static void ida_dump(struct ida *ida) { - unsigned long flags; - - BUG_ON((int)id < 0); - xa_lock_irqsave(&ida->ida_rt, flags); - ida_remove(ida, id); - xa_unlock_irqrestore(&ida->ida_rt, flags); + struct xarray *xa = &ida->xa; + pr_debug("ida: %p node %p free %d\n", ida, xa->xa_head, + xa->xa_flags >> ROOT_TAG_SHIFT); + ida_dump_entry(xa->xa_head, 0); } -EXPORT_SYMBOL(ida_free); +#endif diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 8be175df3075..7ebccb5c1637 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -83,6 +83,7 @@ const struct kvec *kvec; \ struct kvec v; \ iterate_kvec(i, n, v, kvec, skip, (K)) \ + } else if (unlikely(i->type & ITER_DISCARD)) { \ } else { \ const struct iovec *iov; \ struct iovec v; \ @@ -114,6 +115,8 @@ } \ i->nr_segs -= kvec - i->kvec; \ i->kvec = kvec; \ + } else if (unlikely(i->type & ITER_DISCARD)) { \ + skip += n; \ } else { \ const struct iovec *iov; \ struct iovec v; \ @@ -428,17 +431,19 @@ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) } EXPORT_SYMBOL(iov_iter_fault_in_readable); -void iov_iter_init(struct iov_iter *i, int direction, +void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) { + WARN_ON(direction & ~(READ | WRITE)); + direction &= READ | WRITE; + /* It will get better. Eventually... */ if (uaccess_kernel()) { - direction |= ITER_KVEC; - i->type = direction; + i->type = ITER_KVEC | direction; i->kvec = (struct kvec *)iov; } else { - i->type = direction; + i->type = ITER_IOVEC | direction; i->iov = iov; } i->nr_segs = nr_segs; @@ -558,7 +563,7 @@ static size_t copy_pipe_to_iter(const void *addr, size_t bytes, size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { const char *from = addr; - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return copy_pipe_to_iter(addr, bytes, i); if (iter_is_iovec(i)) might_fault(); @@ -658,7 +663,7 @@ size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i) const char *from = addr; unsigned long rem, curr_addr, s_addr = (unsigned long) addr; - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return copy_pipe_to_iter_mcsafe(addr, bytes, i); if (iter_is_iovec(i)) might_fault(); @@ -692,7 +697,7 @@ EXPORT_SYMBOL_GPL(_copy_to_iter_mcsafe); size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return 0; } @@ -712,7 +717,7 @@ EXPORT_SYMBOL(_copy_from_iter); bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return false; } @@ -739,7 +744,7 @@ EXPORT_SYMBOL(_copy_from_iter_full); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return 0; } @@ -773,7 +778,7 @@ EXPORT_SYMBOL(_copy_from_iter_nocache); size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return 0; } @@ -794,7 +799,7 @@ EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return false; } @@ -836,7 +841,9 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, size_t wanted = copy_to_iter(kaddr + offset, bytes, i); kunmap_atomic(kaddr); return wanted; - } else if (likely(!(i->type & ITER_PIPE))) + } else if (unlikely(iov_iter_is_discard(i))) + return bytes; + else if (likely(!iov_iter_is_pipe(i))) return copy_page_to_iter_iovec(page, offset, bytes, i); else return copy_page_to_iter_pipe(page, offset, bytes, i); @@ -848,7 +855,7 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, { if (unlikely(!page_copy_sane(page, offset, bytes))) return 0; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { WARN_ON(1); return 0; } @@ -888,7 +895,7 @@ static size_t pipe_zero(size_t bytes, struct iov_iter *i) size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return pipe_zero(bytes, i); iterate_and_advance(i, bytes, v, clear_user(v.iov_base, v.iov_len), @@ -908,7 +915,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, kunmap_atomic(kaddr); return 0; } - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { kunmap_atomic(kaddr); WARN_ON(1); return 0; @@ -972,10 +979,14 @@ static void pipe_advance(struct iov_iter *i, size_t size) void iov_iter_advance(struct iov_iter *i, size_t size) { - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { pipe_advance(i, size); return; } + if (unlikely(iov_iter_is_discard(i))) { + i->count -= size; + return; + } iterate_and_advance(i, size, v, 0, 0, 0) } EXPORT_SYMBOL(iov_iter_advance); @@ -987,7 +998,7 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) if (WARN_ON(unroll > MAX_RW_COUNT)) return; i->count += unroll; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { struct pipe_inode_info *pipe = i->pipe; int idx = i->idx; size_t off = i->iov_offset; @@ -1011,12 +1022,14 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) pipe_truncate(i); return; } + if (unlikely(iov_iter_is_discard(i))) + return; if (unroll <= i->iov_offset) { i->iov_offset -= unroll; return; } unroll -= i->iov_offset; - if (i->type & ITER_BVEC) { + if (iov_iter_is_bvec(i)) { const struct bio_vec *bvec = i->bvec; while (1) { size_t n = (--bvec)->bv_len; @@ -1049,23 +1062,25 @@ EXPORT_SYMBOL(iov_iter_revert); */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return i->count; // it is a silly place, anyway if (i->nr_segs == 1) return i->count; - else if (i->type & ITER_BVEC) + if (unlikely(iov_iter_is_discard(i))) + return i->count; + else if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); else return min(i->count, i->iov->iov_len - i->iov_offset); } EXPORT_SYMBOL(iov_iter_single_seg_count); -void iov_iter_kvec(struct iov_iter *i, int direction, +void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count) { - BUG_ON(!(direction & ITER_KVEC)); - i->type = direction; + WARN_ON(direction & ~(READ | WRITE)); + i->type = ITER_KVEC | (direction & (READ | WRITE)); i->kvec = kvec; i->nr_segs = nr_segs; i->iov_offset = 0; @@ -1073,12 +1088,12 @@ void iov_iter_kvec(struct iov_iter *i, int direction, } EXPORT_SYMBOL(iov_iter_kvec); -void iov_iter_bvec(struct iov_iter *i, int direction, +void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count) { - BUG_ON(!(direction & ITER_BVEC)); - i->type = direction; + WARN_ON(direction & ~(READ | WRITE)); + i->type = ITER_BVEC | (direction & (READ | WRITE)); i->bvec = bvec; i->nr_segs = nr_segs; i->iov_offset = 0; @@ -1086,13 +1101,13 @@ void iov_iter_bvec(struct iov_iter *i, int direction, } EXPORT_SYMBOL(iov_iter_bvec); -void iov_iter_pipe(struct iov_iter *i, int direction, +void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe, size_t count) { - BUG_ON(direction != ITER_PIPE); + BUG_ON(direction != READ); WARN_ON(pipe->nrbufs == pipe->buffers); - i->type = direction; + i->type = ITER_PIPE | READ; i->pipe = pipe; i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); i->iov_offset = 0; @@ -1101,12 +1116,30 @@ void iov_iter_pipe(struct iov_iter *i, int direction, } EXPORT_SYMBOL(iov_iter_pipe); +/** + * iov_iter_discard - Initialise an I/O iterator that discards data + * @i: The iterator to initialise. + * @direction: The direction of the transfer. + * @count: The size of the I/O buffer in bytes. + * + * Set up an I/O iterator that just discards everything that's written to it. + * It's only available as a READ iterator. + */ +void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) +{ + BUG_ON(direction != READ); + i->type = ITER_DISCARD | READ; + i->count = count; + i->iov_offset = 0; +} +EXPORT_SYMBOL(iov_iter_discard); + unsigned long iov_iter_alignment(const struct iov_iter *i) { unsigned long res = 0; size_t size = i->count; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { if (size && i->iov_offset && allocated(&i->pipe->bufs[i->idx])) return size | i->iov_offset; return size; @@ -1125,7 +1158,7 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) unsigned long res = 0; size_t size = i->count; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { WARN_ON(1); return ~0U; } @@ -1193,8 +1226,11 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, if (maxsize > i->count) maxsize = i->count; - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return pipe_get_pages(i, pages, maxsize, maxpages, start); + if (unlikely(iov_iter_is_discard(i))) + return -EFAULT; + iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); @@ -1205,7 +1241,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, len = maxpages * PAGE_SIZE; addr &= ~(PAGE_SIZE - 1); n = DIV_ROUND_UP(len, PAGE_SIZE); - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); + res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, pages); if (unlikely(res < 0)) return res; return (res == n ? len : res * PAGE_SIZE) - *start; @@ -1270,8 +1306,11 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (maxsize > i->count) maxsize = i->count; - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return pipe_get_pages_alloc(i, pages, maxsize, start); + if (unlikely(iov_iter_is_discard(i))) + return -EFAULT; + iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); @@ -1283,7 +1322,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, p = get_pages_array(n); if (!p) return -ENOMEM; - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); + res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, p); if (unlikely(res < 0)) { kvfree(p); return res; @@ -1313,7 +1352,7 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { WARN_ON(1); return 0; } @@ -1355,7 +1394,7 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { WARN_ON(1); return false; } @@ -1400,7 +1439,7 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { WARN_ON(1); /* for now */ return 0; } @@ -1442,8 +1481,10 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) if (!size) return 0; + if (unlikely(iov_iter_is_discard(i))) + return 0; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { struct pipe_inode_info *pipe = i->pipe; size_t off; int idx; @@ -1481,11 +1522,13 @@ EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; - if (unlikely(new->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(new))) { WARN_ON(1); return NULL; } - if (new->type & ITER_BVEC) + if (unlikely(iov_iter_is_discard(new))) + return NULL; + if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), flags); diff --git a/lib/kstrtox.c b/lib/kstrtox.c index 661a1e807bd1..1006bf70bf74 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -175,7 +175,7 @@ int _kstrtoul(const char *s, unsigned int base, unsigned long *res) rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; - if (tmp != (unsigned long long)(unsigned long)tmp) + if (tmp != (unsigned long)tmp) return -ERANGE; *res = tmp; return 0; @@ -191,7 +191,7 @@ int _kstrtol(const char *s, unsigned int base, long *res) rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; - if (tmp != (long long)(long)tmp) + if (tmp != (long)tmp) return -ERANGE; *res = tmp; return 0; @@ -222,7 +222,7 @@ int kstrtouint(const char *s, unsigned int base, unsigned int *res) rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; - if (tmp != (unsigned long long)(unsigned int)tmp) + if (tmp != (unsigned int)tmp) return -ERANGE; *res = tmp; return 0; @@ -253,7 +253,7 @@ int kstrtoint(const char *s, unsigned int base, int *res) rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; - if (tmp != (long long)(int)tmp) + if (tmp != (int)tmp) return -ERANGE; *res = tmp; return 0; @@ -268,7 +268,7 @@ int kstrtou16(const char *s, unsigned int base, u16 *res) rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; - if (tmp != (unsigned long long)(u16)tmp) + if (tmp != (u16)tmp) return -ERANGE; *res = tmp; return 0; @@ -283,7 +283,7 @@ int kstrtos16(const char *s, unsigned int base, s16 *res) rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; - if (tmp != (long long)(s16)tmp) + if (tmp != (s16)tmp) return -ERANGE; *res = tmp; return 0; @@ -298,7 +298,7 @@ int kstrtou8(const char *s, unsigned int base, u8 *res) rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; - if (tmp != (unsigned long long)(u8)tmp) + if (tmp != (u8)tmp) return -ERANGE; *res = tmp; return 0; @@ -313,7 +313,7 @@ int kstrtos8(const char *s, unsigned int base, s8 *res) rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; - if (tmp != (long long)(s8)tmp) + if (tmp != (s8)tmp) return -ERANGE; *res = tmp; return 0; diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index 141734d255e4..0c9d3ad17e0f 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -43,30 +43,36 @@ /*-***************************** * Decompression functions *******************************/ -/* LZ4_decompress_generic() : - * This generic decompression function cover all use cases. - * It shall be instantiated several times, using different sets of directives - * Note that it is important this generic function is really inlined, + +#define DEBUGLOG(l, ...) {} /* disabled */ + +#ifndef assert +#define assert(condition) ((void)0) +#endif + +/* + * LZ4_decompress_generic() : + * This generic decompression function covers all use cases. + * It shall be instantiated several times, using different sets of directives. + * Note that it is important for performance that this function really get inlined, * in order to remove useless branches during compilation optimization. */ static FORCE_INLINE int LZ4_decompress_generic( - const char * const source, - char * const dest, - int inputSize, + const char * const src, + char * const dst, + int srcSize, /* * If endOnInput == endOnInputSize, - * this value is the max size of Output Buffer. + * this value is `dstCapacity` */ int outputSize, /* endOnOutputSize, endOnInputSize */ - int endOnInput, + endCondition_directive endOnInput, /* full, partial */ - int partialDecoding, - /* only used if partialDecoding == partial */ - int targetOutputSize, + earlyEnd_directive partialDecoding, /* noDict, withPrefix64k, usingExtDict */ - int dict, - /* == dest when no prefix */ + dict_directive dict, + /* always <= dst, == dst when no prefix */ const BYTE * const lowPrefix, /* only if dict == usingExtDict */ const BYTE * const dictStart, @@ -74,35 +80,43 @@ static FORCE_INLINE int LZ4_decompress_generic( const size_t dictSize ) { - /* Local Variables */ - const BYTE *ip = (const BYTE *) source; - const BYTE * const iend = ip + inputSize; + const BYTE *ip = (const BYTE *) src; + const BYTE * const iend = ip + srcSize; - BYTE *op = (BYTE *) dest; + BYTE *op = (BYTE *) dst; BYTE * const oend = op + outputSize; BYTE *cpy; - BYTE *oexit = op + targetOutputSize; - const BYTE * const lowLimit = lowPrefix - dictSize; const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize; - static const unsigned int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; - static const int dec64table[] = { 0, 0, 0, -1, 0, 1, 2, 3 }; + static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4}; + static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3}; const int safeDecode = (endOnInput == endOnInputSize); const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB))); + /* Set up the "end" pointers for the shortcut. */ + const BYTE *const shortiend = iend - + (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/; + const BYTE *const shortoend = oend - + (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/; + + DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__, + srcSize, outputSize); + /* Special cases */ - /* targetOutputSize too high => decode everything */ - if ((partialDecoding) && (oexit > oend - MFLIMIT)) - oexit = oend - MFLIMIT; + assert(lowPrefix <= op); + assert(src != NULL); /* Empty output buffer */ if ((endOnInput) && (unlikely(outputSize == 0))) - return ((inputSize == 1) && (*ip == 0)) ? 0 : -1; + return ((srcSize == 1) && (*ip == 0)) ? 0 : -1; if ((!endOnInput) && (unlikely(outputSize == 0))) return (*ip == 0 ? 1 : -1); + if ((endOnInput) && unlikely(srcSize == 0)) + return -1; + /* Main Loop : decode sequences */ while (1) { size_t length; @@ -111,12 +125,74 @@ static FORCE_INLINE int LZ4_decompress_generic( /* get literal length */ unsigned int const token = *ip++; - length = token>>ML_BITS; + /* ip < iend before the increment */ + assert(!endOnInput || ip <= iend); + + /* + * A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough + * space, enter the shortcut and copy 16 bytes on behalf + * of the literals (in the fast mode, only 8 bytes can be + * safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes + * in a similar manner; but we ensure that there's enough + * space in the output for those 18 bytes earlier, upon + * entering the shortcut (in other words, there is a + * combined check for both stages). + */ + if ((endOnInput ? length != RUN_MASK : length <= 8) + /* + * strictly "less than" on input, to re-enter + * the loop with at least one byte + */ + && likely((endOnInput ? ip < shortiend : 1) & + (op <= shortoend))) { + /* Copy the literals */ + memcpy(op, ip, endOnInput ? 16 : 8); + op += length; ip += length; + + /* + * The second stage: + * prepare for match copying, decode full info. + * If it doesn't work out, the info won't be wasted. + */ + length = token & ML_MASK; /* match length */ + offset = LZ4_readLE16(ip); + ip += 2; + match = op - offset; + assert(match <= op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ((length != ML_MASK) && + (offset >= 8) && + (dict == withPrefix64k || match >= lowPrefix)) { + /* Copy the match. */ + memcpy(op + 0, match + 0, 8); + memcpy(op + 8, match + 8, 8); + memcpy(op + 16, match + 16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* + * The second stage didn't work out, but the info + * is ready. Propel it right to the point of match + * copying. + */ + goto _copy_match; + } + + /* decode literal length */ if (length == RUN_MASK) { unsigned int s; + if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) { + /* overflow detection */ + goto _output_error; + } do { s = *ip++; length += s; @@ -125,14 +201,14 @@ static FORCE_INLINE int LZ4_decompress_generic( : 1) & (s == 255)); if ((safeDecode) - && unlikely( - (size_t)(op + length) < (size_t)(op))) { + && unlikely((uptrval)(op) + + length < (uptrval)(op))) { /* overflow detection */ goto _output_error; } if ((safeDecode) - && unlikely( - (size_t)(ip + length) < (size_t)(ip))) { + && unlikely((uptrval)(ip) + + length < (uptrval)(ip))) { /* overflow detection */ goto _output_error; } @@ -140,16 +216,19 @@ static FORCE_INLINE int LZ4_decompress_generic( /* copy literals */ cpy = op + length; - if (((endOnInput) && ((cpy > (partialDecoding ? oexit : oend - MFLIMIT)) + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + + if (((endOnInput) && ((cpy > oend - MFLIMIT) || (ip + length > iend - (2 + 1 + LASTLITERALS)))) || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) { if (partialDecoding) { if (cpy > oend) { /* - * Error : - * write attempt beyond end of output buffer + * Partial decoding : + * stop in the middle of literal segment */ - goto _output_error; + cpy = oend; + length = oend - op; } if ((endOnInput) && (ip + length > iend)) { @@ -184,29 +263,43 @@ static FORCE_INLINE int LZ4_decompress_generic( memcpy(op, ip, length); ip += length; op += length; + /* Necessarily EOF, due to parsing restrictions */ - break; + if (!partialDecoding || (cpy == oend)) + break; + } else { + /* may overwrite up to WILDCOPYLENGTH beyond cpy */ + LZ4_wildCopy(op, ip, cpy); + ip += length; + op = cpy; } - LZ4_wildCopy(op, ip, cpy); - ip += length; - op = cpy; - /* get offset */ offset = LZ4_readLE16(ip); ip += 2; match = op - offset; - if ((checkOffset) && (unlikely(match < lowLimit))) { + /* get matchlength */ + length = token & ML_MASK; + +_copy_match: + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) { /* Error : offset outside buffers */ goto _output_error; } /* costs ~1%; silence an msan warning when offset == 0 */ - LZ4_write32(op, (U32)offset); + /* + * note : when partialDecoding, there is no guarantee that + * at least 4 bytes remain available in output buffer + */ + if (!partialDecoding) { + assert(oend > op); + assert(oend - op >= 4); + + LZ4_write32(op, (U32)offset); + } - /* get matchlength */ - length = token & ML_MASK; if (length == ML_MASK) { unsigned int s; @@ -221,7 +314,7 @@ static FORCE_INLINE int LZ4_decompress_generic( if ((safeDecode) && unlikely( - (size_t)(op + length) < (size_t)op)) { + (uptrval)(op) + length < (uptrval)op)) { /* overflow detection */ goto _output_error; } @@ -229,24 +322,26 @@ static FORCE_INLINE int LZ4_decompress_generic( length += MINMATCH; - /* check external dictionary */ + /* match starting within external dictionary */ if ((dict == usingExtDict) && (match < lowPrefix)) { if (unlikely(op + length > oend - LASTLITERALS)) { /* doesn't respect parsing restriction */ - goto _output_error; + if (!partialDecoding) + goto _output_error; + length = min(length, (size_t)(oend - op)); } if (length <= (size_t)(lowPrefix - match)) { /* - * match can be copied as a single segment - * from external dictionary + * match fits entirely within external + * dictionary : just copy */ memmove(op, dictEnd - (lowPrefix - match), length); op += length; } else { /* - * match encompass external + * match stretches into both external * dictionary and current block */ size_t const copySize = (size_t)(lowPrefix - match); @@ -254,7 +349,6 @@ static FORCE_INLINE int LZ4_decompress_generic( memcpy(op, dictEnd - copySize, copySize); op += copySize; - if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ BYTE * const endOfMatch = op + restSize; @@ -267,23 +361,44 @@ static FORCE_INLINE int LZ4_decompress_generic( op += restSize; } } - continue; } /* copy match within block */ cpy = op + length; - if (unlikely(offset < 8)) { - const int dec64 = dec64table[offset]; + /* + * partialDecoding : + * may not respect endBlock parsing restrictions + */ + assert(op <= oend); + if (partialDecoding && + (cpy > oend - MATCH_SAFEGUARD_DISTANCE)) { + size_t const mlen = min(length, (size_t)(oend - op)); + const BYTE * const matchEnd = match + mlen; + BYTE * const copyEnd = op + mlen; + + if (matchEnd > op) { + /* overlap copy */ + while (op < copyEnd) + *op++ = *match++; + } else { + memcpy(op, match, mlen); + } + op = copyEnd; + if (op == oend) + break; + continue; + } + if (unlikely(offset < 8)) { op[0] = match[0]; op[1] = match[1]; op[2] = match[2]; op[3] = match[3]; - match += dec32table[offset]; + match += inc32table[offset]; memcpy(op + 4, match, 4); - match -= dec64; + match -= dec64table[offset]; } else { LZ4_copy8(op, match); match += 8; @@ -291,7 +406,7 @@ static FORCE_INLINE int LZ4_decompress_generic( op += 8; - if (unlikely(cpy > oend - 12)) { + if (unlikely(cpy > oend - MATCH_SAFEGUARD_DISTANCE)) { BYTE * const oCopyLimit = oend - (WILDCOPYLENGTH - 1); if (cpy > oend - LASTLITERALS) { @@ -307,60 +422,139 @@ static FORCE_INLINE int LZ4_decompress_generic( match += oCopyLimit - op; op = oCopyLimit; } - while (op < cpy) *op++ = *match++; } else { LZ4_copy8(op, match); - if (length > 16) LZ4_wildCopy(op + 8, match + 8, cpy); } - - op = cpy; /* correction */ + op = cpy; /* wildcopy correction */ } /* end of decoding */ if (endOnInput) { /* Nb of output bytes decoded */ - return (int) (((char *)op) - dest); + return (int) (((char *)op) - dst); } else { /* Nb of input bytes read */ - return (int) (((const char *)ip) - source); + return (int) (((const char *)ip) - src); } /* Overflow error detected */ _output_error: - return -1; + return (int) (-(((const char *)ip) - src)) - 1; } int LZ4_decompress_safe(const char *source, char *dest, int compressedSize, int maxDecompressedSize) { - return LZ4_decompress_generic(source, dest, compressedSize, - maxDecompressedSize, endOnInputSize, full, 0, - noDict, (BYTE *)dest, NULL, 0); + return LZ4_decompress_generic(source, dest, + compressedSize, maxDecompressedSize, + endOnInputSize, decode_full_block, + noDict, (BYTE *)dest, NULL, 0); } -int LZ4_decompress_safe_partial(const char *source, char *dest, - int compressedSize, int targetOutputSize, int maxDecompressedSize) +int LZ4_decompress_safe_partial(const char *src, char *dst, + int compressedSize, int targetOutputSize, int dstCapacity) { - return LZ4_decompress_generic(source, dest, compressedSize, - maxDecompressedSize, endOnInputSize, partial, - targetOutputSize, noDict, (BYTE *)dest, NULL, 0); + dstCapacity = min(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity, + endOnInputSize, partial_decode, + noDict, (BYTE *)dst, NULL, 0); } int LZ4_decompress_fast(const char *source, char *dest, int originalSize) { return LZ4_decompress_generic(source, dest, 0, originalSize, - endOnOutputSize, full, 0, withPrefix64k, - (BYTE *)(dest - 64 * KB), NULL, 64 * KB); + endOnOutputSize, decode_full_block, + withPrefix64k, + (BYTE *)dest - 64 * KB, NULL, 0); +} + +/* ===== Instantiate a few more decoding cases, used more than once. ===== */ + +int LZ4_decompress_safe_withPrefix64k(const char *source, char *dest, + int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, + compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, + withPrefix64k, + (BYTE *)dest - 64 * KB, NULL, 0); +} + +static int LZ4_decompress_safe_withSmallPrefix(const char *source, char *dest, + int compressedSize, + int maxOutputSize, + size_t prefixSize) +{ + return LZ4_decompress_generic(source, dest, + compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, + noDict, + (BYTE *)dest - prefixSize, NULL, 0); +} + +int LZ4_decompress_safe_forceExtDict(const char *source, char *dest, + int compressedSize, int maxOutputSize, + const void *dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, + compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, + usingExtDict, (BYTE *)dest, + (const BYTE *)dictStart, dictSize); } +static int LZ4_decompress_fast_extDict(const char *source, char *dest, + int originalSize, + const void *dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, + 0, originalSize, + endOnOutputSize, decode_full_block, + usingExtDict, (BYTE *)dest, + (const BYTE *)dictStart, dictSize); +} + +/* + * The "double dictionary" mode, for use with e.g. ring buffers: the first part + * of the dictionary is passed as prefix, and the second via dictStart + dictSize. + * These routines are used only once, in LZ4_decompress_*_continue(). + */ +static FORCE_INLINE +int LZ4_decompress_safe_doubleDict(const char *source, char *dest, + int compressedSize, int maxOutputSize, + size_t prefixSize, + const void *dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, + compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, + usingExtDict, (BYTE *)dest - prefixSize, + (const BYTE *)dictStart, dictSize); +} + +static FORCE_INLINE +int LZ4_decompress_fast_doubleDict(const char *source, char *dest, + int originalSize, size_t prefixSize, + const void *dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, + 0, originalSize, + endOnOutputSize, decode_full_block, + usingExtDict, (BYTE *)dest - prefixSize, + (const BYTE *)dictStart, dictSize); +} + +/* ===== streaming decompression functions ===== */ + int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode, const char *dictionary, int dictSize) { - LZ4_streamDecode_t_internal *lz4sd = (LZ4_streamDecode_t_internal *) LZ4_streamDecode; + LZ4_streamDecode_t_internal *lz4sd = + &LZ4_streamDecode->internal_donotuse; lz4sd->prefixSize = (size_t) dictSize; lz4sd->prefixEnd = (const BYTE *) dictionary + dictSize; @@ -382,35 +576,51 @@ int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode, int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode, const char *source, char *dest, int compressedSize, int maxOutputSize) { - LZ4_streamDecode_t_internal *lz4sd = &LZ4_streamDecode->internal_donotuse; + LZ4_streamDecode_t_internal *lz4sd = + &LZ4_streamDecode->internal_donotuse; int result; - if (lz4sd->prefixEnd == (BYTE *)dest) { - result = LZ4_decompress_generic(source, dest, - compressedSize, - maxOutputSize, - endOnInputSize, full, 0, - usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, - lz4sd->externalDict, - lz4sd->extDictSize); - + if (lz4sd->prefixSize == 0) { + /* The first call, no dictionary yet. */ + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_safe(source, dest, + compressedSize, maxOutputSize); + if (result <= 0) + return result; + lz4sd->prefixSize = result; + lz4sd->prefixEnd = (BYTE *)dest + result; + } else if (lz4sd->prefixEnd == (BYTE *)dest) { + /* They're rolling the current segment. */ + if (lz4sd->prefixSize >= 64 * KB - 1) + result = LZ4_decompress_safe_withPrefix64k(source, dest, + compressedSize, maxOutputSize); + else if (lz4sd->extDictSize == 0) + result = LZ4_decompress_safe_withSmallPrefix(source, + dest, compressedSize, maxOutputSize, + lz4sd->prefixSize); + else + result = LZ4_decompress_safe_doubleDict(source, dest, + compressedSize, maxOutputSize, + lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); if (result <= 0) return result; - lz4sd->prefixSize += result; - lz4sd->prefixEnd += result; + lz4sd->prefixEnd += result; } else { + /* + * The buffer wraps around, or they're + * switching to another buffer. + */ lz4sd->extDictSize = lz4sd->prefixSize; lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; - result = LZ4_decompress_generic(source, dest, + result = LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, - endOnInputSize, full, 0, - usingExtDict, (BYTE *)dest, lz4sd->externalDict, lz4sd->extDictSize); if (result <= 0) return result; lz4sd->prefixSize = result; - lz4sd->prefixEnd = (BYTE *)dest + result; + lz4sd->prefixEnd = (BYTE *)dest + result; } return result; @@ -422,75 +632,66 @@ int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode, LZ4_streamDecode_t_internal *lz4sd = &LZ4_streamDecode->internal_donotuse; int result; - if (lz4sd->prefixEnd == (BYTE *)dest) { - result = LZ4_decompress_generic(source, dest, 0, originalSize, - endOnOutputSize, full, 0, - usingExtDict, - lz4sd->prefixEnd - lz4sd->prefixSize, - lz4sd->externalDict, lz4sd->extDictSize); - + if (lz4sd->prefixSize == 0) { + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_fast(source, dest, originalSize); + if (result <= 0) + return result; + lz4sd->prefixSize = originalSize; + lz4sd->prefixEnd = (BYTE *)dest + originalSize; + } else if (lz4sd->prefixEnd == (BYTE *)dest) { + if (lz4sd->prefixSize >= 64 * KB - 1 || + lz4sd->extDictSize == 0) + result = LZ4_decompress_fast(source, dest, + originalSize); + else + result = LZ4_decompress_fast_doubleDict(source, dest, + originalSize, lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); if (result <= 0) return result; - lz4sd->prefixSize += originalSize; - lz4sd->prefixEnd += originalSize; + lz4sd->prefixEnd += originalSize; } else { lz4sd->extDictSize = lz4sd->prefixSize; lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; - result = LZ4_decompress_generic(source, dest, 0, originalSize, - endOnOutputSize, full, 0, - usingExtDict, (BYTE *)dest, - lz4sd->externalDict, lz4sd->extDictSize); + result = LZ4_decompress_fast_extDict(source, dest, + originalSize, lz4sd->externalDict, lz4sd->extDictSize); if (result <= 0) return result; lz4sd->prefixSize = originalSize; - lz4sd->prefixEnd = (BYTE *)dest + originalSize; + lz4sd->prefixEnd = (BYTE *)dest + originalSize; } - return result; } -/* - * Advanced decoding functions : - * *_usingDict() : - * These decoding functions work the same as "_continue" ones, - * the dictionary must be explicitly provided within parameters - */ -static FORCE_INLINE int LZ4_decompress_usingDict_generic(const char *source, - char *dest, int compressedSize, int maxOutputSize, int safe, - const char *dictStart, int dictSize) +int LZ4_decompress_safe_usingDict(const char *source, char *dest, + int compressedSize, int maxOutputSize, + const char *dictStart, int dictSize) { if (dictSize == 0) - return LZ4_decompress_generic(source, dest, - compressedSize, maxOutputSize, safe, full, 0, - noDict, (BYTE *)dest, NULL, 0); - if (dictStart + dictSize == dest) { - if (dictSize >= (int)(64 * KB - 1)) - return LZ4_decompress_generic(source, dest, - compressedSize, maxOutputSize, safe, full, 0, - withPrefix64k, (BYTE *)dest - 64 * KB, NULL, 0); - return LZ4_decompress_generic(source, dest, compressedSize, - maxOutputSize, safe, full, 0, noDict, - (BYTE *)dest - dictSize, NULL, 0); + return LZ4_decompress_safe(source, dest, + compressedSize, maxOutputSize); + if (dictStart+dictSize == dest) { + if (dictSize >= 64 * KB - 1) + return LZ4_decompress_safe_withPrefix64k(source, dest, + compressedSize, maxOutputSize); + return LZ4_decompress_safe_withSmallPrefix(source, dest, + compressedSize, maxOutputSize, dictSize); } - return LZ4_decompress_generic(source, dest, compressedSize, - maxOutputSize, safe, full, 0, usingExtDict, - (BYTE *)dest, (const BYTE *)dictStart, dictSize); -} - -int LZ4_decompress_safe_usingDict(const char *source, char *dest, - int compressedSize, int maxOutputSize, - const char *dictStart, int dictSize) -{ - return LZ4_decompress_usingDict_generic(source, dest, - compressedSize, maxOutputSize, 1, dictStart, dictSize); + return LZ4_decompress_safe_forceExtDict(source, dest, + compressedSize, maxOutputSize, dictStart, dictSize); } int LZ4_decompress_fast_usingDict(const char *source, char *dest, - int originalSize, const char *dictStart, int dictSize) + int originalSize, + const char *dictStart, int dictSize) { - return LZ4_decompress_usingDict_generic(source, dest, 0, - originalSize, 0, dictStart, dictSize); + if (dictSize == 0 || dictStart + dictSize == dest) + return LZ4_decompress_fast(source, dest, originalSize); + + return LZ4_decompress_fast_extDict(source, dest, originalSize, + dictStart, dictSize); } #ifndef STATIC diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h index 00a0b58a0871..1a7fa9d9170f 100644 --- a/lib/lz4/lz4defs.h +++ b/lib/lz4/lz4defs.h @@ -75,6 +75,11 @@ typedef uintptr_t uptrval; #define WILDCOPYLENGTH 8 #define LASTLITERALS 5 #define MFLIMIT (WILDCOPYLENGTH + MINMATCH) +/* + * ensure it's possible to write 2 x wildcopyLength + * without overflowing output buffer + */ +#define MATCH_SAFEGUARD_DISTANCE ((2 * WILDCOPYLENGTH) - MINMATCH) /* Increase this value ==> compression run slower on incompressible data */ #define LZ4_SKIPTRIGGER 6 @@ -222,6 +227,8 @@ typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; -typedef enum { full = 0, partial = 1 } earlyEnd_directive; +typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; + +#define LZ4_STATIC_ASSERT(c) BUILD_BUG_ON(!(c)) #endif diff --git a/lib/memcat_p.c b/lib/memcat_p.c new file mode 100644 index 000000000000..b810fbc66962 --- /dev/null +++ b/lib/memcat_p.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +/* + * Merge two NULL-terminated pointer arrays into a newly allocated + * array, which is also NULL-terminated. Nomenclature is inspired by + * memset_p() and memcat() found elsewhere in the kernel source tree. + */ +void **__memcat_p(void **a, void **b) +{ + void **p = a, **new; + int nr; + + /* count the elements in both arrays */ + for (nr = 0, p = a; *p; nr++, p++) + ; + for (p = b; *p; nr++, p++) + ; + /* one for the NULL-terminator */ + nr++; + + new = kmalloc_array(nr, sizeof(void *), GFP_KERNEL); + if (!new) + return NULL; + + /* nr -> last index; p points to NULL in b[] */ + for (nr--; nr >= 0; nr--, p = p == b ? &a[nr] : p - 1) + new[nr] = *p; + + return new; +} +EXPORT_SYMBOL_GPL(__memcat_p); + diff --git a/lib/nlattr.c b/lib/nlattr.c index e335bcafa9e4..d26de6156b97 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -45,12 +45,11 @@ static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { }; static int validate_nla_bitfield32(const struct nlattr *nla, - u32 *valid_flags_allowed) + const u32 *valid_flags_mask) { const struct nla_bitfield32 *bf = nla_data(nla); - u32 *valid_flags_mask = valid_flags_allowed; - if (!valid_flags_allowed) + if (!valid_flags_mask) return -EINVAL; /*disallow invalid bit selector */ @@ -68,11 +67,99 @@ static int validate_nla_bitfield32(const struct nlattr *nla, return 0; } +static int nla_validate_array(const struct nlattr *head, int len, int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + const struct nlattr *entry; + int rem; + + nla_for_each_attr(entry, head, len, rem) { + int ret; + + if (nla_len(entry) == 0) + continue; + + if (nla_len(entry) < NLA_HDRLEN) { + NL_SET_ERR_MSG_ATTR(extack, entry, + "Array element too short"); + return -ERANGE; + } + + ret = nla_validate(nla_data(entry), nla_len(entry), + maxtype, policy, extack); + if (ret < 0) + return ret; + } + + return 0; +} + +static int nla_validate_int_range(const struct nla_policy *pt, + const struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + bool validate_min, validate_max; + s64 value; + + validate_min = pt->validation_type == NLA_VALIDATE_RANGE || + pt->validation_type == NLA_VALIDATE_MIN; + validate_max = pt->validation_type == NLA_VALIDATE_RANGE || + pt->validation_type == NLA_VALIDATE_MAX; + + switch (pt->type) { + case NLA_U8: + value = nla_get_u8(nla); + break; + case NLA_U16: + value = nla_get_u16(nla); + break; + case NLA_U32: + value = nla_get_u32(nla); + break; + case NLA_S8: + value = nla_get_s8(nla); + break; + case NLA_S16: + value = nla_get_s16(nla); + break; + case NLA_S32: + value = nla_get_s32(nla); + break; + case NLA_S64: + value = nla_get_s64(nla); + break; + case NLA_U64: + /* treat this one specially, since it may not fit into s64 */ + if ((validate_min && nla_get_u64(nla) < pt->min) || + (validate_max && nla_get_u64(nla) > pt->max)) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "integer out of range"); + return -ERANGE; + } + return 0; + default: + WARN_ON(1); + return -EINVAL; + } + + if ((validate_min && value < pt->min) || + (validate_max && value > pt->max)) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "integer out of range"); + return -ERANGE; + } + + return 0; +} + static int validate_nla(const struct nlattr *nla, int maxtype, - const struct nla_policy *policy) + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { const struct nla_policy *pt; int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla); + int err = -ERANGE; if (type <= 0 || type > maxtype) return 0; @@ -81,22 +168,40 @@ static int validate_nla(const struct nlattr *nla, int maxtype, BUG_ON(pt->type > NLA_TYPE_MAX); - if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) { + if ((nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) || + (pt->type == NLA_EXACT_LEN_WARN && attrlen != pt->len)) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); } switch (pt->type) { + case NLA_EXACT_LEN: + if (attrlen != pt->len) + goto out_err; + break; + + case NLA_REJECT: + if (extack && pt->validation_data) { + NL_SET_BAD_ATTR(extack, nla); + extack->_msg = pt->validation_data; + return -EINVAL; + } + err = -EINVAL; + goto out_err; + case NLA_FLAG: if (attrlen > 0) - return -ERANGE; + goto out_err; break; case NLA_BITFIELD32: if (attrlen != sizeof(struct nla_bitfield32)) - return -ERANGE; + goto out_err; - return validate_nla_bitfield32(nla, pt->validation_data); + err = validate_nla_bitfield32(nla, pt->validation_data); + if (err) + goto out_err; + break; case NLA_NUL_STRING: if (pt->len) @@ -104,13 +209,15 @@ static int validate_nla(const struct nlattr *nla, int maxtype, else minlen = attrlen; - if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) - return -EINVAL; + if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) { + err = -EINVAL; + goto out_err; + } /* fall through */ case NLA_STRING: if (attrlen < 1) - return -ERANGE; + goto out_err; if (pt->len) { char *buf = nla_data(nla); @@ -119,32 +226,58 @@ static int validate_nla(const struct nlattr *nla, int maxtype, attrlen--; if (attrlen > pt->len) - return -ERANGE; + goto out_err; } break; case NLA_BINARY: if (pt->len && attrlen > pt->len) - return -ERANGE; + goto out_err; break; - case NLA_NESTED_COMPAT: - if (attrlen < pt->len) - return -ERANGE; - if (attrlen < NLA_ALIGN(pt->len)) - break; - if (attrlen < NLA_ALIGN(pt->len) + NLA_HDRLEN) - return -ERANGE; - nla = nla_data(nla) + NLA_ALIGN(pt->len); - if (attrlen < NLA_ALIGN(pt->len) + NLA_HDRLEN + nla_len(nla)) - return -ERANGE; - break; case NLA_NESTED: /* a nested attributes is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; + if (attrlen < NLA_HDRLEN) + goto out_err; + if (pt->validation_data) { + err = nla_validate(nla_data(nla), nla_len(nla), pt->len, + pt->validation_data, extack); + if (err < 0) { + /* + * return directly to preserve the inner + * error message/attribute pointer + */ + return err; + } + } + break; + case NLA_NESTED_ARRAY: + /* a nested array attribute is allowed to be empty; if its not, + * it must have a size of at least NLA_HDRLEN. + */ + if (attrlen == 0) + break; + if (attrlen < NLA_HDRLEN) + goto out_err; + if (pt->validation_data) { + int err; + + err = nla_validate_array(nla_data(nla), nla_len(nla), + pt->len, pt->validation_data, + extack); + if (err < 0) { + /* + * return directly to preserve the inner + * error message/attribute pointer + */ + return err; + } + } + break; default: if (pt->len) minlen = pt->len; @@ -152,10 +285,34 @@ static int validate_nla(const struct nlattr *nla, int maxtype, minlen = nla_attr_minlen[pt->type]; if (attrlen < minlen) - return -ERANGE; + goto out_err; + } + + /* further validation */ + switch (pt->validation_type) { + case NLA_VALIDATE_NONE: + /* nothing to do */ + break; + case NLA_VALIDATE_RANGE: + case NLA_VALIDATE_MIN: + case NLA_VALIDATE_MAX: + err = nla_validate_int_range(pt, nla, extack); + if (err) + return err; + break; + case NLA_VALIDATE_FUNCTION: + if (pt->validate) { + err = pt->validate(nla, extack); + if (err) + return err; + } + break; } return 0; +out_err: + NL_SET_ERR_MSG_ATTR(extack, nla, "Attribute failed policy validation"); + return err; } /** @@ -180,13 +337,10 @@ int nla_validate(const struct nlattr *head, int len, int maxtype, int rem; nla_for_each_attr(nla, head, len, rem) { - int err = validate_nla(nla, maxtype, policy); + int err = validate_nla(nla, maxtype, policy, extack); - if (err < 0) { - if (extack) - extack->bad_attr = nla; + if (err < 0) return err; - } } return 0; @@ -237,42 +391,63 @@ EXPORT_SYMBOL(nla_policy_len); * * Returns 0 on success or a negative error code. */ -int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, - int len, const struct nla_policy *policy, - struct netlink_ext_ack *extack) +static int __nla_parse(struct nlattr **tb, int maxtype, + const struct nlattr *head, int len, + bool strict, const struct nla_policy *policy, + struct netlink_ext_ack *extack) { const struct nlattr *nla; - int rem, err; + int rem; memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); nla_for_each_attr(nla, head, len, rem) { u16 type = nla_type(nla); - if (type > 0 && type <= maxtype) { - if (policy) { - err = validate_nla(nla, maxtype, policy); - if (err < 0) { - NL_SET_ERR_MSG_ATTR(extack, nla, - "Attribute failed policy validation"); - goto errout; - } + if (type == 0 || type > maxtype) { + if (strict) { + NL_SET_ERR_MSG(extack, "Unknown attribute type"); + return -EINVAL; } + continue; + } + if (policy) { + int err = validate_nla(nla, maxtype, policy, extack); - tb[type] = (struct nlattr *)nla; + if (err < 0) + return err; } + + tb[type] = (struct nlattr *)nla; } - if (unlikely(rem > 0)) + if (unlikely(rem > 0)) { pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n", rem, current->comm); + NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes"); + if (strict) + return -EINVAL; + } - err = 0; -errout: - return err; + return 0; +} + +int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, + int len, const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_parse(tb, maxtype, head, len, false, policy, extack); } EXPORT_SYMBOL(nla_parse); +int nla_parse_strict(struct nlattr **tb, int maxtype, const struct nlattr *head, + int len, const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_parse(tb, maxtype, head, len, true, policy, extack); +} +EXPORT_SYMBOL(nla_parse_strict); + /** * nla_find - Find a specific attribute in a stream of attributes * @head: head of attribute stream diff --git a/lib/parser.c b/lib/parser.c index 3278958b472a..dd70e5e6c9e2 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -131,13 +131,10 @@ static int match_number(substring_t *s, int *result, int base) char *buf; int ret; long val; - size_t len = s->to - s->from; - buf = kmalloc(len + 1, GFP_KERNEL); + buf = match_strdup(s); if (!buf) return -ENOMEM; - memcpy(buf, s->from, len); - buf[len] = '\0'; ret = 0; val = simple_strtol(buf, &endp, base); @@ -166,13 +163,10 @@ static int match_u64int(substring_t *s, u64 *result, int base) char *buf; int ret; u64 val; - size_t len = s->to - s->from; - buf = kmalloc(len + 1, GFP_KERNEL); + buf = match_strdup(s); if (!buf) return -ENOMEM; - memcpy(buf, s->from, len); - buf[len] = '\0'; ret = kstrtoull(buf, base, &val); if (!ret) @@ -327,10 +321,6 @@ EXPORT_SYMBOL(match_strlcpy); */ char *match_strdup(const substring_t *s) { - size_t sz = s->to - s->from + 1; - char *p = kmalloc(sz, GFP_KERNEL); - if (p) - match_strlcpy(p, s, sz); - return p; + return kmemdup_nul(s->from, s->to - s->from, GFP_KERNEL); } EXPORT_SYMBOL(match_strdup); diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 9f96fa7bc000..de10b8c0bff6 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -356,11 +356,35 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); */ void percpu_ref_reinit(struct percpu_ref *ref) { + WARN_ON_ONCE(!percpu_ref_is_zero(ref)); + + percpu_ref_resurrect(ref); +} +EXPORT_SYMBOL_GPL(percpu_ref_reinit); + +/** + * percpu_ref_resurrect - modify a percpu refcount from dead to live + * @ref: perpcu_ref to resurrect + * + * Modify @ref so that it's in the same state as before percpu_ref_kill() was + * called. @ref must be dead but must not yet have exited. + * + * If @ref->release() frees @ref then the caller is responsible for + * guaranteeing that @ref->release() does not get called while this + * function is in progress. + * + * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while + * this function is in progress. + */ +void percpu_ref_resurrect(struct percpu_ref *ref) +{ + unsigned long __percpu *percpu_count; unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); - WARN_ON_ONCE(!percpu_ref_is_zero(ref)); + WARN_ON_ONCE(!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)); + WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count)); ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; percpu_ref_get(ref); @@ -368,4 +392,4 @@ void percpu_ref_reinit(struct percpu_ref *ref) spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } -EXPORT_SYMBOL_GPL(percpu_ref_reinit); +EXPORT_SYMBOL_GPL(percpu_ref_resurrect); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index bc03ecc4dfd2..1106bb6aa01e 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -38,15 +38,13 @@ #include #include #include +#include -/* Number of nodes in fully populated tree of given height */ -static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly; - /* * Radix tree node cache. */ -static struct kmem_cache *radix_tree_node_cachep; +struct kmem_cache *radix_tree_node_cachep; /* * The radix tree is variable-height, so an insert operation not only has @@ -98,24 +96,7 @@ static inline void *node_to_entry(void *ptr) return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); } -#define RADIX_TREE_RETRY node_to_entry(NULL) - -#ifdef CONFIG_RADIX_TREE_MULTIORDER -/* Sibling slots point directly to another slot in the same node */ -static inline -bool is_sibling_entry(const struct radix_tree_node *parent, void *node) -{ - void __rcu **ptr = node; - return (parent->slots <= ptr) && - (ptr < parent->slots + RADIX_TREE_MAP_SIZE); -} -#else -static inline -bool is_sibling_entry(const struct radix_tree_node *parent, void *node) -{ - return false; -} -#endif +#define RADIX_TREE_RETRY XA_RETRY_ENTRY static inline unsigned long get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot) @@ -129,24 +110,13 @@ static unsigned int radix_tree_descend(const struct radix_tree_node *parent, unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK; void __rcu **entry = rcu_dereference_raw(parent->slots[offset]); -#ifdef CONFIG_RADIX_TREE_MULTIORDER - if (radix_tree_is_internal_node(entry)) { - if (is_sibling_entry(parent, entry)) { - void __rcu **sibentry; - sibentry = (void __rcu **) entry_to_node(entry); - offset = get_slot_offset(parent, sibentry); - entry = rcu_dereference_raw(*sibentry); - } - } -#endif - *nodep = (void *)entry; return offset; } static inline gfp_t root_gfp_mask(const struct radix_tree_root *root) { - return root->gfp_mask & (__GFP_BITS_MASK & ~GFP_ZONEMASK); + return root->xa_flags & (__GFP_BITS_MASK & ~GFP_ZONEMASK); } static inline void tag_set(struct radix_tree_node *node, unsigned int tag, @@ -169,32 +139,32 @@ static inline int tag_get(const struct radix_tree_node *node, unsigned int tag, static inline void root_tag_set(struct radix_tree_root *root, unsigned tag) { - root->gfp_mask |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT)); + root->xa_flags |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag) { - root->gfp_mask &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT)); + root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear_all(struct radix_tree_root *root) { - root->gfp_mask &= (1 << ROOT_TAG_SHIFT) - 1; + root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1); } static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag) { - return (__force int)root->gfp_mask & (1 << (tag + ROOT_TAG_SHIFT)); + return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT)); } static inline unsigned root_tags_get(const struct radix_tree_root *root) { - return (__force unsigned)root->gfp_mask >> ROOT_TAG_SHIFT; + return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT; } static inline bool is_idr(const struct radix_tree_root *root) { - return !!(root->gfp_mask & ROOT_IS_IDR); + return !!(root->xa_flags & ROOT_IS_IDR); } /* @@ -254,7 +224,7 @@ radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag, static unsigned int iter_offset(const struct radix_tree_iter *iter) { - return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK; + return iter->index & RADIX_TREE_MAP_MASK; } /* @@ -277,99 +247,6 @@ static unsigned long next_index(unsigned long index, return (index & ~node_maxindex(node)) + (offset << node->shift); } -#ifndef __KERNEL__ -static void dump_node(struct radix_tree_node *node, unsigned long index) -{ - unsigned long i; - - pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d exceptional %d\n", - node, node->offset, index, index | node_maxindex(node), - node->parent, - node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->shift, node->count, node->exceptional); - - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - unsigned long first = index | (i << node->shift); - unsigned long last = first | ((1UL << node->shift) - 1); - void *entry = node->slots[i]; - if (!entry) - continue; - if (entry == RADIX_TREE_RETRY) { - pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n", - i, first, last, node); - } else if (!radix_tree_is_internal_node(entry)) { - pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n", - entry, i, first, last, node); - } else if (is_sibling_entry(node, entry)) { - pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n", - entry, i, first, last, node, - *(void **)entry_to_node(entry)); - } else { - dump_node(entry_to_node(entry), first); - } - } -} - -/* For debug */ -static void radix_tree_dump(struct radix_tree_root *root) -{ - pr_debug("radix root: %p rnode %p tags %x\n", - root, root->rnode, - root->gfp_mask >> ROOT_TAG_SHIFT); - if (!radix_tree_is_internal_node(root->rnode)) - return; - dump_node(entry_to_node(root->rnode), 0); -} - -static void dump_ida_node(void *entry, unsigned long index) -{ - unsigned long i; - - if (!entry) - return; - - if (radix_tree_is_internal_node(entry)) { - struct radix_tree_node *node = entry_to_node(entry); - - pr_debug("ida node: %p offset %d indices %lu-%lu parent %p free %lx shift %d count %d\n", - node, node->offset, index * IDA_BITMAP_BITS, - ((index | node_maxindex(node)) + 1) * - IDA_BITMAP_BITS - 1, - node->parent, node->tags[0][0], node->shift, - node->count); - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) - dump_ida_node(node->slots[i], - index | (i << node->shift)); - } else if (radix_tree_exceptional_entry(entry)) { - pr_debug("ida excp: %p offset %d indices %lu-%lu data %lx\n", - entry, (int)(index & RADIX_TREE_MAP_MASK), - index * IDA_BITMAP_BITS, - index * IDA_BITMAP_BITS + BITS_PER_LONG - - RADIX_TREE_EXCEPTIONAL_SHIFT, - (unsigned long)entry >> - RADIX_TREE_EXCEPTIONAL_SHIFT); - } else { - struct ida_bitmap *bitmap = entry; - - pr_debug("ida btmp: %p offset %d indices %lu-%lu data", bitmap, - (int)(index & RADIX_TREE_MAP_MASK), - index * IDA_BITMAP_BITS, - (index + 1) * IDA_BITMAP_BITS - 1); - for (i = 0; i < IDA_BITMAP_LONGS; i++) - pr_cont(" %lx", bitmap->bitmap[i]); - pr_cont("\n"); - } -} - -static void ida_dump(struct ida *ida) -{ - struct radix_tree_root *root = &ida->ida_rt; - pr_debug("ida: %p node %p free %d\n", ida, root->rnode, - root->gfp_mask >> ROOT_TAG_SHIFT); - dump_ida_node(root->rnode, 0); -} -#endif - /* * This assumes that the caller has performed appropriate preallocation, and * that the caller has pinned this thread of control to the current CPU. @@ -378,7 +255,7 @@ static struct radix_tree_node * radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent, struct radix_tree_root *root, unsigned int shift, unsigned int offset, - unsigned int count, unsigned int exceptional) + unsigned int count, unsigned int nr_values) { struct radix_tree_node *ret = NULL; @@ -425,14 +302,14 @@ out: ret->shift = shift; ret->offset = offset; ret->count = count; - ret->exceptional = exceptional; + ret->nr_values = nr_values; ret->parent = parent; - ret->root = root; + ret->array = root; } return ret; } -static void radix_tree_node_rcu_free(struct rcu_head *head) +void radix_tree_node_rcu_free(struct rcu_head *head) { struct radix_tree_node *node = container_of(head, struct radix_tree_node, rcu_head); @@ -530,77 +407,10 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) } EXPORT_SYMBOL(radix_tree_maybe_preload); -#ifdef CONFIG_RADIX_TREE_MULTIORDER -/* - * Preload with enough objects to ensure that we can split a single entry - * of order @old_order into many entries of size @new_order - */ -int radix_tree_split_preload(unsigned int old_order, unsigned int new_order, - gfp_t gfp_mask) -{ - unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT); - unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) - - (new_order / RADIX_TREE_MAP_SHIFT); - unsigned nr = 0; - - WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); - BUG_ON(new_order >= old_order); - - while (layers--) - nr = nr * RADIX_TREE_MAP_SIZE + 1; - return __radix_tree_preload(gfp_mask, top * nr); -} -#endif - -/* - * The same as function above, but preload number of nodes required to insert - * (1 << order) continuous naturally-aligned elements. - */ -int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) -{ - unsigned long nr_subtrees; - int nr_nodes, subtree_height; - - /* Preloading doesn't help anything with this gfp mask, skip it */ - if (!gfpflags_allow_blocking(gfp_mask)) { - preempt_disable(); - return 0; - } - - /* - * Calculate number and height of fully populated subtrees it takes to - * store (1 << order) elements. - */ - nr_subtrees = 1 << order; - for (subtree_height = 0; nr_subtrees > RADIX_TREE_MAP_SIZE; - subtree_height++) - nr_subtrees >>= RADIX_TREE_MAP_SHIFT; - - /* - * The worst case is zero height tree with a single item at index 0 and - * then inserting items starting at ULONG_MAX - (1 << order). - * - * This requires RADIX_TREE_MAX_PATH nodes to build branch from root to - * 0-index item. - */ - nr_nodes = RADIX_TREE_MAX_PATH; - - /* Plus branch to fully populated subtrees. */ - nr_nodes += RADIX_TREE_MAX_PATH - subtree_height; - - /* Root node is shared. */ - nr_nodes--; - - /* Plus nodes required to build subtrees. */ - nr_nodes += nr_subtrees * height_to_maxnodes[subtree_height]; - - return __radix_tree_preload(gfp_mask, nr_nodes); -} - static unsigned radix_tree_load_root(const struct radix_tree_root *root, struct radix_tree_node **nodep, unsigned long *maxindex) { - struct radix_tree_node *node = rcu_dereference_raw(root->rnode); + struct radix_tree_node *node = rcu_dereference_raw(root->xa_head); *nodep = node; @@ -629,7 +439,7 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp, while (index > shift_maxindex(maxshift)) maxshift += RADIX_TREE_MAP_SHIFT; - entry = rcu_dereference_raw(root->rnode); + entry = rcu_dereference_raw(root->xa_head); if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE))) goto out; @@ -656,9 +466,9 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp, BUG_ON(shift > BITS_PER_LONG); if (radix_tree_is_internal_node(entry)) { entry_to_node(entry)->parent = node; - } else if (radix_tree_exceptional_entry(entry)) { - /* Moving an exceptional root->rnode to a node */ - node->exceptional = 1; + } else if (xa_is_value(entry)) { + /* Moving a value entry root->xa_head to a node */ + node->nr_values = 1; } /* * entry was already in the radix tree, so we do not need @@ -666,7 +476,7 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp, */ node->slots[0] = (void __rcu *)entry; entry = node_to_entry(node); - rcu_assign_pointer(root->rnode, entry); + rcu_assign_pointer(root->xa_head, entry); shift += RADIX_TREE_MAP_SHIFT; } while (shift <= maxshift); out: @@ -677,13 +487,12 @@ out: * radix_tree_shrink - shrink radix tree to minimum height * @root radix tree root */ -static inline bool radix_tree_shrink(struct radix_tree_root *root, - radix_tree_update_node_t update_node) +static inline bool radix_tree_shrink(struct radix_tree_root *root) { bool shrunk = false; for (;;) { - struct radix_tree_node *node = rcu_dereference_raw(root->rnode); + struct radix_tree_node *node = rcu_dereference_raw(root->xa_head); struct radix_tree_node *child; if (!radix_tree_is_internal_node(node)) @@ -692,15 +501,20 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, /* * The candidate node has more than one child, or its child - * is not at the leftmost slot, or the child is a multiorder - * entry, we cannot shrink. + * is not at the leftmost slot, we cannot shrink. */ if (node->count != 1) break; child = rcu_dereference_raw(node->slots[0]); if (!child) break; - if (!radix_tree_is_internal_node(child) && node->shift) + + /* + * For an IDR, we must not shrink entry 0 into the root in + * case somebody calls idr_replace() with a pointer that + * appears to be an internal entry + */ + if (!node->shift && is_idr(root)) break; if (radix_tree_is_internal_node(child)) @@ -711,9 +525,9 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, * moving the node from one part of the tree to another: if it * was safe to dereference the old pointer to it * (node->slots[0]), it will be safe to dereference the new - * one (root->rnode) as far as dependent read barriers go. + * one (root->xa_head) as far as dependent read barriers go. */ - root->rnode = (void __rcu *)child; + root->xa_head = (void __rcu *)child; if (is_idr(root) && !tag_get(node, IDR_FREE, 0)) root_tag_clear(root, IDR_FREE); @@ -738,8 +552,6 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, node->count = 0; if (!radix_tree_is_internal_node(child)) { node->slots[0] = (void __rcu *)RADIX_TREE_RETRY; - if (update_node) - update_node(node); } WARN_ON_ONCE(!list_empty(&node->private_list)); @@ -751,8 +563,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, } static bool delete_node(struct radix_tree_root *root, - struct radix_tree_node *node, - radix_tree_update_node_t update_node) + struct radix_tree_node *node) { bool deleted = false; @@ -761,9 +572,8 @@ static bool delete_node(struct radix_tree_root *root, if (node->count) { if (node_to_entry(node) == - rcu_dereference_raw(root->rnode)) - deleted |= radix_tree_shrink(root, - update_node); + rcu_dereference_raw(root->xa_head)) + deleted |= radix_tree_shrink(root); return deleted; } @@ -778,7 +588,7 @@ static bool delete_node(struct radix_tree_root *root, */ if (!is_idr(root)) root_tag_clear_all(root); - root->rnode = NULL; + root->xa_head = NULL; } WARN_ON_ONCE(!list_empty(&node->private_list)); @@ -795,7 +605,6 @@ static bool delete_node(struct radix_tree_root *root, * __radix_tree_create - create a slot in a radix tree * @root: radix tree root * @index: index key - * @order: index occupies 2^order aligned slots * @nodep: returns node * @slotp: returns slot * @@ -803,36 +612,34 @@ static bool delete_node(struct radix_tree_root *root, * at position @index in the radix tree @root. * * Until there is more than one item in the tree, no nodes are - * allocated and @root->rnode is used as a direct slot instead of + * allocated and @root->xa_head is used as a direct slot instead of * pointing to a node, in which case *@nodep will be NULL. * * Returns -ENOMEM, or 0 for success. */ -int __radix_tree_create(struct radix_tree_root *root, unsigned long index, - unsigned order, struct radix_tree_node **nodep, - void __rcu ***slotp) +static int __radix_tree_create(struct radix_tree_root *root, + unsigned long index, struct radix_tree_node **nodep, + void __rcu ***slotp) { struct radix_tree_node *node = NULL, *child; - void __rcu **slot = (void __rcu **)&root->rnode; + void __rcu **slot = (void __rcu **)&root->xa_head; unsigned long maxindex; unsigned int shift, offset = 0; - unsigned long max = index | ((1UL << order) - 1); + unsigned long max = index; gfp_t gfp = root_gfp_mask(root); shift = radix_tree_load_root(root, &child, &maxindex); /* Make sure the tree is high enough. */ - if (order > 0 && max == ((1UL << order) - 1)) - max++; if (max > maxindex) { int error = radix_tree_extend(root, gfp, max, shift); if (error < 0) return error; shift = error; - child = rcu_dereference_raw(root->rnode); + child = rcu_dereference_raw(root->xa_head); } - while (shift > order) { + while (shift > 0) { shift -= RADIX_TREE_MAP_SHIFT; if (child == NULL) { /* Have to add a child node. */ @@ -875,8 +682,7 @@ static void radix_tree_free_nodes(struct radix_tree_node *node) for (;;) { void *entry = rcu_dereference_raw(child->slots[offset]); - if (radix_tree_is_internal_node(entry) && - !is_sibling_entry(child, entry)) { + if (xa_is_node(entry) && child->shift) { child = entry_to_node(entry); offset = 0; continue; @@ -894,96 +700,30 @@ static void radix_tree_free_nodes(struct radix_tree_node *node) } } -#ifdef CONFIG_RADIX_TREE_MULTIORDER static inline int insert_entries(struct radix_tree_node *node, - void __rcu **slot, void *item, unsigned order, bool replace) -{ - struct radix_tree_node *child; - unsigned i, n, tag, offset, tags = 0; - - if (node) { - if (order > node->shift) - n = 1 << (order - node->shift); - else - n = 1; - offset = get_slot_offset(node, slot); - } else { - n = 1; - offset = 0; - } - - if (n > 1) { - offset = offset & ~(n - 1); - slot = &node->slots[offset]; - } - child = node_to_entry(slot); - - for (i = 0; i < n; i++) { - if (slot[i]) { - if (replace) { - node->count--; - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - if (tag_get(node, tag, offset + i)) - tags |= 1 << tag; - } else - return -EEXIST; - } - } - - for (i = 0; i < n; i++) { - struct radix_tree_node *old = rcu_dereference_raw(slot[i]); - if (i) { - rcu_assign_pointer(slot[i], child); - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - if (tags & (1 << tag)) - tag_clear(node, tag, offset + i); - } else { - rcu_assign_pointer(slot[i], item); - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - if (tags & (1 << tag)) - tag_set(node, tag, offset); - } - if (radix_tree_is_internal_node(old) && - !is_sibling_entry(node, old) && - (old != RADIX_TREE_RETRY)) - radix_tree_free_nodes(old); - if (radix_tree_exceptional_entry(old)) - node->exceptional--; - } - if (node) { - node->count += n; - if (radix_tree_exceptional_entry(item)) - node->exceptional += n; - } - return n; -} -#else -static inline int insert_entries(struct radix_tree_node *node, - void __rcu **slot, void *item, unsigned order, bool replace) + void __rcu **slot, void *item, bool replace) { if (*slot) return -EEXIST; rcu_assign_pointer(*slot, item); if (node) { node->count++; - if (radix_tree_exceptional_entry(item)) - node->exceptional++; + if (xa_is_value(item)) + node->nr_values++; } return 1; } -#endif /** * __radix_tree_insert - insert into a radix tree * @root: radix tree root * @index: index key - * @order: key covers the 2^order indices around index * @item: item to insert * * Insert an item into the radix tree at position @index. */ -int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, - unsigned order, void *item) +int radix_tree_insert(struct radix_tree_root *root, unsigned long index, + void *item) { struct radix_tree_node *node; void __rcu **slot; @@ -991,11 +731,11 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, BUG_ON(radix_tree_is_internal_node(item)); - error = __radix_tree_create(root, index, order, &node, &slot); + error = __radix_tree_create(root, index, &node, &slot); if (error) return error; - error = insert_entries(node, slot, item, order, false); + error = insert_entries(node, slot, item, false); if (error < 0) return error; @@ -1010,7 +750,7 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, return 0; } -EXPORT_SYMBOL(__radix_tree_insert); +EXPORT_SYMBOL(radix_tree_insert); /** * __radix_tree_lookup - lookup an item in a radix tree @@ -1023,7 +763,7 @@ EXPORT_SYMBOL(__radix_tree_insert); * tree @root. * * Until there is more than one item in the tree, no nodes are - * allocated and @root->rnode is used as a direct slot instead of + * allocated and @root->xa_head is used as a direct slot instead of * pointing to a node, in which case *@nodep will be NULL. */ void *__radix_tree_lookup(const struct radix_tree_root *root, @@ -1036,7 +776,7 @@ void *__radix_tree_lookup(const struct radix_tree_root *root, restart: parent = NULL; - slot = (void __rcu **)&root->rnode; + slot = (void __rcu **)&root->xa_head; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return NULL; @@ -1049,6 +789,8 @@ void *__radix_tree_lookup(const struct radix_tree_root *root, parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); slot = parent->slots + offset; + if (parent->shift == 0) + break; } if (nodep) @@ -1100,36 +842,12 @@ void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_lookup); -static inline void replace_sibling_entries(struct radix_tree_node *node, - void __rcu **slot, int count, int exceptional) -{ -#ifdef CONFIG_RADIX_TREE_MULTIORDER - void *ptr = node_to_entry(slot); - unsigned offset = get_slot_offset(node, slot) + 1; - - while (offset < RADIX_TREE_MAP_SIZE) { - if (rcu_dereference_raw(node->slots[offset]) != ptr) - break; - if (count < 0) { - node->slots[offset] = NULL; - node->count--; - } - node->exceptional += exceptional; - offset++; - } -#endif -} - static void replace_slot(void __rcu **slot, void *item, - struct radix_tree_node *node, int count, int exceptional) + struct radix_tree_node *node, int count, int values) { - if (WARN_ON_ONCE(radix_tree_is_internal_node(item))) - return; - - if (node && (count || exceptional)) { + if (node && (count || values)) { node->count += count; - node->exceptional += exceptional; - replace_sibling_entries(node, slot, count, exceptional); + node->nr_values += values; } rcu_assign_pointer(*slot, item); @@ -1172,37 +890,31 @@ static int calculate_count(struct radix_tree_root *root, * @node: pointer to tree node * @slot: pointer to slot in @node * @item: new item to store in the slot. - * @update_node: callback for changing leaf nodes * * For use with __radix_tree_lookup(). Caller must hold tree write locked * across slot lookup and replacement. */ void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, - void __rcu **slot, void *item, - radix_tree_update_node_t update_node) + void __rcu **slot, void *item) { void *old = rcu_dereference_raw(*slot); - int exceptional = !!radix_tree_exceptional_entry(item) - - !!radix_tree_exceptional_entry(old); + int values = !!xa_is_value(item) - !!xa_is_value(old); int count = calculate_count(root, node, slot, item, old); /* - * This function supports replacing exceptional entries and + * This function supports replacing value entries and * deleting entries, but that needs accounting against the - * node unless the slot is root->rnode. + * node unless the slot is root->xa_head. */ - WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->rnode) && - (count || exceptional)); - replace_slot(slot, item, node, count, exceptional); + WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->xa_head) && + (count || values)); + replace_slot(slot, item, node, count, values); if (!node) return; - if (update_node) - update_node(node); - - delete_node(root, node, update_node); + delete_node(root, node); } /** @@ -1211,12 +923,12 @@ void __radix_tree_replace(struct radix_tree_root *root, * @slot: pointer to slot * @item: new item to store in the slot. * - * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(), + * For use with radix_tree_lookup_slot() and * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked * across slot lookup and replacement. * * NOTE: This cannot be used to switch between non-entries (empty slots), - * regular entries, and exceptional entries, as that requires accounting + * regular entries, and value entries, as that requires accounting * inside the radix tree node. When switching from one type of entry or * deleting, use __radix_tree_lookup() and __radix_tree_replace() or * radix_tree_iter_replace(). @@ -1224,7 +936,7 @@ void __radix_tree_replace(struct radix_tree_root *root, void radix_tree_replace_slot(struct radix_tree_root *root, void __rcu **slot, void *item) { - __radix_tree_replace(root, NULL, slot, item, NULL); + __radix_tree_replace(root, NULL, slot, item); } EXPORT_SYMBOL(radix_tree_replace_slot); @@ -1234,162 +946,16 @@ EXPORT_SYMBOL(radix_tree_replace_slot); * @slot: pointer to slot * @item: new item to store in the slot. * - * For use with radix_tree_split() and radix_tree_for_each_slot(). - * Caller must hold tree write locked across split and replacement. + * For use with radix_tree_for_each_slot(). + * Caller must hold tree write locked. */ void radix_tree_iter_replace(struct radix_tree_root *root, const struct radix_tree_iter *iter, void __rcu **slot, void *item) { - __radix_tree_replace(root, iter->node, slot, item, NULL); + __radix_tree_replace(root, iter->node, slot, item); } -#ifdef CONFIG_RADIX_TREE_MULTIORDER -/** - * radix_tree_join - replace multiple entries with one multiorder entry - * @root: radix tree root - * @index: an index inside the new entry - * @order: order of the new entry - * @item: new entry - * - * Call this function to replace several entries with one larger entry. - * The existing entries are presumed to not need freeing as a result of - * this call. - * - * The replacement entry will have all the tags set on it that were set - * on any of the entries it is replacing. - */ -int radix_tree_join(struct radix_tree_root *root, unsigned long index, - unsigned order, void *item) -{ - struct radix_tree_node *node; - void __rcu **slot; - int error; - - BUG_ON(radix_tree_is_internal_node(item)); - - error = __radix_tree_create(root, index, order, &node, &slot); - if (!error) - error = insert_entries(node, slot, item, order, true); - if (error > 0) - error = 0; - - return error; -} - -/** - * radix_tree_split - Split an entry into smaller entries - * @root: radix tree root - * @index: An index within the large entry - * @order: Order of new entries - * - * Call this function as the first step in replacing a multiorder entry - * with several entries of lower order. After this function returns, - * loop over the relevant portion of the tree using radix_tree_for_each_slot() - * and call radix_tree_iter_replace() to set up each new entry. - * - * The tags from this entry are replicated to all the new entries. - * - * The radix tree should be locked against modification during the entire - * replacement operation. Lock-free lookups will see RADIX_TREE_RETRY which - * should prompt RCU walkers to restart the lookup from the root. - */ -int radix_tree_split(struct radix_tree_root *root, unsigned long index, - unsigned order) -{ - struct radix_tree_node *parent, *node, *child; - void __rcu **slot; - unsigned int offset, end; - unsigned n, tag, tags = 0; - gfp_t gfp = root_gfp_mask(root); - - if (!__radix_tree_lookup(root, index, &parent, &slot)) - return -ENOENT; - if (!parent) - return -ENOENT; - - offset = get_slot_offset(parent, slot); - - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - if (tag_get(parent, tag, offset)) - tags |= 1 << tag; - - for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) { - if (!is_sibling_entry(parent, - rcu_dereference_raw(parent->slots[end]))) - break; - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - if (tags & (1 << tag)) - tag_set(parent, tag, end); - /* rcu_assign_pointer ensures tags are set before RETRY */ - rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY); - } - rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY); - parent->exceptional -= (end - offset); - - if (order == parent->shift) - return 0; - if (order > parent->shift) { - while (offset < end) - offset += insert_entries(parent, &parent->slots[offset], - RADIX_TREE_RETRY, order, true); - return 0; - } - - node = parent; - - for (;;) { - if (node->shift > order) { - child = radix_tree_node_alloc(gfp, node, root, - node->shift - RADIX_TREE_MAP_SHIFT, - offset, 0, 0); - if (!child) - goto nomem; - if (node != parent) { - node->count++; - rcu_assign_pointer(node->slots[offset], - node_to_entry(child)); - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - if (tags & (1 << tag)) - tag_set(node, tag, offset); - } - - node = child; - offset = 0; - continue; - } - - n = insert_entries(node, &node->slots[offset], - RADIX_TREE_RETRY, order, false); - BUG_ON(n > RADIX_TREE_MAP_SIZE); - - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - if (tags & (1 << tag)) - tag_set(node, tag, offset); - offset += n; - - while (offset == RADIX_TREE_MAP_SIZE) { - if (node == parent) - break; - offset = node->offset; - child = node; - node = node->parent; - rcu_assign_pointer(node->slots[offset], - node_to_entry(child)); - offset++; - } - if ((node == parent) && (offset == end)) - return 0; - } - - nomem: - /* Shouldn't happen; did user forget to preload? */ - /* TODO: free all the allocated nodes */ - WARN_ON(1); - return -ENOMEM; -} -#endif - static void node_tag_set(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) @@ -1447,18 +1013,6 @@ void *radix_tree_tag_set(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_tag_set); -/** - * radix_tree_iter_tag_set - set a tag on the current iterator entry - * @root: radix tree root - * @iter: iterator state - * @tag: tag to set - */ -void radix_tree_iter_tag_set(struct radix_tree_root *root, - const struct radix_tree_iter *iter, unsigned int tag) -{ - node_tag_set(root, iter->node, tag, iter_offset(iter)); -} - static void node_tag_clear(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) @@ -1574,14 +1128,6 @@ int radix_tree_tag_get(const struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_tag_get); -static inline void __set_iter_shift(struct radix_tree_iter *iter, - unsigned int shift) -{ -#ifdef CONFIG_RADIX_TREE_MULTIORDER - iter->shift = shift; -#endif -} - /* Construct iter->tags bit-mask from node->tags[tag] array */ static void set_iter_tags(struct radix_tree_iter *iter, struct radix_tree_node *node, unsigned offset, @@ -1608,92 +1154,11 @@ static void set_iter_tags(struct radix_tree_iter *iter, } } -#ifdef CONFIG_RADIX_TREE_MULTIORDER -static void __rcu **skip_siblings(struct radix_tree_node **nodep, - void __rcu **slot, struct radix_tree_iter *iter) -{ - while (iter->index < iter->next_index) { - *nodep = rcu_dereference_raw(*slot); - if (*nodep && !is_sibling_entry(iter->node, *nodep)) - return slot; - slot++; - iter->index = __radix_tree_iter_add(iter, 1); - iter->tags >>= 1; - } - - *nodep = NULL; - return NULL; -} - -void __rcu **__radix_tree_next_slot(void __rcu **slot, - struct radix_tree_iter *iter, unsigned flags) -{ - unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; - struct radix_tree_node *node; - - slot = skip_siblings(&node, slot, iter); - - while (radix_tree_is_internal_node(node)) { - unsigned offset; - unsigned long next_index; - - if (node == RADIX_TREE_RETRY) - return slot; - node = entry_to_node(node); - iter->node = node; - iter->shift = node->shift; - - if (flags & RADIX_TREE_ITER_TAGGED) { - offset = radix_tree_find_next_bit(node, tag, 0); - if (offset == RADIX_TREE_MAP_SIZE) - return NULL; - slot = &node->slots[offset]; - iter->index = __radix_tree_iter_add(iter, offset); - set_iter_tags(iter, node, offset, tag); - node = rcu_dereference_raw(*slot); - } else { - offset = 0; - slot = &node->slots[0]; - for (;;) { - node = rcu_dereference_raw(*slot); - if (node) - break; - slot++; - offset++; - if (offset == RADIX_TREE_MAP_SIZE) - return NULL; - } - iter->index = __radix_tree_iter_add(iter, offset); - } - if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0)) - goto none; - next_index = (iter->index | shift_maxindex(iter->shift)) + 1; - if (next_index < iter->next_index) - iter->next_index = next_index; - } - - return slot; - none: - iter->next_index = 0; - return NULL; -} -EXPORT_SYMBOL(__radix_tree_next_slot); -#else -static void __rcu **skip_siblings(struct radix_tree_node **nodep, - void __rcu **slot, struct radix_tree_iter *iter) -{ - return slot; -} -#endif - void __rcu **radix_tree_iter_resume(void __rcu **slot, struct radix_tree_iter *iter) { - struct radix_tree_node *node; - slot++; iter->index = __radix_tree_iter_add(iter, 1); - skip_siblings(&node, slot, iter); iter->next_index = iter->index; iter->tags = 0; return NULL; @@ -1744,8 +1209,7 @@ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root, iter->next_index = maxindex + 1; iter->tags = 1; iter->node = NULL; - __set_iter_shift(iter, 0); - return (void __rcu **)&root->rnode; + return (void __rcu **)&root->xa_head; } do { @@ -1765,8 +1229,6 @@ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root, while (++offset < RADIX_TREE_MAP_SIZE) { void *slot = rcu_dereference_raw( node->slots[offset]); - if (is_sibling_entry(node, slot)) - continue; if (slot) break; } @@ -1784,13 +1246,12 @@ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root, goto restart; if (child == RADIX_TREE_RETRY) break; - } while (radix_tree_is_internal_node(child)); + } while (node->shift && radix_tree_is_internal_node(child)); /* Update the iterator state */ - iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); + iter->index = (index &~ node_maxindex(node)) | offset; iter->next_index = (index | node_maxindex(node)) + 1; iter->node = node; - __set_iter_shift(iter, node->shift); if (flags & RADIX_TREE_ITER_TAGGED) set_iter_tags(iter, node, offset, tag); @@ -1846,48 +1307,6 @@ radix_tree_gang_lookup(const struct radix_tree_root *root, void **results, } EXPORT_SYMBOL(radix_tree_gang_lookup); -/** - * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree - * @root: radix tree root - * @results: where the results of the lookup are placed - * @indices: where their indices should be placed (but usually NULL) - * @first_index: start the lookup from this key - * @max_items: place up to this many items at *results - * - * Performs an index-ascending scan of the tree for present items. Places - * their slots at *@results and returns the number of items which were - * placed at *@results. - * - * The implementation is naive. - * - * Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must - * be dereferenced with radix_tree_deref_slot, and if using only RCU - * protection, radix_tree_deref_slot may fail requiring a retry. - */ -unsigned int -radix_tree_gang_lookup_slot(const struct radix_tree_root *root, - void __rcu ***results, unsigned long *indices, - unsigned long first_index, unsigned int max_items) -{ - struct radix_tree_iter iter; - void __rcu **slot; - unsigned int ret = 0; - - if (unlikely(!max_items)) - return 0; - - radix_tree_for_each_slot(slot, root, &iter, first_index) { - results[ret] = slot; - if (indices) - indices[ret] = iter.index; - if (++ret == max_items) - break; - } - - return ret; -} -EXPORT_SYMBOL(radix_tree_gang_lookup_slot); - /** * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree * based on a tag @@ -1964,28 +1383,11 @@ radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); -/** - * __radix_tree_delete_node - try to free node after clearing a slot - * @root: radix tree root - * @node: node containing @index - * @update_node: callback for changing leaf nodes - * - * After clearing the slot at @index in @node from radix tree - * rooted at @root, call this function to attempt freeing the - * node and shrinking the tree. - */ -void __radix_tree_delete_node(struct radix_tree_root *root, - struct radix_tree_node *node, - radix_tree_update_node_t update_node) -{ - delete_node(root, node, update_node); -} - static bool __radix_tree_delete(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot) { void *old = rcu_dereference_raw(*slot); - int exceptional = radix_tree_exceptional_entry(old) ? -1 : 0; + int values = xa_is_value(old) ? -1 : 0; unsigned offset = get_slot_offset(node, slot); int tag; @@ -1995,8 +1397,8 @@ static bool __radix_tree_delete(struct radix_tree_root *root, for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) node_tag_clear(root, node, tag, offset); - replace_slot(slot, NULL, node, -1, exceptional); - return node && delete_node(root, node, NULL); + replace_slot(slot, NULL, node, -1, values); + return node && delete_node(root, node); } /** @@ -2068,19 +1470,6 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_delete); -void radix_tree_clear_tags(struct radix_tree_root *root, - struct radix_tree_node *node, - void __rcu **slot) -{ - if (node) { - unsigned int tag, offset = get_slot_offset(node, slot); - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - node_tag_clear(root, node, tag, offset); - } else { - root_tag_clear_all(root); - } -} - /** * radix_tree_tagged - test whether any items in the tree are tagged * @root: radix tree root @@ -2106,33 +1495,12 @@ void idr_preload(gfp_t gfp_mask) } EXPORT_SYMBOL(idr_preload); -int ida_pre_get(struct ida *ida, gfp_t gfp) -{ - /* - * The IDA API has no preload_end() equivalent. Instead, - * ida_get_new() can return -EAGAIN, prompting the caller - * to return to the ida_pre_get() step. - */ - if (!__radix_tree_preload(gfp, IDA_PRELOAD_SIZE)) - preempt_enable(); - - if (!this_cpu_read(ida_bitmap)) { - struct ida_bitmap *bitmap = kzalloc(sizeof(*bitmap), gfp); - if (!bitmap) - return 0; - if (this_cpu_cmpxchg(ida_bitmap, NULL, bitmap)) - kfree(bitmap); - } - - return 1; -} - void __rcu **idr_get_free(struct radix_tree_root *root, struct radix_tree_iter *iter, gfp_t gfp, unsigned long max) { struct radix_tree_node *node = NULL, *child; - void __rcu **slot = (void __rcu **)&root->rnode; + void __rcu **slot = (void __rcu **)&root->xa_head; unsigned long maxindex, start = iter->next_index; unsigned int shift, offset = 0; @@ -2148,8 +1516,10 @@ void __rcu **idr_get_free(struct radix_tree_root *root, if (error < 0) return ERR_PTR(error); shift = error; - child = rcu_dereference_raw(root->rnode); + child = rcu_dereference_raw(root->xa_head); } + if (start == 0 && shift == 0) + shift = RADIX_TREE_MAP_SHIFT; while (shift) { shift -= RADIX_TREE_MAP_SHIFT; @@ -2192,7 +1562,6 @@ void __rcu **idr_get_free(struct radix_tree_root *root, else iter->next_index = 1; iter->node = node; - __set_iter_shift(iter, shift); set_iter_tags(iter, node, offset, IDR_FREE); return slot; @@ -2211,10 +1580,10 @@ void __rcu **idr_get_free(struct radix_tree_root *root, */ void idr_destroy(struct idr *idr) { - struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.rnode); + struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head); if (radix_tree_is_internal_node(node)) radix_tree_free_nodes(node); - idr->idr_rt.rnode = NULL; + idr->idr_rt.xa_head = NULL; root_tag_set(&idr->idr_rt, IDR_FREE); } EXPORT_SYMBOL(idr_destroy); @@ -2228,31 +1597,6 @@ radix_tree_node_ctor(void *arg) INIT_LIST_HEAD(&node->private_list); } -static __init unsigned long __maxindex(unsigned int height) -{ - unsigned int width = height * RADIX_TREE_MAP_SHIFT; - int shift = RADIX_TREE_INDEX_BITS - width; - - if (shift < 0) - return ~0UL; - if (shift >= BITS_PER_LONG) - return 0UL; - return ~0UL >> shift; -} - -static __init void radix_tree_init_maxnodes(void) -{ - unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1]; - unsigned int i, j; - - for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) - height_to_maxindex[i] = __maxindex(i); - for (i = 0; i < ARRAY_SIZE(height_to_maxnodes); i++) { - for (j = i; j > 0; j--) - height_to_maxnodes[i] += height_to_maxindex[j - 1] + 1; - } -} - static int radix_tree_cpu_dead(unsigned int cpu) { struct radix_tree_preload *rtp; @@ -2266,8 +1610,6 @@ static int radix_tree_cpu_dead(unsigned int cpu) kmem_cache_free(radix_tree_node_cachep, node); rtp->nr--; } - kfree(per_cpu(ida_bitmap, cpu)); - per_cpu(ida_bitmap, cpu) = NULL; return 0; } @@ -2277,11 +1619,11 @@ void __init radix_tree_init(void) BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK); + BUILD_BUG_ON(XA_CHUNK_SIZE > 255); radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); - radix_tree_init_maxnodes(); ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead", NULL, radix_tree_cpu_dead); WARN_ON(ret < 0); diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 5d73f5cb4d8a..79777645cac9 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -27,7 +27,7 @@ ifeq ($(ARCH),arm) CFLAGS += -I../../../arch/arm/include -mfpu=neon HAS_NEON = yes endif -ifeq ($(ARCH),arm64) +ifeq ($(ARCH),aarch64) CFLAGS += -I../../../arch/arm64/include HAS_NEON = yes endif @@ -41,7 +41,7 @@ ifeq ($(IS_X86),yes) gcc -c -x assembler - >&/dev/null && \ rm ./-.o && echo -DCONFIG_AS_AVX512=1) else ifeq ($(HAS_NEON),yes) - OBJS += neon.o neon1.o neon2.o neon4.o neon8.o + OBJS += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 else HAS_ALTIVEC := $(shell printf '\#include \nvector int a;\n' |\ diff --git a/lib/sg_pool.c b/lib/sg_pool.c index 6dd30615a201..d1c1e6388eaa 100644 --- a/lib/sg_pool.c +++ b/lib/sg_pool.c @@ -148,10 +148,9 @@ static __init int sg_pool_init(void) cleanup_sdb: for (i = 0; i < SG_MEMPOOL_NR; i++) { struct sg_pool *sgp = sg_pools + i; - if (sgp->pool) - mempool_destroy(sgp->pool); - if (sgp->slab) - kmem_cache_destroy(sgp->slab); + + mempool_destroy(sgp->pool); + kmem_cache_destroy(sgp->slab); } return -ENOMEM; diff --git a/lib/string.c b/lib/string.c index 2c0900a5d51a..38e4ca08e757 100644 --- a/lib/string.c +++ b/lib/string.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 08d3d59dca17..aa22bcaec1dc 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -6494,6 +6494,7 @@ static struct sk_buff *populate_skb(char *buf, int size) skb->queue_mapping = SKB_QUEUE_MAP; skb->vlan_tci = SKB_VLAN_TCI; skb->vlan_proto = htons(ETH_P_IP); + dev_net_set(&dev, &init_net); skb->dev = &dev; skb->dev->ifindex = SKB_DEV_IFINDEX; skb->dev->type = SKB_DEV_TYPE; diff --git a/lib/test_ida.c b/lib/test_ida.c index 2d1637d8136b..b06880625961 100644 --- a/lib/test_ida.c +++ b/lib/test_ida.c @@ -150,10 +150,10 @@ static void ida_check_conv(struct ida *ida) IDA_BUG_ON(ida, !ida_is_empty(ida)); } +static DEFINE_IDA(ida); + static int ida_checks(void) { - DEFINE_IDA(ida); - IDA_BUG_ON(&ida, !ida_is_empty(&ida)); ida_check_alloc(&ida); ida_check_destroy(&ida); diff --git a/lib/test_kasan.c b/lib/test_kasan.c index ec657105edbf..51b78405bf24 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -579,6 +579,73 @@ static noinline void __init kmem_cache_invalid_free(void) kmem_cache_destroy(cache); } +static noinline void __init kasan_memchr(void) +{ + char *ptr; + size_t size = 24; + + pr_info("out-of-bounds in memchr\n"); + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + if (!ptr) + return; + + memchr(ptr, '1', size + 1); + kfree(ptr); +} + +static noinline void __init kasan_memcmp(void) +{ + char *ptr; + size_t size = 24; + int arr[9]; + + pr_info("out-of-bounds in memcmp\n"); + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + if (!ptr) + return; + + memset(arr, 0, sizeof(arr)); + memcmp(ptr, arr, size+1); + kfree(ptr); +} + +static noinline void __init kasan_strings(void) +{ + char *ptr; + size_t size = 24; + + pr_info("use-after-free in strchr\n"); + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + if (!ptr) + return; + + kfree(ptr); + + /* + * Try to cause only 1 invalid access (less spam in dmesg). + * For that we need ptr to point to zeroed byte. + * Skip metadata that could be stored in freed object so ptr + * will likely point to zeroed byte. + */ + ptr += 16; + strchr(ptr, '1'); + + pr_info("use-after-free in strrchr\n"); + strrchr(ptr, '1'); + + pr_info("use-after-free in strcmp\n"); + strcmp(ptr, "2"); + + pr_info("use-after-free in strncmp\n"); + strncmp(ptr, "2", 1); + + pr_info("use-after-free in strlen\n"); + strlen(ptr); + + pr_info("use-after-free in strnlen\n"); + strnlen(ptr, 1); +} + static int __init kmalloc_tests_init(void) { /* @@ -618,6 +685,9 @@ static int __init kmalloc_tests_init(void) use_after_scope_test(); kmem_cache_double_free(); kmem_cache_invalid_free(); + kasan_memchr(); + kasan_memcmp(); + kasan_strings(); kasan_restore_multi_shot(multishot); diff --git a/lib/test_memcat_p.c b/lib/test_memcat_p.c new file mode 100644 index 000000000000..849c477d49d0 --- /dev/null +++ b/lib/test_memcat_p.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test cases for memcat_p() in lib/memcat_p.c + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +struct test_struct { + int num; + unsigned int magic; +}; + +#define MAGIC 0xf00ff00f +/* Size of each of the NULL-terminated input arrays */ +#define INPUT_MAX 128 +/* Expected number of non-NULL elements in the output array */ +#define EXPECT (INPUT_MAX * 2 - 2) + +static int __init test_memcat_p_init(void) +{ + struct test_struct **in0, **in1, **out, **p; + int err = -ENOMEM, i, r, total = 0; + + in0 = kcalloc(INPUT_MAX, sizeof(*in0), GFP_KERNEL); + if (!in0) + return err; + + in1 = kcalloc(INPUT_MAX, sizeof(*in1), GFP_KERNEL); + if (!in1) + goto err_free_in0; + + for (i = 0, r = 1; i < INPUT_MAX - 1; i++) { + in0[i] = kmalloc(sizeof(**in0), GFP_KERNEL); + if (!in0[i]) + goto err_free_elements; + + in1[i] = kmalloc(sizeof(**in1), GFP_KERNEL); + if (!in1[i]) { + kfree(in0[i]); + goto err_free_elements; + } + + /* lifted from test_sort.c */ + r = (r * 725861) % 6599; + in0[i]->num = r; + in1[i]->num = -r; + in0[i]->magic = MAGIC; + in1[i]->magic = MAGIC; + } + + in0[i] = in1[i] = NULL; + + out = memcat_p(in0, in1); + if (!out) + goto err_free_all_elements; + + err = -EINVAL; + for (i = 0, p = out; *p && (i < INPUT_MAX * 2 - 1); p++, i++) { + total += (*p)->num; + + if ((*p)->magic != MAGIC) { + pr_err("test failed: wrong magic at %d: %u\n", i, + (*p)->magic); + goto err_free_out; + } + } + + if (total) { + pr_err("test failed: expected zero total, got %d\n", total); + goto err_free_out; + } + + if (i != EXPECT) { + pr_err("test failed: expected output size %d, got %d\n", + EXPECT, i); + goto err_free_out; + } + + for (i = 0; i < INPUT_MAX - 1; i++) + if (out[i] != in0[i] || out[i + INPUT_MAX - 1] != in1[i]) { + pr_err("test failed: wrong element order at %d\n", i); + goto err_free_out; + } + + err = 0; + pr_info("test passed\n"); + +err_free_out: + kfree(out); +err_free_all_elements: + i = INPUT_MAX; +err_free_elements: + for (i--; i >= 0; i--) { + kfree(in1[i]); + kfree(in0[i]); + } + + kfree(in1); +err_free_in0: + kfree(in0); + + return err; +} + +static void __exit test_memcat_p_exit(void) +{ +} + +module_init(test_memcat_p_init); +module_exit(test_memcat_p_exit); + +MODULE_LICENSE("GPL"); diff --git a/lib/test_xarray.c b/lib/test_xarray.c new file mode 100644 index 000000000000..aa47754150ce --- /dev/null +++ b/lib/test_xarray.c @@ -0,0 +1,1238 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * test_xarray.c: Test the XArray API + * Copyright (c) 2017-2018 Microsoft Corporation + * Author: Matthew Wilcox + */ + +#include +#include + +static unsigned int tests_run; +static unsigned int tests_passed; + +#ifndef XA_DEBUG +# ifdef __KERNEL__ +void xa_dump(const struct xarray *xa) { } +# endif +#undef XA_BUG_ON +#define XA_BUG_ON(xa, x) do { \ + tests_run++; \ + if (x) { \ + printk("BUG at %s:%d\n", __func__, __LINE__); \ + xa_dump(xa); \ + dump_stack(); \ + } else { \ + tests_passed++; \ + } \ +} while (0) +#endif + +static void *xa_store_index(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + return xa_store(xa, index, xa_mk_value(index & LONG_MAX), gfp); +} + +static void xa_alloc_index(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + u32 id = 0; + + XA_BUG_ON(xa, xa_alloc(xa, &id, UINT_MAX, xa_mk_value(index & LONG_MAX), + gfp) != 0); + XA_BUG_ON(xa, id != index); +} + +static void xa_erase_index(struct xarray *xa, unsigned long index) +{ + XA_BUG_ON(xa, xa_erase(xa, index) != xa_mk_value(index & LONG_MAX)); + XA_BUG_ON(xa, xa_load(xa, index) != NULL); +} + +/* + * If anyone needs this, please move it to xarray.c. We have no current + * users outside the test suite because all current multislot users want + * to use the advanced API. + */ +static void *xa_store_order(struct xarray *xa, unsigned long index, + unsigned order, void *entry, gfp_t gfp) +{ + XA_STATE_ORDER(xas, xa, index, order); + void *curr; + + do { + xas_lock(&xas); + curr = xas_store(&xas, entry); + xas_unlock(&xas); + } while (xas_nomem(&xas, gfp)); + + return curr; +} + +static noinline void check_xa_err(struct xarray *xa) +{ + XA_BUG_ON(xa, xa_err(xa_store_index(xa, 0, GFP_NOWAIT)) != 0); + XA_BUG_ON(xa, xa_err(xa_erase(xa, 0)) != 0); +#ifndef __KERNEL__ + /* The kernel does not fail GFP_NOWAIT allocations */ + XA_BUG_ON(xa, xa_err(xa_store_index(xa, 1, GFP_NOWAIT)) != -ENOMEM); + XA_BUG_ON(xa, xa_err(xa_store_index(xa, 1, GFP_NOWAIT)) != -ENOMEM); +#endif + XA_BUG_ON(xa, xa_err(xa_store_index(xa, 1, GFP_KERNEL)) != 0); + XA_BUG_ON(xa, xa_err(xa_store(xa, 1, xa_mk_value(0), GFP_KERNEL)) != 0); + XA_BUG_ON(xa, xa_err(xa_erase(xa, 1)) != 0); +// kills the test-suite :-( +// XA_BUG_ON(xa, xa_err(xa_store(xa, 0, xa_mk_internal(0), 0)) != -EINVAL); +} + +static noinline void check_xas_retry(struct xarray *xa) +{ + XA_STATE(xas, xa, 0); + void *entry; + + xa_store_index(xa, 0, GFP_KERNEL); + xa_store_index(xa, 1, GFP_KERNEL); + + rcu_read_lock(); + XA_BUG_ON(xa, xas_find(&xas, ULONG_MAX) != xa_mk_value(0)); + xa_erase_index(xa, 1); + XA_BUG_ON(xa, !xa_is_retry(xas_reload(&xas))); + XA_BUG_ON(xa, xas_retry(&xas, NULL)); + XA_BUG_ON(xa, xas_retry(&xas, xa_mk_value(0))); + xas_reset(&xas); + XA_BUG_ON(xa, xas.xa_node != XAS_RESTART); + XA_BUG_ON(xa, xas_next_entry(&xas, ULONG_MAX) != xa_mk_value(0)); + XA_BUG_ON(xa, xas.xa_node != NULL); + + XA_BUG_ON(xa, xa_store_index(xa, 1, GFP_KERNEL) != NULL); + XA_BUG_ON(xa, !xa_is_internal(xas_reload(&xas))); + xas.xa_node = XAS_RESTART; + XA_BUG_ON(xa, xas_next_entry(&xas, ULONG_MAX) != xa_mk_value(0)); + rcu_read_unlock(); + + /* Make sure we can iterate through retry entries */ + xas_lock(&xas); + xas_set(&xas, 0); + xas_store(&xas, XA_RETRY_ENTRY); + xas_set(&xas, 1); + xas_store(&xas, XA_RETRY_ENTRY); + + xas_set(&xas, 0); + xas_for_each(&xas, entry, ULONG_MAX) { + xas_store(&xas, xa_mk_value(xas.xa_index)); + } + xas_unlock(&xas); + + xa_erase_index(xa, 0); + xa_erase_index(xa, 1); +} + +static noinline void check_xa_load(struct xarray *xa) +{ + unsigned long i, j; + + for (i = 0; i < 1024; i++) { + for (j = 0; j < 1024; j++) { + void *entry = xa_load(xa, j); + if (j < i) + XA_BUG_ON(xa, xa_to_value(entry) != j); + else + XA_BUG_ON(xa, entry); + } + XA_BUG_ON(xa, xa_store_index(xa, i, GFP_KERNEL) != NULL); + } + + for (i = 0; i < 1024; i++) { + for (j = 0; j < 1024; j++) { + void *entry = xa_load(xa, j); + if (j >= i) + XA_BUG_ON(xa, xa_to_value(entry) != j); + else + XA_BUG_ON(xa, entry); + } + xa_erase_index(xa, i); + } + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_xa_mark_1(struct xarray *xa, unsigned long index) +{ + unsigned int order; + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 8 : 1; + + /* NULL elements have no marks set */ + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); + xa_set_mark(xa, index, XA_MARK_0); + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); + + /* Storing a pointer will not make a mark appear */ + XA_BUG_ON(xa, xa_store_index(xa, index, GFP_KERNEL) != NULL); + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); + xa_set_mark(xa, index, XA_MARK_0); + XA_BUG_ON(xa, !xa_get_mark(xa, index, XA_MARK_0)); + + /* Setting one mark will not set another mark */ + XA_BUG_ON(xa, xa_get_mark(xa, index + 1, XA_MARK_0)); + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_1)); + + /* Storing NULL clears marks, and they can't be set again */ + xa_erase_index(xa, index); + XA_BUG_ON(xa, !xa_empty(xa)); + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); + xa_set_mark(xa, index, XA_MARK_0); + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); + + /* + * Storing a multi-index entry over entries with marks gives the + * entire entry the union of the marks + */ + BUG_ON((index % 4) != 0); + for (order = 2; order < max_order; order++) { + unsigned long base = round_down(index, 1UL << order); + unsigned long next = base + (1UL << order); + unsigned long i; + + XA_BUG_ON(xa, xa_store_index(xa, index + 1, GFP_KERNEL)); + xa_set_mark(xa, index + 1, XA_MARK_0); + XA_BUG_ON(xa, xa_store_index(xa, index + 2, GFP_KERNEL)); + xa_set_mark(xa, index + 2, XA_MARK_1); + XA_BUG_ON(xa, xa_store_index(xa, next, GFP_KERNEL)); + xa_store_order(xa, index, order, xa_mk_value(index), + GFP_KERNEL); + for (i = base; i < next; i++) { + XA_STATE(xas, xa, i); + unsigned int seen = 0; + void *entry; + + XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_0)); + XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_1)); + XA_BUG_ON(xa, xa_get_mark(xa, i, XA_MARK_2)); + + /* We should see two elements in the array */ + xas_for_each(&xas, entry, ULONG_MAX) + seen++; + XA_BUG_ON(xa, seen != 2); + + /* One of which is marked */ + xas_set(&xas, 0); + seen = 0; + xas_for_each_marked(&xas, entry, ULONG_MAX, XA_MARK_0) + seen++; + XA_BUG_ON(xa, seen != 1); + } + XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_0)); + XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_1)); + XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_2)); + xa_erase_index(xa, index); + xa_erase_index(xa, next); + XA_BUG_ON(xa, !xa_empty(xa)); + } + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_xa_mark_2(struct xarray *xa) +{ + XA_STATE(xas, xa, 0); + unsigned long index; + unsigned int count = 0; + void *entry; + + xa_store_index(xa, 0, GFP_KERNEL); + xa_set_mark(xa, 0, XA_MARK_0); + xas_lock(&xas); + xas_load(&xas); + xas_init_marks(&xas); + xas_unlock(&xas); + XA_BUG_ON(xa, !xa_get_mark(xa, 0, XA_MARK_0) == 0); + + for (index = 3500; index < 4500; index++) { + xa_store_index(xa, index, GFP_KERNEL); + xa_set_mark(xa, index, XA_MARK_0); + } + + xas_reset(&xas); + rcu_read_lock(); + xas_for_each_marked(&xas, entry, ULONG_MAX, XA_MARK_0) + count++; + rcu_read_unlock(); + XA_BUG_ON(xa, count != 1000); + + xas_lock(&xas); + xas_for_each(&xas, entry, ULONG_MAX) { + xas_init_marks(&xas); + XA_BUG_ON(xa, !xa_get_mark(xa, xas.xa_index, XA_MARK_0)); + XA_BUG_ON(xa, !xas_get_mark(&xas, XA_MARK_0)); + } + xas_unlock(&xas); + + xa_destroy(xa); +} + +static noinline void check_xa_mark(struct xarray *xa) +{ + unsigned long index; + + for (index = 0; index < 16384; index += 4) + check_xa_mark_1(xa, index); + + check_xa_mark_2(xa); +} + +static noinline void check_xa_shrink(struct xarray *xa) +{ + XA_STATE(xas, xa, 1); + struct xa_node *node; + unsigned int order; + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 15 : 1; + + XA_BUG_ON(xa, !xa_empty(xa)); + XA_BUG_ON(xa, xa_store_index(xa, 0, GFP_KERNEL) != NULL); + XA_BUG_ON(xa, xa_store_index(xa, 1, GFP_KERNEL) != NULL); + + /* + * Check that erasing the entry at 1 shrinks the tree and properly + * marks the node as being deleted. + */ + xas_lock(&xas); + XA_BUG_ON(xa, xas_load(&xas) != xa_mk_value(1)); + node = xas.xa_node; + XA_BUG_ON(xa, xa_entry_locked(xa, node, 0) != xa_mk_value(0)); + XA_BUG_ON(xa, xas_store(&xas, NULL) != xa_mk_value(1)); + XA_BUG_ON(xa, xa_load(xa, 1) != NULL); + XA_BUG_ON(xa, xas.xa_node != XAS_BOUNDS); + XA_BUG_ON(xa, xa_entry_locked(xa, node, 0) != XA_RETRY_ENTRY); + XA_BUG_ON(xa, xas_load(&xas) != NULL); + xas_unlock(&xas); + XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0)); + xa_erase_index(xa, 0); + XA_BUG_ON(xa, !xa_empty(xa)); + + for (order = 0; order < max_order; order++) { + unsigned long max = (1UL << order) - 1; + xa_store_order(xa, 0, order, xa_mk_value(0), GFP_KERNEL); + XA_BUG_ON(xa, xa_load(xa, max) != xa_mk_value(0)); + XA_BUG_ON(xa, xa_load(xa, max + 1) != NULL); + rcu_read_lock(); + node = xa_head(xa); + rcu_read_unlock(); + XA_BUG_ON(xa, xa_store_index(xa, ULONG_MAX, GFP_KERNEL) != + NULL); + rcu_read_lock(); + XA_BUG_ON(xa, xa_head(xa) == node); + rcu_read_unlock(); + XA_BUG_ON(xa, xa_load(xa, max + 1) != NULL); + xa_erase_index(xa, ULONG_MAX); + XA_BUG_ON(xa, xa->xa_head != node); + xa_erase_index(xa, 0); + } +} + +static noinline void check_cmpxchg(struct xarray *xa) +{ + void *FIVE = xa_mk_value(5); + void *SIX = xa_mk_value(6); + void *LOTS = xa_mk_value(12345678); + + XA_BUG_ON(xa, !xa_empty(xa)); + XA_BUG_ON(xa, xa_store_index(xa, 12345678, GFP_KERNEL) != NULL); + XA_BUG_ON(xa, xa_insert(xa, 12345678, xa, GFP_KERNEL) != -EEXIST); + XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, SIX, FIVE, GFP_KERNEL) != LOTS); + XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, LOTS, FIVE, GFP_KERNEL) != LOTS); + XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, FIVE, LOTS, GFP_KERNEL) != FIVE); + XA_BUG_ON(xa, xa_cmpxchg(xa, 5, FIVE, NULL, GFP_KERNEL) != NULL); + XA_BUG_ON(xa, xa_cmpxchg(xa, 5, NULL, FIVE, GFP_KERNEL) != NULL); + xa_erase_index(xa, 12345678); + xa_erase_index(xa, 5); + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_reserve(struct xarray *xa) +{ + void *entry; + unsigned long index = 0; + + /* An array with a reserved entry is not empty */ + XA_BUG_ON(xa, !xa_empty(xa)); + xa_reserve(xa, 12345678, GFP_KERNEL); + XA_BUG_ON(xa, xa_empty(xa)); + XA_BUG_ON(xa, xa_load(xa, 12345678)); + xa_release(xa, 12345678); + XA_BUG_ON(xa, !xa_empty(xa)); + + /* Releasing a used entry does nothing */ + xa_reserve(xa, 12345678, GFP_KERNEL); + XA_BUG_ON(xa, xa_store_index(xa, 12345678, GFP_NOWAIT) != NULL); + xa_release(xa, 12345678); + xa_erase_index(xa, 12345678); + XA_BUG_ON(xa, !xa_empty(xa)); + + /* cmpxchg sees a reserved entry as NULL */ + xa_reserve(xa, 12345678, GFP_KERNEL); + XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, NULL, xa_mk_value(12345678), + GFP_NOWAIT) != NULL); + xa_release(xa, 12345678); + xa_erase_index(xa, 12345678); + XA_BUG_ON(xa, !xa_empty(xa)); + + /* Can iterate through a reserved entry */ + xa_store_index(xa, 5, GFP_KERNEL); + xa_reserve(xa, 6, GFP_KERNEL); + xa_store_index(xa, 7, GFP_KERNEL); + + xa_for_each(xa, entry, index, ULONG_MAX, XA_PRESENT) { + XA_BUG_ON(xa, index != 5 && index != 7); + } + xa_destroy(xa); +} + +static noinline void check_xas_erase(struct xarray *xa) +{ + XA_STATE(xas, xa, 0); + void *entry; + unsigned long i, j; + + for (i = 0; i < 200; i++) { + for (j = i; j < 2 * i + 17; j++) { + xas_set(&xas, j); + do { + xas_lock(&xas); + xas_store(&xas, xa_mk_value(j)); + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + } + + xas_set(&xas, ULONG_MAX); + do { + xas_lock(&xas); + xas_store(&xas, xa_mk_value(0)); + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + xas_lock(&xas); + xas_store(&xas, NULL); + + xas_set(&xas, 0); + j = i; + xas_for_each(&xas, entry, ULONG_MAX) { + XA_BUG_ON(xa, entry != xa_mk_value(j)); + xas_store(&xas, NULL); + j++; + } + xas_unlock(&xas); + XA_BUG_ON(xa, !xa_empty(xa)); + } +} + +#ifdef CONFIG_XARRAY_MULTI +static noinline void check_multi_store_1(struct xarray *xa, unsigned long index, + unsigned int order) +{ + XA_STATE(xas, xa, index); + unsigned long min = index & ~((1UL << order) - 1); + unsigned long max = min + (1UL << order); + + xa_store_order(xa, index, order, xa_mk_value(index), GFP_KERNEL); + XA_BUG_ON(xa, xa_load(xa, min) != xa_mk_value(index)); + XA_BUG_ON(xa, xa_load(xa, max - 1) != xa_mk_value(index)); + XA_BUG_ON(xa, xa_load(xa, max) != NULL); + XA_BUG_ON(xa, xa_load(xa, min - 1) != NULL); + + XA_BUG_ON(xa, xas_store(&xas, xa_mk_value(min)) != xa_mk_value(index)); + XA_BUG_ON(xa, xa_load(xa, min) != xa_mk_value(min)); + XA_BUG_ON(xa, xa_load(xa, max - 1) != xa_mk_value(min)); + XA_BUG_ON(xa, xa_load(xa, max) != NULL); + XA_BUG_ON(xa, xa_load(xa, min - 1) != NULL); + + xa_erase_index(xa, min); + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_multi_store_2(struct xarray *xa, unsigned long index, + unsigned int order) +{ + XA_STATE(xas, xa, index); + xa_store_order(xa, index, order, xa_mk_value(0), GFP_KERNEL); + + XA_BUG_ON(xa, xas_store(&xas, xa_mk_value(1)) != xa_mk_value(0)); + XA_BUG_ON(xa, xas.xa_index != index); + XA_BUG_ON(xa, xas_store(&xas, NULL) != xa_mk_value(1)); + XA_BUG_ON(xa, !xa_empty(xa)); +} +#endif + +static noinline void check_multi_store(struct xarray *xa) +{ +#ifdef CONFIG_XARRAY_MULTI + unsigned long i, j, k; + unsigned int max_order = (sizeof(long) == 4) ? 30 : 60; + + /* Loading from any position returns the same value */ + xa_store_order(xa, 0, 1, xa_mk_value(0), GFP_KERNEL); + XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0)); + XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(0)); + XA_BUG_ON(xa, xa_load(xa, 2) != NULL); + rcu_read_lock(); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->count != 2); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->nr_values != 2); + rcu_read_unlock(); + + /* Storing adjacent to the value does not alter the value */ + xa_store(xa, 3, xa, GFP_KERNEL); + XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0)); + XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(0)); + XA_BUG_ON(xa, xa_load(xa, 2) != NULL); + rcu_read_lock(); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->count != 3); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->nr_values != 2); + rcu_read_unlock(); + + /* Overwriting multiple indexes works */ + xa_store_order(xa, 0, 2, xa_mk_value(1), GFP_KERNEL); + XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(1)); + XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(1)); + XA_BUG_ON(xa, xa_load(xa, 2) != xa_mk_value(1)); + XA_BUG_ON(xa, xa_load(xa, 3) != xa_mk_value(1)); + XA_BUG_ON(xa, xa_load(xa, 4) != NULL); + rcu_read_lock(); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->count != 4); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->nr_values != 4); + rcu_read_unlock(); + + /* We can erase multiple values with a single store */ + xa_store_order(xa, 0, 63, NULL, GFP_KERNEL); + XA_BUG_ON(xa, !xa_empty(xa)); + + /* Even when the first slot is empty but the others aren't */ + xa_store_index(xa, 1, GFP_KERNEL); + xa_store_index(xa, 2, GFP_KERNEL); + xa_store_order(xa, 0, 2, NULL, GFP_KERNEL); + XA_BUG_ON(xa, !xa_empty(xa)); + + for (i = 0; i < max_order; i++) { + for (j = 0; j < max_order; j++) { + xa_store_order(xa, 0, i, xa_mk_value(i), GFP_KERNEL); + xa_store_order(xa, 0, j, xa_mk_value(j), GFP_KERNEL); + + for (k = 0; k < max_order; k++) { + void *entry = xa_load(xa, (1UL << k) - 1); + if ((i < k) && (j < k)) + XA_BUG_ON(xa, entry != NULL); + else + XA_BUG_ON(xa, entry != xa_mk_value(j)); + } + + xa_erase(xa, 0); + XA_BUG_ON(xa, !xa_empty(xa)); + } + } + + for (i = 0; i < 20; i++) { + check_multi_store_1(xa, 200, i); + check_multi_store_1(xa, 0, i); + check_multi_store_1(xa, (1UL << i) + 1, i); + } + check_multi_store_2(xa, 4095, 9); +#endif +} + +static DEFINE_XARRAY_ALLOC(xa0); + +static noinline void check_xa_alloc(void) +{ + int i; + u32 id; + + /* An empty array should assign 0 to the first alloc */ + xa_alloc_index(&xa0, 0, GFP_KERNEL); + + /* Erasing it should make the array empty again */ + xa_erase_index(&xa0, 0); + XA_BUG_ON(&xa0, !xa_empty(&xa0)); + + /* And it should assign 0 again */ + xa_alloc_index(&xa0, 0, GFP_KERNEL); + + /* The next assigned ID should be 1 */ + xa_alloc_index(&xa0, 1, GFP_KERNEL); + xa_erase_index(&xa0, 1); + + /* Storing a value should mark it used */ + xa_store_index(&xa0, 1, GFP_KERNEL); + xa_alloc_index(&xa0, 2, GFP_KERNEL); + + /* If we then erase 0, it should be free */ + xa_erase_index(&xa0, 0); + xa_alloc_index(&xa0, 0, GFP_KERNEL); + + xa_erase_index(&xa0, 1); + xa_erase_index(&xa0, 2); + + for (i = 1; i < 5000; i++) { + xa_alloc_index(&xa0, i, GFP_KERNEL); + } + + xa_destroy(&xa0); + + id = 0xfffffffeU; + XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, UINT_MAX, xa_mk_value(0), + GFP_KERNEL) != 0); + XA_BUG_ON(&xa0, id != 0xfffffffeU); + XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, UINT_MAX, xa_mk_value(0), + GFP_KERNEL) != 0); + XA_BUG_ON(&xa0, id != 0xffffffffU); + XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, UINT_MAX, xa_mk_value(0), + GFP_KERNEL) != -ENOSPC); + XA_BUG_ON(&xa0, id != 0xffffffffU); + xa_destroy(&xa0); +} + +static noinline void __check_store_iter(struct xarray *xa, unsigned long start, + unsigned int order, unsigned int present) +{ + XA_STATE_ORDER(xas, xa, start, order); + void *entry; + unsigned int count = 0; + +retry: + xas_lock(&xas); + xas_for_each_conflict(&xas, entry) { + XA_BUG_ON(xa, !xa_is_value(entry)); + XA_BUG_ON(xa, entry < xa_mk_value(start)); + XA_BUG_ON(xa, entry > xa_mk_value(start + (1UL << order) - 1)); + count++; + } + xas_store(&xas, xa_mk_value(start)); + xas_unlock(&xas); + if (xas_nomem(&xas, GFP_KERNEL)) { + count = 0; + goto retry; + } + XA_BUG_ON(xa, xas_error(&xas)); + XA_BUG_ON(xa, count != present); + XA_BUG_ON(xa, xa_load(xa, start) != xa_mk_value(start)); + XA_BUG_ON(xa, xa_load(xa, start + (1UL << order) - 1) != + xa_mk_value(start)); + xa_erase_index(xa, start); +} + +static noinline void check_store_iter(struct xarray *xa) +{ + unsigned int i, j; + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1; + + for (i = 0; i < max_order; i++) { + unsigned int min = 1 << i; + unsigned int max = (2 << i) - 1; + __check_store_iter(xa, 0, i, 0); + XA_BUG_ON(xa, !xa_empty(xa)); + __check_store_iter(xa, min, i, 0); + XA_BUG_ON(xa, !xa_empty(xa)); + + xa_store_index(xa, min, GFP_KERNEL); + __check_store_iter(xa, min, i, 1); + XA_BUG_ON(xa, !xa_empty(xa)); + xa_store_index(xa, max, GFP_KERNEL); + __check_store_iter(xa, min, i, 1); + XA_BUG_ON(xa, !xa_empty(xa)); + + for (j = 0; j < min; j++) + xa_store_index(xa, j, GFP_KERNEL); + __check_store_iter(xa, 0, i, min); + XA_BUG_ON(xa, !xa_empty(xa)); + for (j = 0; j < min; j++) + xa_store_index(xa, min + j, GFP_KERNEL); + __check_store_iter(xa, min, i, min); + XA_BUG_ON(xa, !xa_empty(xa)); + } +#ifdef CONFIG_XARRAY_MULTI + xa_store_index(xa, 63, GFP_KERNEL); + xa_store_index(xa, 65, GFP_KERNEL); + __check_store_iter(xa, 64, 2, 1); + xa_erase_index(xa, 63); +#endif + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_multi_find(struct xarray *xa) +{ +#ifdef CONFIG_XARRAY_MULTI + unsigned long index; + + xa_store_order(xa, 12, 2, xa_mk_value(12), GFP_KERNEL); + XA_BUG_ON(xa, xa_store_index(xa, 16, GFP_KERNEL) != NULL); + + index = 0; + XA_BUG_ON(xa, xa_find(xa, &index, ULONG_MAX, XA_PRESENT) != + xa_mk_value(12)); + XA_BUG_ON(xa, index != 12); + index = 13; + XA_BUG_ON(xa, xa_find(xa, &index, ULONG_MAX, XA_PRESENT) != + xa_mk_value(12)); + XA_BUG_ON(xa, (index < 12) || (index >= 16)); + XA_BUG_ON(xa, xa_find_after(xa, &index, ULONG_MAX, XA_PRESENT) != + xa_mk_value(16)); + XA_BUG_ON(xa, index != 16); + + xa_erase_index(xa, 12); + xa_erase_index(xa, 16); + XA_BUG_ON(xa, !xa_empty(xa)); +#endif +} + +static noinline void check_multi_find_2(struct xarray *xa) +{ + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 10 : 1; + unsigned int i, j; + void *entry; + + for (i = 0; i < max_order; i++) { + unsigned long index = 1UL << i; + for (j = 0; j < index; j++) { + XA_STATE(xas, xa, j + index); + xa_store_index(xa, index - 1, GFP_KERNEL); + xa_store_order(xa, index, i, xa_mk_value(index), + GFP_KERNEL); + rcu_read_lock(); + xas_for_each(&xas, entry, ULONG_MAX) { + xa_erase_index(xa, index); + } + rcu_read_unlock(); + xa_erase_index(xa, index - 1); + XA_BUG_ON(xa, !xa_empty(xa)); + } + } +} + +static noinline void check_find(struct xarray *xa) +{ + unsigned long i, j, k; + + XA_BUG_ON(xa, !xa_empty(xa)); + + /* + * Check xa_find with all pairs between 0 and 99 inclusive, + * starting at every index between 0 and 99 + */ + for (i = 0; i < 100; i++) { + XA_BUG_ON(xa, xa_store_index(xa, i, GFP_KERNEL) != NULL); + xa_set_mark(xa, i, XA_MARK_0); + for (j = 0; j < i; j++) { + XA_BUG_ON(xa, xa_store_index(xa, j, GFP_KERNEL) != + NULL); + xa_set_mark(xa, j, XA_MARK_0); + for (k = 0; k < 100; k++) { + unsigned long index = k; + void *entry = xa_find(xa, &index, ULONG_MAX, + XA_PRESENT); + if (k <= j) + XA_BUG_ON(xa, index != j); + else if (k <= i) + XA_BUG_ON(xa, index != i); + else + XA_BUG_ON(xa, entry != NULL); + + index = k; + entry = xa_find(xa, &index, ULONG_MAX, + XA_MARK_0); + if (k <= j) + XA_BUG_ON(xa, index != j); + else if (k <= i) + XA_BUG_ON(xa, index != i); + else + XA_BUG_ON(xa, entry != NULL); + } + xa_erase_index(xa, j); + XA_BUG_ON(xa, xa_get_mark(xa, j, XA_MARK_0)); + XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_0)); + } + xa_erase_index(xa, i); + XA_BUG_ON(xa, xa_get_mark(xa, i, XA_MARK_0)); + } + XA_BUG_ON(xa, !xa_empty(xa)); + check_multi_find(xa); + check_multi_find_2(xa); +} + +/* See find_swap_entry() in mm/shmem.c */ +static noinline unsigned long xa_find_entry(struct xarray *xa, void *item) +{ + XA_STATE(xas, xa, 0); + unsigned int checked = 0; + void *entry; + + rcu_read_lock(); + xas_for_each(&xas, entry, ULONG_MAX) { + if (xas_retry(&xas, entry)) + continue; + if (entry == item) + break; + checked++; + if ((checked % 4) != 0) + continue; + xas_pause(&xas); + } + rcu_read_unlock(); + + return entry ? xas.xa_index : -1; +} + +static noinline void check_find_entry(struct xarray *xa) +{ +#ifdef CONFIG_XARRAY_MULTI + unsigned int order; + unsigned long offset, index; + + for (order = 0; order < 20; order++) { + for (offset = 0; offset < (1UL << (order + 3)); + offset += (1UL << order)) { + for (index = 0; index < (1UL << (order + 5)); + index += (1UL << order)) { + xa_store_order(xa, index, order, + xa_mk_value(index), GFP_KERNEL); + XA_BUG_ON(xa, xa_load(xa, index) != + xa_mk_value(index)); + XA_BUG_ON(xa, xa_find_entry(xa, + xa_mk_value(index)) != index); + } + XA_BUG_ON(xa, xa_find_entry(xa, xa) != -1); + xa_destroy(xa); + } + } +#endif + + XA_BUG_ON(xa, xa_find_entry(xa, xa) != -1); + xa_store_index(xa, ULONG_MAX, GFP_KERNEL); + XA_BUG_ON(xa, xa_find_entry(xa, xa) != -1); + XA_BUG_ON(xa, xa_find_entry(xa, xa_mk_value(LONG_MAX)) != -1); + xa_erase_index(xa, ULONG_MAX); + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_move_small(struct xarray *xa, unsigned long idx) +{ + XA_STATE(xas, xa, 0); + unsigned long i; + + xa_store_index(xa, 0, GFP_KERNEL); + xa_store_index(xa, idx, GFP_KERNEL); + + rcu_read_lock(); + for (i = 0; i < idx * 4; i++) { + void *entry = xas_next(&xas); + if (i <= idx) + XA_BUG_ON(xa, xas.xa_node == XAS_RESTART); + XA_BUG_ON(xa, xas.xa_index != i); + if (i == 0 || i == idx) + XA_BUG_ON(xa, entry != xa_mk_value(i)); + else + XA_BUG_ON(xa, entry != NULL); + } + xas_next(&xas); + XA_BUG_ON(xa, xas.xa_index != i); + + do { + void *entry = xas_prev(&xas); + i--; + if (i <= idx) + XA_BUG_ON(xa, xas.xa_node == XAS_RESTART); + XA_BUG_ON(xa, xas.xa_index != i); + if (i == 0 || i == idx) + XA_BUG_ON(xa, entry != xa_mk_value(i)); + else + XA_BUG_ON(xa, entry != NULL); + } while (i > 0); + + xas_set(&xas, ULONG_MAX); + XA_BUG_ON(xa, xas_next(&xas) != NULL); + XA_BUG_ON(xa, xas.xa_index != ULONG_MAX); + XA_BUG_ON(xa, xas_next(&xas) != xa_mk_value(0)); + XA_BUG_ON(xa, xas.xa_index != 0); + XA_BUG_ON(xa, xas_prev(&xas) != NULL); + XA_BUG_ON(xa, xas.xa_index != ULONG_MAX); + rcu_read_unlock(); + + xa_erase_index(xa, 0); + xa_erase_index(xa, idx); + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_move(struct xarray *xa) +{ + XA_STATE(xas, xa, (1 << 16) - 1); + unsigned long i; + + for (i = 0; i < (1 << 16); i++) + XA_BUG_ON(xa, xa_store_index(xa, i, GFP_KERNEL) != NULL); + + rcu_read_lock(); + do { + void *entry = xas_prev(&xas); + i--; + XA_BUG_ON(xa, entry != xa_mk_value(i)); + XA_BUG_ON(xa, i != xas.xa_index); + } while (i != 0); + + XA_BUG_ON(xa, xas_prev(&xas) != NULL); + XA_BUG_ON(xa, xas.xa_index != ULONG_MAX); + + do { + void *entry = xas_next(&xas); + XA_BUG_ON(xa, entry != xa_mk_value(i)); + XA_BUG_ON(xa, i != xas.xa_index); + i++; + } while (i < (1 << 16)); + rcu_read_unlock(); + + for (i = (1 << 8); i < (1 << 15); i++) + xa_erase_index(xa, i); + + i = xas.xa_index; + + rcu_read_lock(); + do { + void *entry = xas_prev(&xas); + i--; + if ((i < (1 << 8)) || (i >= (1 << 15))) + XA_BUG_ON(xa, entry != xa_mk_value(i)); + else + XA_BUG_ON(xa, entry != NULL); + XA_BUG_ON(xa, i != xas.xa_index); + } while (i != 0); + + XA_BUG_ON(xa, xas_prev(&xas) != NULL); + XA_BUG_ON(xa, xas.xa_index != ULONG_MAX); + + do { + void *entry = xas_next(&xas); + if ((i < (1 << 8)) || (i >= (1 << 15))) + XA_BUG_ON(xa, entry != xa_mk_value(i)); + else + XA_BUG_ON(xa, entry != NULL); + XA_BUG_ON(xa, i != xas.xa_index); + i++; + } while (i < (1 << 16)); + rcu_read_unlock(); + + xa_destroy(xa); + + for (i = 0; i < 16; i++) + check_move_small(xa, 1UL << i); + + for (i = 2; i < 16; i++) + check_move_small(xa, (1UL << i) - 1); +} + +static noinline void xa_store_many_order(struct xarray *xa, + unsigned long index, unsigned order) +{ + XA_STATE_ORDER(xas, xa, index, order); + unsigned int i = 0; + + do { + xas_lock(&xas); + XA_BUG_ON(xa, xas_find_conflict(&xas)); + xas_create_range(&xas); + if (xas_error(&xas)) + goto unlock; + for (i = 0; i < (1U << order); i++) { + XA_BUG_ON(xa, xas_store(&xas, xa_mk_value(index + i))); + xas_next(&xas); + } +unlock: + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + XA_BUG_ON(xa, xas_error(&xas)); +} + +static noinline void check_create_range_1(struct xarray *xa, + unsigned long index, unsigned order) +{ + unsigned long i; + + xa_store_many_order(xa, index, order); + for (i = index; i < index + (1UL << order); i++) + xa_erase_index(xa, i); + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_create_range_2(struct xarray *xa, unsigned order) +{ + unsigned long i; + unsigned long nr = 1UL << order; + + for (i = 0; i < nr * nr; i += nr) + xa_store_many_order(xa, i, order); + for (i = 0; i < nr * nr; i++) + xa_erase_index(xa, i); + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_create_range_3(void) +{ + XA_STATE(xas, NULL, 0); + xas_set_err(&xas, -EEXIST); + xas_create_range(&xas); + XA_BUG_ON(NULL, xas_error(&xas) != -EEXIST); +} + +static noinline void check_create_range_4(struct xarray *xa, + unsigned long index, unsigned order) +{ + XA_STATE_ORDER(xas, xa, index, order); + unsigned long base = xas.xa_index; + unsigned long i = 0; + + xa_store_index(xa, index, GFP_KERNEL); + do { + xas_lock(&xas); + xas_create_range(&xas); + if (xas_error(&xas)) + goto unlock; + for (i = 0; i < (1UL << order); i++) { + void *old = xas_store(&xas, xa_mk_value(base + i)); + if (xas.xa_index == index) + XA_BUG_ON(xa, old != xa_mk_value(base + i)); + else + XA_BUG_ON(xa, old != NULL); + xas_next(&xas); + } +unlock: + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + XA_BUG_ON(xa, xas_error(&xas)); + + for (i = base; i < base + (1UL << order); i++) + xa_erase_index(xa, i); + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_create_range(struct xarray *xa) +{ + unsigned int order; + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 12 : 1; + + for (order = 0; order < max_order; order++) { + check_create_range_1(xa, 0, order); + check_create_range_1(xa, 1U << order, order); + check_create_range_1(xa, 2U << order, order); + check_create_range_1(xa, 3U << order, order); + check_create_range_1(xa, 1U << 24, order); + if (order < 10) + check_create_range_2(xa, order); + + check_create_range_4(xa, 0, order); + check_create_range_4(xa, 1U << order, order); + check_create_range_4(xa, 2U << order, order); + check_create_range_4(xa, 3U << order, order); + check_create_range_4(xa, 1U << 24, order); + + check_create_range_4(xa, 1, order); + check_create_range_4(xa, (1U << order) + 1, order); + check_create_range_4(xa, (2U << order) + 1, order); + check_create_range_4(xa, (2U << order) - 1, order); + check_create_range_4(xa, (3U << order) + 1, order); + check_create_range_4(xa, (3U << order) - 1, order); + check_create_range_4(xa, (1U << 24) + 1, order); + } + + check_create_range_3(); +} + +static noinline void __check_store_range(struct xarray *xa, unsigned long first, + unsigned long last) +{ +#ifdef CONFIG_XARRAY_MULTI + xa_store_range(xa, first, last, xa_mk_value(first), GFP_KERNEL); + + XA_BUG_ON(xa, xa_load(xa, first) != xa_mk_value(first)); + XA_BUG_ON(xa, xa_load(xa, last) != xa_mk_value(first)); + XA_BUG_ON(xa, xa_load(xa, first - 1) != NULL); + XA_BUG_ON(xa, xa_load(xa, last + 1) != NULL); + + xa_store_range(xa, first, last, NULL, GFP_KERNEL); +#endif + + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_store_range(struct xarray *xa) +{ + unsigned long i, j; + + for (i = 0; i < 128; i++) { + for (j = i; j < 128; j++) { + __check_store_range(xa, i, j); + __check_store_range(xa, 128 + i, 128 + j); + __check_store_range(xa, 4095 + i, 4095 + j); + __check_store_range(xa, 4096 + i, 4096 + j); + __check_store_range(xa, 123456 + i, 123456 + j); + __check_store_range(xa, UINT_MAX + i, UINT_MAX + j); + } + } +} + +static LIST_HEAD(shadow_nodes); + +static void test_update_node(struct xa_node *node) +{ + if (node->count && node->count == node->nr_values) { + if (list_empty(&node->private_list)) + list_add(&shadow_nodes, &node->private_list); + } else { + if (!list_empty(&node->private_list)) + list_del_init(&node->private_list); + } +} + +static noinline void shadow_remove(struct xarray *xa) +{ + struct xa_node *node; + + xa_lock(xa); + while ((node = list_first_entry_or_null(&shadow_nodes, + struct xa_node, private_list))) { + XA_STATE(xas, node->array, 0); + XA_BUG_ON(xa, node->array != xa); + list_del_init(&node->private_list); + xas.xa_node = xa_parent_locked(node->array, node); + xas.xa_offset = node->offset; + xas.xa_shift = node->shift + XA_CHUNK_SHIFT; + xas_set_update(&xas, test_update_node); + xas_store(&xas, NULL); + } + xa_unlock(xa); +} + +static noinline void check_workingset(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + xas_set_update(&xas, test_update_node); + + do { + xas_lock(&xas); + xas_store(&xas, xa_mk_value(0)); + xas_next(&xas); + xas_store(&xas, xa_mk_value(1)); + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + XA_BUG_ON(xa, list_empty(&shadow_nodes)); + + xas_lock(&xas); + xas_next(&xas); + xas_store(&xas, &xas); + XA_BUG_ON(xa, !list_empty(&shadow_nodes)); + + xas_store(&xas, xa_mk_value(2)); + xas_unlock(&xas); + XA_BUG_ON(xa, list_empty(&shadow_nodes)); + + shadow_remove(xa); + XA_BUG_ON(xa, !list_empty(&shadow_nodes)); + XA_BUG_ON(xa, !xa_empty(xa)); +} + +/* + * Check that the pointer / value / sibling entries are accounted the + * way we expect them to be. + */ +static noinline void check_account(struct xarray *xa) +{ +#ifdef CONFIG_XARRAY_MULTI + unsigned int order; + + for (order = 1; order < 12; order++) { + XA_STATE(xas, xa, 1 << order); + + xa_store_order(xa, 0, order, xa, GFP_KERNEL); + xas_load(&xas); + XA_BUG_ON(xa, xas.xa_node->count == 0); + XA_BUG_ON(xa, xas.xa_node->count > (1 << order)); + XA_BUG_ON(xa, xas.xa_node->nr_values != 0); + + xa_store_order(xa, 1 << order, order, xa_mk_value(1 << order), + GFP_KERNEL); + XA_BUG_ON(xa, xas.xa_node->count != xas.xa_node->nr_values * 2); + + xa_erase(xa, 1 << order); + XA_BUG_ON(xa, xas.xa_node->nr_values != 0); + + xa_erase(xa, 0); + XA_BUG_ON(xa, !xa_empty(xa)); + } +#endif +} + +static noinline void check_destroy(struct xarray *xa) +{ + unsigned long index; + + XA_BUG_ON(xa, !xa_empty(xa)); + + /* Destroying an empty array is a no-op */ + xa_destroy(xa); + XA_BUG_ON(xa, !xa_empty(xa)); + + /* Destroying an array with a single entry */ + for (index = 0; index < 1000; index++) { + xa_store_index(xa, index, GFP_KERNEL); + XA_BUG_ON(xa, xa_empty(xa)); + xa_destroy(xa); + XA_BUG_ON(xa, !xa_empty(xa)); + } + + /* Destroying an array with a single entry at ULONG_MAX */ + xa_store(xa, ULONG_MAX, xa, GFP_KERNEL); + XA_BUG_ON(xa, xa_empty(xa)); + xa_destroy(xa); + XA_BUG_ON(xa, !xa_empty(xa)); + +#ifdef CONFIG_XARRAY_MULTI + /* Destroying an array with a multi-index entry */ + xa_store_order(xa, 1 << 11, 11, xa, GFP_KERNEL); + XA_BUG_ON(xa, xa_empty(xa)); + xa_destroy(xa); + XA_BUG_ON(xa, !xa_empty(xa)); +#endif +} + +static DEFINE_XARRAY(array); + +static int xarray_checks(void) +{ + check_xa_err(&array); + check_xas_retry(&array); + check_xa_load(&array); + check_xa_mark(&array); + check_xa_shrink(&array); + check_xas_erase(&array); + check_cmpxchg(&array); + check_reserve(&array); + check_multi_store(&array); + check_xa_alloc(); + check_find(&array); + check_find_entry(&array); + check_account(&array); + check_destroy(&array); + check_move(&array); + check_create_range(&array); + check_store_range(&array); + check_store_iter(&array); + + check_workingset(&array, 0); + check_workingset(&array, 64); + check_workingset(&array, 4096); + + printk("XArray: %u of %u tests passed\n", tests_passed, tests_run); + return (tests_run == tests_passed) ? 0 : -EINVAL; +} + +static void xarray_exit(void) +{ +} + +module_init(xarray_checks); +module_exit(xarray_exit); +MODULE_AUTHOR("Matthew Wilcox "); +MODULE_LICENSE("GPL"); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index d5b3a3f95c01..37a54a6dd594 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -612,6 +612,109 @@ char *string(char *buf, char *end, const char *s, struct printf_spec spec) return widen_string(buf, len, end, spec); } +static noinline_for_stack +char *pointer_string(char *buf, char *end, const void *ptr, + struct printf_spec spec) +{ + spec.base = 16; + spec.flags |= SMALL; + if (spec.field_width == -1) { + spec.field_width = 2 * sizeof(ptr); + spec.flags |= ZEROPAD; + } + + return number(buf, end, (unsigned long int)ptr, spec); +} + +/* Make pointers available for printing early in the boot sequence. */ +static int debug_boot_weak_hash __ro_after_init; + +static int __init debug_boot_weak_hash_enable(char *str) +{ + debug_boot_weak_hash = 1; + pr_info("debug_boot_weak_hash enabled\n"); + return 0; +} +early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable); + +static DEFINE_STATIC_KEY_TRUE(not_filled_random_ptr_key); +static siphash_key_t ptr_key __read_mostly; + +static void enable_ptr_key_workfn(struct work_struct *work) +{ + get_random_bytes(&ptr_key, sizeof(ptr_key)); + /* Needs to run from preemptible context */ + static_branch_disable(¬_filled_random_ptr_key); +} + +static DECLARE_WORK(enable_ptr_key_work, enable_ptr_key_workfn); + +static void fill_random_ptr_key(struct random_ready_callback *unused) +{ + /* This may be in an interrupt handler. */ + queue_work(system_unbound_wq, &enable_ptr_key_work); +} + +static struct random_ready_callback random_ready = { + .func = fill_random_ptr_key +}; + +static int __init initialize_ptr_random(void) +{ + int key_size = sizeof(ptr_key); + int ret; + + /* Use hw RNG if available. */ + if (get_random_bytes_arch(&ptr_key, key_size) == key_size) { + static_branch_disable(¬_filled_random_ptr_key); + return 0; + } + + ret = add_random_ready_callback(&random_ready); + if (!ret) { + return 0; + } else if (ret == -EALREADY) { + /* This is in preemptible context */ + enable_ptr_key_workfn(&enable_ptr_key_work); + return 0; + } + + return ret; +} +early_initcall(initialize_ptr_random); + +/* Maps a pointer to a 32 bit unique identifier. */ +static char *ptr_to_id(char *buf, char *end, const void *ptr, + struct printf_spec spec) +{ + const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)"; + unsigned long hashval; + + /* When debugging early boot use non-cryptographically secure hash. */ + if (unlikely(debug_boot_weak_hash)) { + hashval = hash_long((unsigned long)ptr, 32); + return pointer_string(buf, end, (const void *)hashval, spec); + } + + if (static_branch_unlikely(¬_filled_random_ptr_key)) { + spec.field_width = 2 * sizeof(ptr); + /* string length must be less than default_width */ + return string(buf, end, str, spec); + } + +#ifdef CONFIG_64BIT + hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key); + /* + * Mask off the first 32 bits, this makes explicit that we have + * modified the address (and 32 bits is plenty for a unique ID). + */ + hashval = hashval & 0xffffffff; +#else + hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key); +#endif + return pointer_string(buf, end, (const void *)hashval, spec); +} + static noinline_for_stack char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec, const char *fmt) @@ -1357,20 +1460,6 @@ char *uuid_string(char *buf, char *end, const u8 *addr, return string(buf, end, uuid, spec); } -static noinline_for_stack -char *pointer_string(char *buf, char *end, const void *ptr, - struct printf_spec spec) -{ - spec.base = 16; - spec.flags |= SMALL; - if (spec.field_width == -1) { - spec.field_width = 2 * sizeof(ptr); - spec.flags |= ZEROPAD; - } - - return number(buf, end, (unsigned long int)ptr, spec); -} - int kptr_restrict __read_mostly; static noinline_for_stack @@ -1421,7 +1510,8 @@ char *restricted_pointer(char *buf, char *end, const void *ptr, } static noinline_for_stack -char *netdev_bits(char *buf, char *end, const void *addr, const char *fmt) +char *netdev_bits(char *buf, char *end, const void *addr, + struct printf_spec spec, const char *fmt) { unsigned long long num; int size; @@ -1432,9 +1522,7 @@ char *netdev_bits(char *buf, char *end, const void *addr, const char *fmt) size = sizeof(netdev_features_t); break; default: - num = (unsigned long)addr; - size = sizeof(unsigned long); - break; + return ptr_to_id(buf, end, addr, spec); } return special_hex_number(buf, end, num, size); @@ -1474,7 +1562,7 @@ char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec, #ifdef CONFIG_COMMON_CLK return string(buf, end, __clk_get_name(clk), spec); #else - return special_hex_number(buf, end, (unsigned long)clk, sizeof(unsigned long)); + return ptr_to_id(buf, end, clk, spec); #endif } } @@ -1596,6 +1684,7 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, fmt = "f"; for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = true) { + int precision; if (pass) { if (buf < end) *buf = ':'; @@ -1607,7 +1696,11 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, buf = device_node_gen_full_name(dn, buf, end); break; case 'n': /* name */ - buf = string(buf, end, dn->name, str_spec); + p = kbasename(of_node_full_name(dn)); + precision = str_spec.precision; + str_spec.precision = strchrnul(p, '@') - p; + buf = string(buf, end, p, str_spec); + str_spec.precision = precision; break; case 'p': /* phandle */ buf = number(buf, end, (unsigned int)dn->phandle, num_spec); @@ -1651,94 +1744,6 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, return widen_string(buf, buf - buf_start, end, spec); } -/* Make pointers available for printing early in the boot sequence. */ -static int debug_boot_weak_hash __ro_after_init; - -static int __init debug_boot_weak_hash_enable(char *str) -{ - debug_boot_weak_hash = 1; - pr_info("debug_boot_weak_hash enabled\n"); - return 0; -} -early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable); - -static DEFINE_STATIC_KEY_TRUE(not_filled_random_ptr_key); -static siphash_key_t ptr_key __read_mostly; - -static void enable_ptr_key_workfn(struct work_struct *work) -{ - get_random_bytes(&ptr_key, sizeof(ptr_key)); - /* Needs to run from preemptible context */ - static_branch_disable(¬_filled_random_ptr_key); -} - -static DECLARE_WORK(enable_ptr_key_work, enable_ptr_key_workfn); - -static void fill_random_ptr_key(struct random_ready_callback *unused) -{ - /* This may be in an interrupt handler. */ - queue_work(system_unbound_wq, &enable_ptr_key_work); -} - -static struct random_ready_callback random_ready = { - .func = fill_random_ptr_key -}; - -static int __init initialize_ptr_random(void) -{ - int key_size = sizeof(ptr_key); - int ret; - - /* Use hw RNG if available. */ - if (get_random_bytes_arch(&ptr_key, key_size) == key_size) { - static_branch_disable(¬_filled_random_ptr_key); - return 0; - } - - ret = add_random_ready_callback(&random_ready); - if (!ret) { - return 0; - } else if (ret == -EALREADY) { - /* This is in preemptible context */ - enable_ptr_key_workfn(&enable_ptr_key_work); - return 0; - } - - return ret; -} -early_initcall(initialize_ptr_random); - -/* Maps a pointer to a 32 bit unique identifier. */ -static char *ptr_to_id(char *buf, char *end, void *ptr, struct printf_spec spec) -{ - const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)"; - unsigned long hashval; - - /* When debugging early boot use non-cryptographically secure hash. */ - if (unlikely(debug_boot_weak_hash)) { - hashval = hash_long((unsigned long)ptr, 32); - return pointer_string(buf, end, (const void *)hashval, spec); - } - - if (static_branch_unlikely(¬_filled_random_ptr_key)) { - spec.field_width = 2 * sizeof(ptr); - /* string length must be less than default_width */ - return string(buf, end, str, spec); - } - -#ifdef CONFIG_64BIT - hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key); - /* - * Mask off the first 32 bits, this makes explicit that we have - * modified the address (and 32 bits is plenty for a unique ID). - */ - hashval = hashval & 0xffffffff; -#else - hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key); -#endif - return pointer_string(buf, end, (const void *)hashval, spec); -} - /* * Show a '%p' thing. A kernel extension is that the '%p' is followed * by an extra set of alphanumeric characters that are extended format @@ -1833,17 +1838,15 @@ static char *ptr_to_id(char *buf, char *end, void *ptr, struct printf_spec spec) * p page flags (see struct page) given as pointer to unsigned long * g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t * v vma flags (VM_*) given as pointer to unsigned long - * - 'O' For a kobject based struct. Must be one of the following: - * - 'OF[fnpPcCF]' For a device tree object - * Without any optional arguments prints the full_name - * f device node full_name - * n device node name - * p device node phandle - * P device node path spec (name + @unit) - * F device node flags - * c major compatible string - * C full compatible string - * + * - 'OF[fnpPcCF]' For a device tree object + * Without any optional arguments prints the full_name + * f device node full_name + * n device node name + * p device node phandle + * P device node path spec (name + @unit) + * F device node flags + * c major compatible string + * C full compatible string * - 'x' For printing the address. Equivalent to "%lx". * * ** When making changes please also update: @@ -1944,7 +1947,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, break; return restricted_pointer(buf, end, ptr, spec); case 'N': - return netdev_bits(buf, end, ptr, fmt); + return netdev_bits(buf, end, ptr, spec, fmt); case 'a': return address_val(buf, end, ptr, fmt); case 'd': @@ -2794,7 +2797,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) copy = end - str; memcpy(str, args, copy); str += len; - args += len; + args += len + 1; } } if (process) diff --git a/lib/xarray.c b/lib/xarray.c new file mode 100644 index 000000000000..8b176f009c08 --- /dev/null +++ b/lib/xarray.c @@ -0,0 +1,2036 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * XArray implementation + * Copyright (c) 2017 Microsoft Corporation + * Author: Matthew Wilcox + */ + +#include +#include +#include +#include +#include + +/* + * Coding conventions in this file: + * + * @xa is used to refer to the entire xarray. + * @xas is the 'xarray operation state'. It may be either a pointer to + * an xa_state, or an xa_state stored on the stack. This is an unfortunate + * ambiguity. + * @index is the index of the entry being operated on + * @mark is an xa_mark_t; a small number indicating one of the mark bits. + * @node refers to an xa_node; usually the primary one being operated on by + * this function. + * @offset is the index into the slots array inside an xa_node. + * @parent refers to the @xa_node closer to the head than @node. + * @entry refers to something stored in a slot in the xarray + */ + +static inline unsigned int xa_lock_type(const struct xarray *xa) +{ + return (__force unsigned int)xa->xa_flags & 3; +} + +static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type) +{ + if (lock_type == XA_LOCK_IRQ) + xas_lock_irq(xas); + else if (lock_type == XA_LOCK_BH) + xas_lock_bh(xas); + else + xas_lock(xas); +} + +static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type) +{ + if (lock_type == XA_LOCK_IRQ) + xas_unlock_irq(xas); + else if (lock_type == XA_LOCK_BH) + xas_unlock_bh(xas); + else + xas_unlock(xas); +} + +static inline bool xa_track_free(const struct xarray *xa) +{ + return xa->xa_flags & XA_FLAGS_TRACK_FREE; +} + +static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark) +{ + if (!(xa->xa_flags & XA_FLAGS_MARK(mark))) + xa->xa_flags |= XA_FLAGS_MARK(mark); +} + +static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark) +{ + if (xa->xa_flags & XA_FLAGS_MARK(mark)) + xa->xa_flags &= ~(XA_FLAGS_MARK(mark)); +} + +static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark) +{ + return node->marks[(__force unsigned)mark]; +} + +static inline bool node_get_mark(struct xa_node *node, + unsigned int offset, xa_mark_t mark) +{ + return test_bit(offset, node_marks(node, mark)); +} + +/* returns true if the bit was set */ +static inline bool node_set_mark(struct xa_node *node, unsigned int offset, + xa_mark_t mark) +{ + return __test_and_set_bit(offset, node_marks(node, mark)); +} + +/* returns true if the bit was set */ +static inline bool node_clear_mark(struct xa_node *node, unsigned int offset, + xa_mark_t mark) +{ + return __test_and_clear_bit(offset, node_marks(node, mark)); +} + +static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark) +{ + return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE); +} + +static inline void node_mark_all(struct xa_node *node, xa_mark_t mark) +{ + bitmap_fill(node_marks(node, mark), XA_CHUNK_SIZE); +} + +#define mark_inc(mark) do { \ + mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \ +} while (0) + +/* + * xas_squash_marks() - Merge all marks to the first entry + * @xas: Array operation state. + * + * Set a mark on the first entry if any entry has it set. Clear marks on + * all sibling entries. + */ +static void xas_squash_marks(const struct xa_state *xas) +{ + unsigned int mark = 0; + unsigned int limit = xas->xa_offset + xas->xa_sibs + 1; + + if (!xas->xa_sibs) + return; + + do { + unsigned long *marks = xas->xa_node->marks[mark]; + if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit) + continue; + __set_bit(xas->xa_offset, marks); + bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs); + } while (mark++ != (__force unsigned)XA_MARK_MAX); +} + +/* extracts the offset within this node from the index */ +static unsigned int get_offset(unsigned long index, struct xa_node *node) +{ + return (index >> node->shift) & XA_CHUNK_MASK; +} + +static void xas_set_offset(struct xa_state *xas) +{ + xas->xa_offset = get_offset(xas->xa_index, xas->xa_node); +} + +/* move the index either forwards (find) or backwards (sibling slot) */ +static void xas_move_index(struct xa_state *xas, unsigned long offset) +{ + unsigned int shift = xas->xa_node->shift; + xas->xa_index &= ~XA_CHUNK_MASK << shift; + xas->xa_index += offset << shift; +} + +static void xas_advance(struct xa_state *xas) +{ + xas->xa_offset++; + xas_move_index(xas, xas->xa_offset); +} + +static void *set_bounds(struct xa_state *xas) +{ + xas->xa_node = XAS_BOUNDS; + return NULL; +} + +/* + * Starts a walk. If the @xas is already valid, we assume that it's on + * the right path and just return where we've got to. If we're in an + * error state, return NULL. If the index is outside the current scope + * of the xarray, return NULL without changing @xas->xa_node. Otherwise + * set @xas->xa_node to NULL and return the current head of the array. + */ +static void *xas_start(struct xa_state *xas) +{ + void *entry; + + if (xas_valid(xas)) + return xas_reload(xas); + if (xas_error(xas)) + return NULL; + + entry = xa_head(xas->xa); + if (!xa_is_node(entry)) { + if (xas->xa_index) + return set_bounds(xas); + } else { + if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK) + return set_bounds(xas); + } + + xas->xa_node = NULL; + return entry; +} + +static void *xas_descend(struct xa_state *xas, struct xa_node *node) +{ + unsigned int offset = get_offset(xas->xa_index, node); + void *entry = xa_entry(xas->xa, node, offset); + + xas->xa_node = node; + if (xa_is_sibling(entry)) { + offset = xa_to_sibling(entry); + entry = xa_entry(xas->xa, node, offset); + } + + xas->xa_offset = offset; + return entry; +} + +/** + * xas_load() - Load an entry from the XArray (advanced). + * @xas: XArray operation state. + * + * Usually walks the @xas to the appropriate state to load the entry + * stored at xa_index. However, it will do nothing and return %NULL if + * @xas is in an error state. xas_load() will never expand the tree. + * + * If the xa_state is set up to operate on a multi-index entry, xas_load() + * may return %NULL or an internal entry, even if there are entries + * present within the range specified by @xas. + * + * Context: Any context. The caller should hold the xa_lock or the RCU lock. + * Return: Usually an entry in the XArray, but see description for exceptions. + */ +void *xas_load(struct xa_state *xas) +{ + void *entry = xas_start(xas); + + while (xa_is_node(entry)) { + struct xa_node *node = xa_to_node(entry); + + if (xas->xa_shift > node->shift) + break; + entry = xas_descend(xas, node); + } + return entry; +} +EXPORT_SYMBOL_GPL(xas_load); + +/* Move the radix tree node cache here */ +extern struct kmem_cache *radix_tree_node_cachep; +extern void radix_tree_node_rcu_free(struct rcu_head *head); + +#define XA_RCU_FREE ((struct xarray *)1) + +static void xa_node_free(struct xa_node *node) +{ + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + node->array = XA_RCU_FREE; + call_rcu(&node->rcu_head, radix_tree_node_rcu_free); +} + +/* + * xas_destroy() - Free any resources allocated during the XArray operation. + * @xas: XArray operation state. + * + * This function is now internal-only. + */ +static void xas_destroy(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_alloc; + + if (!node) + return; + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + kmem_cache_free(radix_tree_node_cachep, node); + xas->xa_alloc = NULL; +} + +/** + * xas_nomem() - Allocate memory if needed. + * @xas: XArray operation state. + * @gfp: Memory allocation flags. + * + * If we need to add new nodes to the XArray, we try to allocate memory + * with GFP_NOWAIT while holding the lock, which will usually succeed. + * If it fails, @xas is flagged as needing memory to continue. The caller + * should drop the lock and call xas_nomem(). If xas_nomem() succeeds, + * the caller should retry the operation. + * + * Forward progress is guaranteed as one node is allocated here and + * stored in the xa_state where it will be found by xas_alloc(). More + * nodes will likely be found in the slab allocator, but we do not tie + * them up here. + * + * Return: true if memory was needed, and was successfully allocated. + */ +bool xas_nomem(struct xa_state *xas, gfp_t gfp) +{ + if (xas->xa_node != XA_ERROR(-ENOMEM)) { + xas_destroy(xas); + return false; + } + xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + if (!xas->xa_alloc) + return false; + XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); + xas->xa_node = XAS_RESTART; + return true; +} +EXPORT_SYMBOL_GPL(xas_nomem); + +/* + * __xas_nomem() - Drop locks and allocate memory if needed. + * @xas: XArray operation state. + * @gfp: Memory allocation flags. + * + * Internal variant of xas_nomem(). + * + * Return: true if memory was needed, and was successfully allocated. + */ +static bool __xas_nomem(struct xa_state *xas, gfp_t gfp) + __must_hold(xas->xa->xa_lock) +{ + unsigned int lock_type = xa_lock_type(xas->xa); + + if (xas->xa_node != XA_ERROR(-ENOMEM)) { + xas_destroy(xas); + return false; + } + if (gfpflags_allow_blocking(gfp)) { + xas_unlock_type(xas, lock_type); + xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + xas_lock_type(xas, lock_type); + } else { + xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + } + if (!xas->xa_alloc) + return false; + XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); + xas->xa_node = XAS_RESTART; + return true; +} + +static void xas_update(struct xa_state *xas, struct xa_node *node) +{ + if (xas->xa_update) + xas->xa_update(node); + else + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); +} + +static void *xas_alloc(struct xa_state *xas, unsigned int shift) +{ + struct xa_node *parent = xas->xa_node; + struct xa_node *node = xas->xa_alloc; + + if (xas_invalid(xas)) + return NULL; + + if (node) { + xas->xa_alloc = NULL; + } else { + node = kmem_cache_alloc(radix_tree_node_cachep, + GFP_NOWAIT | __GFP_NOWARN); + if (!node) { + xas_set_err(xas, -ENOMEM); + return NULL; + } + } + + if (parent) { + node->offset = xas->xa_offset; + parent->count++; + XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE); + xas_update(xas, parent); + } + XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + node->shift = shift; + node->count = 0; + node->nr_values = 0; + RCU_INIT_POINTER(node->parent, xas->xa_node); + node->array = xas->xa; + + return node; +} + +#ifdef CONFIG_XARRAY_MULTI +/* Returns the number of indices covered by a given xa_state */ +static unsigned long xas_size(const struct xa_state *xas) +{ + return (xas->xa_sibs + 1UL) << xas->xa_shift; +} +#endif + +/* + * Use this to calculate the maximum index that will need to be created + * in order to add the entry described by @xas. Because we cannot store a + * multiple-index entry at index 0, the calculation is a little more complex + * than you might expect. + */ +static unsigned long xas_max(struct xa_state *xas) +{ + unsigned long max = xas->xa_index; + +#ifdef CONFIG_XARRAY_MULTI + if (xas->xa_shift || xas->xa_sibs) { + unsigned long mask = xas_size(xas) - 1; + max |= mask; + if (mask == max) + max++; + } +#endif + + return max; +} + +/* The maximum index that can be contained in the array without expanding it */ +static unsigned long max_index(void *entry) +{ + if (!xa_is_node(entry)) + return 0; + return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1; +} + +static void xas_shrink(struct xa_state *xas) +{ + struct xarray *xa = xas->xa; + struct xa_node *node = xas->xa_node; + + for (;;) { + void *entry; + + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + if (node->count != 1) + break; + entry = xa_entry_locked(xa, node, 0); + if (!entry) + break; + if (!xa_is_node(entry) && node->shift) + break; + xas->xa_node = XAS_BOUNDS; + + RCU_INIT_POINTER(xa->xa_head, entry); + if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK)) + xa_mark_clear(xa, XA_FREE_MARK); + + node->count = 0; + node->nr_values = 0; + if (!xa_is_node(entry)) + RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY); + xas_update(xas, node); + xa_node_free(node); + if (!xa_is_node(entry)) + break; + node = xa_to_node(entry); + node->parent = NULL; + } +} + +/* + * xas_delete_node() - Attempt to delete an xa_node + * @xas: Array operation state. + * + * Attempts to delete the @xas->xa_node. This will fail if xa->node has + * a non-zero reference count. + */ +static void xas_delete_node(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + for (;;) { + struct xa_node *parent; + + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + if (node->count) + break; + + parent = xa_parent_locked(xas->xa, node); + xas->xa_node = parent; + xas->xa_offset = node->offset; + xa_node_free(node); + + if (!parent) { + xas->xa->xa_head = NULL; + xas->xa_node = XAS_BOUNDS; + return; + } + + parent->slots[xas->xa_offset] = NULL; + parent->count--; + XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE); + node = parent; + xas_update(xas, node); + } + + if (!node->parent) + xas_shrink(xas); +} + +/** + * xas_free_nodes() - Free this node and all nodes that it references + * @xas: Array operation state. + * @top: Node to free + * + * This node has been removed from the tree. We must now free it and all + * of its subnodes. There may be RCU walkers with references into the tree, + * so we must replace all entries with retry markers. + */ +static void xas_free_nodes(struct xa_state *xas, struct xa_node *top) +{ + unsigned int offset = 0; + struct xa_node *node = top; + + for (;;) { + void *entry = xa_entry_locked(xas->xa, node, offset); + + if (xa_is_node(entry)) { + node = xa_to_node(entry); + offset = 0; + continue; + } + if (entry) + RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY); + offset++; + while (offset == XA_CHUNK_SIZE) { + struct xa_node *parent; + + parent = xa_parent_locked(xas->xa, node); + offset = node->offset + 1; + node->count = 0; + node->nr_values = 0; + xas_update(xas, node); + xa_node_free(node); + if (node == top) + return; + node = parent; + } + } +} + +/* + * xas_expand adds nodes to the head of the tree until it has reached + * sufficient height to be able to contain @xas->xa_index + */ +static int xas_expand(struct xa_state *xas, void *head) +{ + struct xarray *xa = xas->xa; + struct xa_node *node = NULL; + unsigned int shift = 0; + unsigned long max = xas_max(xas); + + if (!head) { + if (max == 0) + return 0; + while ((max >> shift) >= XA_CHUNK_SIZE) + shift += XA_CHUNK_SHIFT; + return shift + XA_CHUNK_SHIFT; + } else if (xa_is_node(head)) { + node = xa_to_node(head); + shift = node->shift + XA_CHUNK_SHIFT; + } + xas->xa_node = NULL; + + while (max > max_index(head)) { + xa_mark_t mark = 0; + + XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); + node = xas_alloc(xas, shift); + if (!node) + return -ENOMEM; + + node->count = 1; + if (xa_is_value(head)) + node->nr_values = 1; + RCU_INIT_POINTER(node->slots[0], head); + + /* Propagate the aggregated mark info to the new child */ + for (;;) { + if (xa_track_free(xa) && mark == XA_FREE_MARK) { + node_mark_all(node, XA_FREE_MARK); + if (!xa_marked(xa, XA_FREE_MARK)) { + node_clear_mark(node, 0, XA_FREE_MARK); + xa_mark_set(xa, XA_FREE_MARK); + } + } else if (xa_marked(xa, mark)) { + node_set_mark(node, 0, mark); + } + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } + + /* + * Now that the new node is fully initialised, we can add + * it to the tree + */ + if (xa_is_node(head)) { + xa_to_node(head)->offset = 0; + rcu_assign_pointer(xa_to_node(head)->parent, node); + } + head = xa_mk_node(node); + rcu_assign_pointer(xa->xa_head, head); + xas_update(xas, node); + + shift += XA_CHUNK_SHIFT; + } + + xas->xa_node = node; + return shift; +} + +/* + * xas_create() - Create a slot to store an entry in. + * @xas: XArray operation state. + * + * Most users will not need to call this function directly, as it is called + * by xas_store(). It is useful for doing conditional store operations + * (see the xa_cmpxchg() implementation for an example). + * + * Return: If the slot already existed, returns the contents of this slot. + * If the slot was newly created, returns NULL. If it failed to create the + * slot, returns NULL and indicates the error in @xas. + */ +static void *xas_create(struct xa_state *xas) +{ + struct xarray *xa = xas->xa; + void *entry; + void __rcu **slot; + struct xa_node *node = xas->xa_node; + int shift; + unsigned int order = xas->xa_shift; + + if (xas_top(node)) { + entry = xa_head_locked(xa); + xas->xa_node = NULL; + shift = xas_expand(xas, entry); + if (shift < 0) + return NULL; + entry = xa_head_locked(xa); + slot = &xa->xa_head; + } else if (xas_error(xas)) { + return NULL; + } else if (node) { + unsigned int offset = xas->xa_offset; + + shift = node->shift; + entry = xa_entry_locked(xa, node, offset); + slot = &node->slots[offset]; + } else { + shift = 0; + entry = xa_head_locked(xa); + slot = &xa->xa_head; + } + + while (shift > order) { + shift -= XA_CHUNK_SHIFT; + if (!entry) { + node = xas_alloc(xas, shift); + if (!node) + break; + if (xa_track_free(xa)) + node_mark_all(node, XA_FREE_MARK); + rcu_assign_pointer(*slot, xa_mk_node(node)); + } else if (xa_is_node(entry)) { + node = xa_to_node(entry); + } else { + break; + } + entry = xas_descend(xas, node); + slot = &node->slots[xas->xa_offset]; + } + + return entry; +} + +/** + * xas_create_range() - Ensure that stores to this range will succeed + * @xas: XArray operation state. + * + * Creates all of the slots in the range covered by @xas. Sets @xas to + * create single-index entries and positions it at the beginning of the + * range. This is for the benefit of users which have not yet been + * converted to use multi-index entries. + */ +void xas_create_range(struct xa_state *xas) +{ + unsigned long index = xas->xa_index; + unsigned char shift = xas->xa_shift; + unsigned char sibs = xas->xa_sibs; + + xas->xa_index |= ((sibs + 1) << shift) - 1; + if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift) + xas->xa_offset |= sibs; + xas->xa_shift = 0; + xas->xa_sibs = 0; + + for (;;) { + xas_create(xas); + if (xas_error(xas)) + goto restore; + if (xas->xa_index <= (index | XA_CHUNK_MASK)) + goto success; + xas->xa_index -= XA_CHUNK_SIZE; + + for (;;) { + struct xa_node *node = xas->xa_node; + xas->xa_node = xa_parent_locked(xas->xa, node); + xas->xa_offset = node->offset - 1; + if (node->offset != 0) + break; + } + } + +restore: + xas->xa_shift = shift; + xas->xa_sibs = sibs; + xas->xa_index = index; + return; +success: + xas->xa_index = index; + if (xas->xa_node) + xas_set_offset(xas); +} +EXPORT_SYMBOL_GPL(xas_create_range); + +static void update_node(struct xa_state *xas, struct xa_node *node, + int count, int values) +{ + if (!node || (!count && !values)) + return; + + node->count += count; + node->nr_values += values; + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE); + xas_update(xas, node); + if (count < 0) + xas_delete_node(xas); +} + +/** + * xas_store() - Store this entry in the XArray. + * @xas: XArray operation state. + * @entry: New entry. + * + * If @xas is operating on a multi-index entry, the entry returned by this + * function is essentially meaningless (it may be an internal entry or it + * may be %NULL, even if there are non-NULL entries at some of the indices + * covered by the range). This is not a problem for any current users, + * and can be changed if needed. + * + * Return: The old entry at this index. + */ +void *xas_store(struct xa_state *xas, void *entry) +{ + struct xa_node *node; + void __rcu **slot = &xas->xa->xa_head; + unsigned int offset, max; + int count = 0; + int values = 0; + void *first, *next; + bool value = xa_is_value(entry); + + if (entry) + first = xas_create(xas); + else + first = xas_load(xas); + + if (xas_invalid(xas)) + return first; + node = xas->xa_node; + if (node && (xas->xa_shift < node->shift)) + xas->xa_sibs = 0; + if ((first == entry) && !xas->xa_sibs) + return first; + + next = first; + offset = xas->xa_offset; + max = xas->xa_offset + xas->xa_sibs; + if (node) { + slot = &node->slots[offset]; + if (xas->xa_sibs) + xas_squash_marks(xas); + } + if (!entry) + xas_init_marks(xas); + + for (;;) { + /* + * Must clear the marks before setting the entry to NULL, + * otherwise xas_for_each_marked may find a NULL entry and + * stop early. rcu_assign_pointer contains a release barrier + * so the mark clearing will appear to happen before the + * entry is set to NULL. + */ + rcu_assign_pointer(*slot, entry); + if (xa_is_node(next)) + xas_free_nodes(xas, xa_to_node(next)); + if (!node) + break; + count += !next - !entry; + values += !xa_is_value(first) - !value; + if (entry) { + if (offset == max) + break; + if (!xa_is_sibling(entry)) + entry = xa_mk_sibling(xas->xa_offset); + } else { + if (offset == XA_CHUNK_MASK) + break; + } + next = xa_entry_locked(xas->xa, node, ++offset); + if (!xa_is_sibling(next)) { + if (!entry && (offset > max)) + break; + first = next; + } + slot++; + } + + update_node(xas, node, count, values); + return first; +} +EXPORT_SYMBOL_GPL(xas_store); + +/** + * xas_get_mark() - Returns the state of this mark. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Return: true if the mark is set, false if the mark is clear or @xas + * is in an error state. + */ +bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark) +{ + if (xas_invalid(xas)) + return false; + if (!xas->xa_node) + return xa_marked(xas->xa, mark); + return node_get_mark(xas->xa_node, xas->xa_offset, mark); +} +EXPORT_SYMBOL_GPL(xas_get_mark); + +/** + * xas_set_mark() - Sets the mark on this entry and its parents. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Sets the specified mark on this entry, and walks up the tree setting it + * on all the ancestor entries. Does nothing if @xas has not been walked to + * an entry, or is in an error state. + */ +void xas_set_mark(const struct xa_state *xas, xa_mark_t mark) +{ + struct xa_node *node = xas->xa_node; + unsigned int offset = xas->xa_offset; + + if (xas_invalid(xas)) + return; + + while (node) { + if (node_set_mark(node, offset, mark)) + return; + offset = node->offset; + node = xa_parent_locked(xas->xa, node); + } + + if (!xa_marked(xas->xa, mark)) + xa_mark_set(xas->xa, mark); +} +EXPORT_SYMBOL_GPL(xas_set_mark); + +/** + * xas_clear_mark() - Clears the mark on this entry and its parents. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Clears the specified mark on this entry, and walks back to the head + * attempting to clear it on all the ancestor entries. Does nothing if + * @xas has not been walked to an entry, or is in an error state. + */ +void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark) +{ + struct xa_node *node = xas->xa_node; + unsigned int offset = xas->xa_offset; + + if (xas_invalid(xas)) + return; + + while (node) { + if (!node_clear_mark(node, offset, mark)) + return; + if (node_any_mark(node, mark)) + return; + + offset = node->offset; + node = xa_parent_locked(xas->xa, node); + } + + if (xa_marked(xas->xa, mark)) + xa_mark_clear(xas->xa, mark); +} +EXPORT_SYMBOL_GPL(xas_clear_mark); + +/** + * xas_init_marks() - Initialise all marks for the entry + * @xas: Array operations state. + * + * Initialise all marks for the entry specified by @xas. If we're tracking + * free entries with a mark, we need to set it on all entries. All other + * marks are cleared. + * + * This implementation is not as efficient as it could be; we may walk + * up the tree multiple times. + */ +void xas_init_marks(const struct xa_state *xas) +{ + xa_mark_t mark = 0; + + for (;;) { + if (xa_track_free(xas->xa) && mark == XA_FREE_MARK) + xas_set_mark(xas, mark); + else + xas_clear_mark(xas, mark); + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } +} +EXPORT_SYMBOL_GPL(xas_init_marks); + +/** + * xas_pause() - Pause a walk to drop a lock. + * @xas: XArray operation state. + * + * Some users need to pause a walk and drop the lock they're holding in + * order to yield to a higher priority thread or carry out an operation + * on an entry. Those users should call this function before they drop + * the lock. It resets the @xas to be suitable for the next iteration + * of the loop after the user has reacquired the lock. If most entries + * found during a walk require you to call xas_pause(), the xa_for_each() + * iterator may be more appropriate. + * + * Note that xas_pause() only works for forward iteration. If a user needs + * to pause a reverse iteration, we will need a xas_pause_rev(). + */ +void xas_pause(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + if (xas_invalid(xas)) + return; + + if (node) { + unsigned int offset = xas->xa_offset; + while (++offset < XA_CHUNK_SIZE) { + if (!xa_is_sibling(xa_entry(xas->xa, node, offset))) + break; + } + xas->xa_index += (offset - xas->xa_offset) << node->shift; + } else { + xas->xa_index++; + } + xas->xa_node = XAS_RESTART; +} +EXPORT_SYMBOL_GPL(xas_pause); + +/* + * __xas_prev() - Find the previous entry in the XArray. + * @xas: XArray operation state. + * + * Helper function for xas_prev() which handles all the complex cases + * out of line. + */ +void *__xas_prev(struct xa_state *xas) +{ + void *entry; + + if (!xas_frozen(xas->xa_node)) + xas->xa_index--; + if (xas_not_node(xas->xa_node)) + return xas_load(xas); + + if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) + xas->xa_offset--; + + while (xas->xa_offset == 255) { + xas->xa_offset = xas->xa_node->offset - 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + if (!xas->xa_node) + return set_bounds(xas); + } + + for (;;) { + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (!xa_is_node(entry)) + return entry; + + xas->xa_node = xa_to_node(entry); + xas_set_offset(xas); + } +} +EXPORT_SYMBOL_GPL(__xas_prev); + +/* + * __xas_next() - Find the next entry in the XArray. + * @xas: XArray operation state. + * + * Helper function for xas_next() which handles all the complex cases + * out of line. + */ +void *__xas_next(struct xa_state *xas) +{ + void *entry; + + if (!xas_frozen(xas->xa_node)) + xas->xa_index++; + if (xas_not_node(xas->xa_node)) + return xas_load(xas); + + if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) + xas->xa_offset++; + + while (xas->xa_offset == XA_CHUNK_SIZE) { + xas->xa_offset = xas->xa_node->offset + 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + if (!xas->xa_node) + return set_bounds(xas); + } + + for (;;) { + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (!xa_is_node(entry)) + return entry; + + xas->xa_node = xa_to_node(entry); + xas_set_offset(xas); + } +} +EXPORT_SYMBOL_GPL(__xas_next); + +/** + * xas_find() - Find the next present entry in the XArray. + * @xas: XArray operation state. + * @max: Highest index to return. + * + * If the @xas has not yet been walked to an entry, return the entry + * which has an index >= xas.xa_index. If it has been walked, the entry + * currently being pointed at has been processed, and so we move to the + * next entry. + * + * If no entry is found and the array is smaller than @max, the iterator + * is set to the smallest index not yet in the array. This allows @xas + * to be immediately passed to xas_store(). + * + * Return: The entry, if found, otherwise %NULL. + */ +void *xas_find(struct xa_state *xas, unsigned long max) +{ + void *entry; + + if (xas_error(xas)) + return NULL; + + if (!xas->xa_node) { + xas->xa_index = 1; + return set_bounds(xas); + } else if (xas_top(xas->xa_node)) { + entry = xas_load(xas); + if (entry || xas_not_node(xas->xa_node)) + return entry; + } else if (!xas->xa_node->shift && + xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) { + xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1; + } + + xas_advance(xas); + + while (xas->xa_node && (xas->xa_index <= max)) { + if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { + xas->xa_offset = xas->xa_node->offset + 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + continue; + } + + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (xa_is_node(entry)) { + xas->xa_node = xa_to_node(entry); + xas->xa_offset = 0; + continue; + } + if (entry && !xa_is_sibling(entry)) + return entry; + + xas_advance(xas); + } + + if (!xas->xa_node) + xas->xa_node = XAS_BOUNDS; + return NULL; +} +EXPORT_SYMBOL_GPL(xas_find); + +/** + * xas_find_marked() - Find the next marked entry in the XArray. + * @xas: XArray operation state. + * @max: Highest index to return. + * @mark: Mark number to search for. + * + * If the @xas has not yet been walked to an entry, return the marked entry + * which has an index >= xas.xa_index. If it has been walked, the entry + * currently being pointed at has been processed, and so we return the + * first marked entry with an index > xas.xa_index. + * + * If no marked entry is found and the array is smaller than @max, @xas is + * set to the bounds state and xas->xa_index is set to the smallest index + * not yet in the array. This allows @xas to be immediately passed to + * xas_store(). + * + * If no entry is found before @max is reached, @xas is set to the restart + * state. + * + * Return: The entry, if found, otherwise %NULL. + */ +void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) +{ + bool advance = true; + unsigned int offset; + void *entry; + + if (xas_error(xas)) + return NULL; + + if (!xas->xa_node) { + xas->xa_index = 1; + goto out; + } else if (xas_top(xas->xa_node)) { + advance = false; + entry = xa_head(xas->xa); + xas->xa_node = NULL; + if (xas->xa_index > max_index(entry)) + goto bounds; + if (!xa_is_node(entry)) { + if (xa_marked(xas->xa, mark)) + return entry; + xas->xa_index = 1; + goto out; + } + xas->xa_node = xa_to_node(entry); + xas->xa_offset = xas->xa_index >> xas->xa_node->shift; + } + + while (xas->xa_index <= max) { + if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { + xas->xa_offset = xas->xa_node->offset + 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + if (!xas->xa_node) + break; + advance = false; + continue; + } + + if (!advance) { + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (xa_is_sibling(entry)) { + xas->xa_offset = xa_to_sibling(entry); + xas_move_index(xas, xas->xa_offset); + } + } + + offset = xas_find_chunk(xas, advance, mark); + if (offset > xas->xa_offset) { + advance = false; + xas_move_index(xas, offset); + /* Mind the wrap */ + if ((xas->xa_index - 1) >= max) + goto max; + xas->xa_offset = offset; + if (offset == XA_CHUNK_SIZE) + continue; + } + + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (!xa_is_node(entry)) + return entry; + xas->xa_node = xa_to_node(entry); + xas_set_offset(xas); + } + +out: + if (!max) + goto max; +bounds: + xas->xa_node = XAS_BOUNDS; + return NULL; +max: + xas->xa_node = XAS_RESTART; + return NULL; +} +EXPORT_SYMBOL_GPL(xas_find_marked); + +/** + * xas_find_conflict() - Find the next present entry in a range. + * @xas: XArray operation state. + * + * The @xas describes both a range and a position within that range. + * + * Context: Any context. Expects xa_lock to be held. + * Return: The next entry in the range covered by @xas or %NULL. + */ +void *xas_find_conflict(struct xa_state *xas) +{ + void *curr; + + if (xas_error(xas)) + return NULL; + + if (!xas->xa_node) + return NULL; + + if (xas_top(xas->xa_node)) { + curr = xas_start(xas); + if (!curr) + return NULL; + while (xa_is_node(curr)) { + struct xa_node *node = xa_to_node(curr); + curr = xas_descend(xas, node); + } + if (curr) + return curr; + } + + if (xas->xa_node->shift > xas->xa_shift) + return NULL; + + for (;;) { + if (xas->xa_node->shift == xas->xa_shift) { + if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs) + break; + } else if (xas->xa_offset == XA_CHUNK_MASK) { + xas->xa_offset = xas->xa_node->offset; + xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node); + if (!xas->xa_node) + break; + continue; + } + curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset); + if (xa_is_sibling(curr)) + continue; + while (xa_is_node(curr)) { + xas->xa_node = xa_to_node(curr); + xas->xa_offset = 0; + curr = xa_entry_locked(xas->xa, xas->xa_node, 0); + } + if (curr) + return curr; + } + xas->xa_offset -= xas->xa_sibs; + return NULL; +} +EXPORT_SYMBOL_GPL(xas_find_conflict); + +/** + * xa_init_flags() - Initialise an empty XArray with flags. + * @xa: XArray. + * @flags: XA_FLAG values. + * + * If you need to initialise an XArray with special flags (eg you need + * to take the lock from interrupt context), use this function instead + * of xa_init(). + * + * Context: Any context. + */ +void xa_init_flags(struct xarray *xa, gfp_t flags) +{ + unsigned int lock_type; + static struct lock_class_key xa_lock_irq; + static struct lock_class_key xa_lock_bh; + + spin_lock_init(&xa->xa_lock); + xa->xa_flags = flags; + xa->xa_head = NULL; + + lock_type = xa_lock_type(xa); + if (lock_type == XA_LOCK_IRQ) + lockdep_set_class(&xa->xa_lock, &xa_lock_irq); + else if (lock_type == XA_LOCK_BH) + lockdep_set_class(&xa->xa_lock, &xa_lock_bh); +} +EXPORT_SYMBOL(xa_init_flags); + +/** + * xa_load() - Load an entry from an XArray. + * @xa: XArray. + * @index: index into array. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The entry at @index in @xa. + */ +void *xa_load(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + void *entry; + + rcu_read_lock(); + do { + entry = xas_load(&xas); + if (xa_is_zero(entry)) + entry = NULL; + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + return entry; +} +EXPORT_SYMBOL(xa_load); + +static void *xas_result(struct xa_state *xas, void *curr) +{ + if (xa_is_zero(curr)) + return NULL; + XA_NODE_BUG_ON(xas->xa_node, xa_is_internal(curr)); + if (xas_error(xas)) + curr = xas->xa_node; + return curr; +} + +/** + * __xa_erase() - Erase this entry from the XArray while locked. + * @xa: XArray. + * @index: Index into array. + * + * If the entry at this index is a multi-index entry then all indices will + * be erased, and the entry will no longer be a multi-index entry. + * This function expects the xa_lock to be held on entry. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: The old entry at this index. + */ +void *__xa_erase(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + return xas_result(&xas, xas_store(&xas, NULL)); +} +EXPORT_SYMBOL_GPL(__xa_erase); + +/** + * xa_store() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * After this function returns, loads from this index will return @entry. + * Storing into an existing multislot entry updates the entry of every index. + * The marks associated with @index are unaffected unless @entry is %NULL. + * + * Context: Process context. Takes and releases the xa_lock. May sleep + * if the @gfp flags permit. + * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry + * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation + * failed. + */ +void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return XA_ERROR(-EINVAL); + + do { + xas_lock(&xas); + curr = xas_store(&xas, entry); + if (xa_track_free(xa) && entry) + xas_clear_mark(&xas, XA_FREE_MARK); + xas_unlock(&xas); + } while (xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(xa_store); + +/** + * __xa_store() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * You must already be holding the xa_lock when calling this function. + * It will drop the lock if needed to allocate memory, and then reacquire + * it afterwards. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: The old entry at this index or xa_err() if an error happened. + */ +void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return XA_ERROR(-EINVAL); + + do { + curr = xas_store(&xas, entry); + if (xa_track_free(xa) && entry) + xas_clear_mark(&xas, XA_FREE_MARK); + } while (__xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(__xa_store); + +/** + * xa_cmpxchg() - Conditionally replace an entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New value to place in array. + * @gfp: Memory allocation flags. + * + * If the entry at @index is the same as @old, replace it with @entry. + * If the return value is equal to @old, then the exchange was successful. + * + * Context: Process context. Takes and releases the xa_lock. May sleep + * if the @gfp flags permit. + * Return: The old value at this index or xa_err() if an error happened. + */ +void *xa_cmpxchg(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return XA_ERROR(-EINVAL); + + do { + xas_lock(&xas); + curr = xas_load(&xas); + if (curr == XA_ZERO_ENTRY) + curr = NULL; + if (curr == old) { + xas_store(&xas, entry); + if (xa_track_free(xa) && entry) + xas_clear_mark(&xas, XA_FREE_MARK); + } + xas_unlock(&xas); + } while (xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(xa_cmpxchg); + +/** + * __xa_cmpxchg() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * You must already be holding the xa_lock when calling this function. + * It will drop the lock if needed to allocate memory, and then reacquire + * it afterwards. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: The old entry at this index or xa_err() if an error happened. + */ +void *__xa_cmpxchg(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return XA_ERROR(-EINVAL); + + do { + curr = xas_load(&xas); + if (curr == XA_ZERO_ENTRY) + curr = NULL; + if (curr == old) { + xas_store(&xas, entry); + if (xa_track_free(xa) && entry) + xas_clear_mark(&xas, XA_FREE_MARK); + } + } while (__xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(__xa_cmpxchg); + +/** + * xa_reserve() - Reserve this index in the XArray. + * @xa: XArray. + * @index: Index into array. + * @gfp: Memory allocation flags. + * + * Ensures there is somewhere to store an entry at @index in the array. + * If there is already something stored at @index, this function does + * nothing. If there was nothing there, the entry is marked as reserved. + * Loads from @index will continue to see a %NULL pointer until a + * subsequent store to @index. + * + * If you do not use the entry that you have reserved, call xa_release() + * or xa_erase() to free any unnecessary memory. + * + * Context: Process context. Takes and releases the xa_lock, IRQ or BH safe + * if specified in XArray flags. May sleep if the @gfp flags permit. + * Return: 0 if the reservation succeeded or -ENOMEM if it failed. + */ +int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + unsigned int lock_type = xa_lock_type(xa); + void *curr; + + do { + xas_lock_type(&xas, lock_type); + curr = xas_load(&xas); + if (!curr) + xas_store(&xas, XA_ZERO_ENTRY); + xas_unlock_type(&xas, lock_type); + } while (xas_nomem(&xas, gfp)); + + return xas_error(&xas); +} +EXPORT_SYMBOL(xa_reserve); + +#ifdef CONFIG_XARRAY_MULTI +static void xas_set_range(struct xa_state *xas, unsigned long first, + unsigned long last) +{ + unsigned int shift = 0; + unsigned long sibs = last - first; + unsigned int offset = XA_CHUNK_MASK; + + xas_set(xas, first); + + while ((first & XA_CHUNK_MASK) == 0) { + if (sibs < XA_CHUNK_MASK) + break; + if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK)) + break; + shift += XA_CHUNK_SHIFT; + if (offset == XA_CHUNK_MASK) + offset = sibs & XA_CHUNK_MASK; + sibs >>= XA_CHUNK_SHIFT; + first >>= XA_CHUNK_SHIFT; + } + + offset = first & XA_CHUNK_MASK; + if (offset + sibs > XA_CHUNK_MASK) + sibs = XA_CHUNK_MASK - offset; + if ((((first + sibs + 1) << shift) - 1) > last) + sibs -= 1; + + xas->xa_shift = shift; + xas->xa_sibs = sibs; +} + +/** + * xa_store_range() - Store this entry at a range of indices in the XArray. + * @xa: XArray. + * @first: First index to affect. + * @last: Last index to affect. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * After this function returns, loads from any index between @first and @last, + * inclusive will return @entry. + * Storing into an existing multislot entry updates the entry of every index. + * The marks associated with @index are unaffected unless @entry is %NULL. + * + * Context: Process context. Takes and releases the xa_lock. May sleep + * if the @gfp flags permit. + * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in + * an XArray, or xa_err(-ENOMEM) if memory allocation failed. + */ +void *xa_store_range(struct xarray *xa, unsigned long first, + unsigned long last, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, 0); + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return XA_ERROR(-EINVAL); + if (last < first) + return XA_ERROR(-EINVAL); + + do { + xas_lock(&xas); + if (entry) { + unsigned int order = (last == ~0UL) ? 64 : + ilog2(last + 1); + xas_set_order(&xas, last, order); + xas_create(&xas); + if (xas_error(&xas)) + goto unlock; + } + do { + xas_set_range(&xas, first, last); + xas_store(&xas, entry); + if (xas_error(&xas)) + goto unlock; + first += xas_size(&xas); + } while (first <= last); +unlock: + xas_unlock(&xas); + } while (xas_nomem(&xas, gfp)); + + return xas_result(&xas, NULL); +} +EXPORT_SYMBOL(xa_store_range); +#endif /* CONFIG_XARRAY_MULTI */ + +/** + * __xa_alloc() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @max: Maximum ID to allocate (inclusive). + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Allocates an unused ID in the range specified by @id and @max. + * Updates the @id pointer with the index, then stores the entry at that + * index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if + * there is no more space in the XArray. + */ +int __xa_alloc(struct xarray *xa, u32 *id, u32 max, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, 0); + int err; + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return -EINVAL; + if (WARN_ON_ONCE(!xa_track_free(xa))) + return -EINVAL; + + if (!entry) + entry = XA_ZERO_ENTRY; + + do { + xas.xa_index = *id; + xas_find_marked(&xas, max, XA_FREE_MARK); + if (xas.xa_node == XAS_RESTART) + xas_set_err(&xas, -ENOSPC); + xas_store(&xas, entry); + xas_clear_mark(&xas, XA_FREE_MARK); + } while (__xas_nomem(&xas, gfp)); + + err = xas_error(&xas); + if (!err) + *id = xas.xa_index; + return err; +} +EXPORT_SYMBOL(__xa_alloc); + +/** + * __xa_set_mark() - Set this mark on this entry while locked. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Attempting to set a mark on a NULL entry does not succeed. + * + * Context: Any context. Expects xa_lock to be held on entry. + */ +void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry = xas_load(&xas); + + if (entry) + xas_set_mark(&xas, mark); +} +EXPORT_SYMBOL_GPL(__xa_set_mark); + +/** + * __xa_clear_mark() - Clear this mark on this entry while locked. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Context: Any context. Expects xa_lock to be held on entry. + */ +void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry = xas_load(&xas); + + if (entry) + xas_clear_mark(&xas, mark); +} +EXPORT_SYMBOL_GPL(__xa_clear_mark); + +/** + * xa_get_mark() - Inquire whether this mark is set on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * This function uses the RCU read lock, so the result may be out of date + * by the time it returns. If you need the result to be stable, use a lock. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: True if the entry at @index has this mark set, false if it doesn't. + */ +bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry; + + rcu_read_lock(); + entry = xas_start(&xas); + while (xas_get_mark(&xas, mark)) { + if (!xa_is_node(entry)) + goto found; + entry = xas_descend(&xas, xa_to_node(entry)); + } + rcu_read_unlock(); + return false; + found: + rcu_read_unlock(); + return true; +} +EXPORT_SYMBOL(xa_get_mark); + +/** + * xa_set_mark() - Set this mark on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Attempting to set a mark on a NULL entry does not succeed. + * + * Context: Process context. Takes and releases the xa_lock. + */ +void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + xa_lock(xa); + __xa_set_mark(xa, index, mark); + xa_unlock(xa); +} +EXPORT_SYMBOL(xa_set_mark); + +/** + * xa_clear_mark() - Clear this mark on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Clearing a mark always succeeds. + * + * Context: Process context. Takes and releases the xa_lock. + */ +void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + xa_lock(xa); + __xa_clear_mark(xa, index, mark); + xa_unlock(xa); +} +EXPORT_SYMBOL(xa_clear_mark); + +/** + * xa_find() - Search the XArray for an entry. + * @xa: XArray. + * @indexp: Pointer to an index. + * @max: Maximum index to search to. + * @filter: Selection criterion. + * + * Finds the entry in @xa which matches the @filter, and has the lowest + * index that is at least @indexp and no more than @max. + * If an entry is found, @indexp is updated to be the index of the entry. + * This function is protected by the RCU read lock, so it may not find + * entries which are being simultaneously added. It will not return an + * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The entry, if found, otherwise %NULL. + */ +void *xa_find(struct xarray *xa, unsigned long *indexp, + unsigned long max, xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp); + void *entry; + + rcu_read_lock(); + do { + if ((__force unsigned int)filter < XA_MAX_MARKS) + entry = xas_find_marked(&xas, max, filter); + else + entry = xas_find(&xas, max); + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + if (entry) + *indexp = xas.xa_index; + return entry; +} +EXPORT_SYMBOL(xa_find); + +/** + * xa_find_after() - Search the XArray for a present entry. + * @xa: XArray. + * @indexp: Pointer to an index. + * @max: Maximum index to search to. + * @filter: Selection criterion. + * + * Finds the entry in @xa which matches the @filter and has the lowest + * index that is above @indexp and no more than @max. + * If an entry is found, @indexp is updated to be the index of the entry. + * This function is protected by the RCU read lock, so it may miss entries + * which are being simultaneously added. It will not return an + * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The pointer, if found, otherwise %NULL. + */ +void *xa_find_after(struct xarray *xa, unsigned long *indexp, + unsigned long max, xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp + 1); + void *entry; + + rcu_read_lock(); + for (;;) { + if ((__force unsigned int)filter < XA_MAX_MARKS) + entry = xas_find_marked(&xas, max, filter); + else + entry = xas_find(&xas, max); + if (xas.xa_shift) { + if (xas.xa_index & ((1UL << xas.xa_shift) - 1)) + continue; + } else { + if (xas.xa_offset < (xas.xa_index & XA_CHUNK_MASK)) + continue; + } + if (!xas_retry(&xas, entry)) + break; + } + rcu_read_unlock(); + + if (entry) + *indexp = xas.xa_index; + return entry; +} +EXPORT_SYMBOL(xa_find_after); + +static unsigned int xas_extract_present(struct xa_state *xas, void **dst, + unsigned long max, unsigned int n) +{ + void *entry; + unsigned int i = 0; + + rcu_read_lock(); + xas_for_each(xas, entry, max) { + if (xas_retry(xas, entry)) + continue; + dst[i++] = entry; + if (i == n) + break; + } + rcu_read_unlock(); + + return i; +} + +static unsigned int xas_extract_marked(struct xa_state *xas, void **dst, + unsigned long max, unsigned int n, xa_mark_t mark) +{ + void *entry; + unsigned int i = 0; + + rcu_read_lock(); + xas_for_each_marked(xas, entry, max, mark) { + if (xas_retry(xas, entry)) + continue; + dst[i++] = entry; + if (i == n) + break; + } + rcu_read_unlock(); + + return i; +} + +/** + * xa_extract() - Copy selected entries from the XArray into a normal array. + * @xa: The source XArray to copy from. + * @dst: The buffer to copy entries into. + * @start: The first index in the XArray eligible to be selected. + * @max: The last index in the XArray eligible to be selected. + * @n: The maximum number of entries to copy. + * @filter: Selection criterion. + * + * Copies up to @n entries that match @filter from the XArray. The + * copied entries will have indices between @start and @max, inclusive. + * + * The @filter may be an XArray mark value, in which case entries which are + * marked with that mark will be copied. It may also be %XA_PRESENT, in + * which case all entries which are not NULL will be copied. + * + * The entries returned may not represent a snapshot of the XArray at a + * moment in time. For example, if another thread stores to index 5, then + * index 10, calling xa_extract() may return the old contents of index 5 + * and the new contents of index 10. Indices not modified while this + * function is running will not be skipped. + * + * If you need stronger guarantees, holding the xa_lock across calls to this + * function will prevent concurrent modification. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The number of entries copied. + */ +unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start, + unsigned long max, unsigned int n, xa_mark_t filter) +{ + XA_STATE(xas, xa, start); + + if (!n) + return 0; + + if ((__force unsigned int)filter < XA_MAX_MARKS) + return xas_extract_marked(&xas, dst, max, n, filter); + return xas_extract_present(&xas, dst, max, n); +} +EXPORT_SYMBOL(xa_extract); + +/** + * xa_destroy() - Free all internal data structures. + * @xa: XArray. + * + * After calling this function, the XArray is empty and has freed all memory + * allocated for its internal data structures. You are responsible for + * freeing the objects referenced by the XArray. + * + * Context: Any context. Takes and releases the xa_lock, interrupt-safe. + */ +void xa_destroy(struct xarray *xa) +{ + XA_STATE(xas, xa, 0); + unsigned long flags; + void *entry; + + xas.xa_node = NULL; + xas_lock_irqsave(&xas, flags); + entry = xa_head_locked(xa); + RCU_INIT_POINTER(xa->xa_head, NULL); + xas_init_marks(&xas); + /* lockdep checks we're still holding the lock in xas_free_nodes() */ + if (xa_is_node(entry)) + xas_free_nodes(&xas, xa_to_node(entry)); + xas_unlock_irqrestore(&xas, flags); +} +EXPORT_SYMBOL(xa_destroy); + +#ifdef XA_DEBUG +void xa_dump_node(const struct xa_node *node) +{ + unsigned i, j; + + if (!node) + return; + if ((unsigned long)node & 3) { + pr_cont("node %px\n", node); + return; + } + + pr_cont("node %px %s %d parent %px shift %d count %d values %d " + "array %px list %px %px marks", + node, node->parent ? "offset" : "max", node->offset, + node->parent, node->shift, node->count, node->nr_values, + node->array, node->private_list.prev, node->private_list.next); + for (i = 0; i < XA_MAX_MARKS; i++) + for (j = 0; j < XA_MARK_LONGS; j++) + pr_cont(" %lx", node->marks[i][j]); + pr_cont("\n"); +} + +void xa_dump_index(unsigned long index, unsigned int shift) +{ + if (!shift) + pr_info("%lu: ", index); + else if (shift >= BITS_PER_LONG) + pr_info("0-%lu: ", ~0UL); + else + pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1)); +} + +void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift) +{ + if (!entry) + return; + + xa_dump_index(index, shift); + + if (xa_is_node(entry)) { + if (shift == 0) { + pr_cont("%px\n", entry); + } else { + unsigned long i; + struct xa_node *node = xa_to_node(entry); + xa_dump_node(node); + for (i = 0; i < XA_CHUNK_SIZE; i++) + xa_dump_entry(node->slots[i], + index + (i << node->shift), node->shift); + } + } else if (xa_is_value(entry)) + pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry), + xa_to_value(entry), entry); + else if (!xa_is_internal(entry)) + pr_cont("%px\n", entry); + else if (xa_is_retry(entry)) + pr_cont("retry (%ld)\n", xa_to_internal(entry)); + else if (xa_is_sibling(entry)) + pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry)); + else if (xa_is_zero(entry)) + pr_cont("zero (%ld)\n", xa_to_internal(entry)); + else + pr_cont("UNKNOWN ENTRY (%px)\n", entry); +} + +void xa_dump(const struct xarray *xa) +{ + void *entry = xa->xa_head; + unsigned int shift = 0; + + pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry, + xa->xa_flags, xa_marked(xa, XA_MARK_0), + xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2)); + if (xa_is_node(entry)) + shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT; + xa_dump_entry(entry, 0, shift); +} +#endif diff --git a/lib/xz/xz_crc32.c b/lib/xz/xz_crc32.c index 25a5d87e2e4c..912aae5fa09e 100644 --- a/lib/xz/xz_crc32.c +++ b/lib/xz/xz_crc32.c @@ -15,7 +15,6 @@ * but they are bigger and use more memory for the lookup table. */ -#include #include "xz_private.h" /* diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h index 482b90f363fe..09360ebb510e 100644 --- a/lib/xz/xz_private.h +++ b/lib/xz/xz_private.h @@ -102,6 +102,10 @@ # endif #endif +#ifndef CRC32_POLY_LE +#define CRC32_POLY_LE 0xedb88320 +#endif + /* * Allocate memory for LZMA2 decoder. xz_dec_lzma2_reset() must be used * before calling xz_dec_lzma2_run(). diff --git a/lib/zlib_inflate/inflate.c b/lib/zlib_inflate/inflate.c index 58a733b10387..48f14cd58c77 100644 --- a/lib/zlib_inflate/inflate.c +++ b/lib/zlib_inflate/inflate.c @@ -382,6 +382,7 @@ int zlib_inflate(z_streamp strm, int flush) strm->adler = state->check = REVERSE(hold); INITBITS(); state->mode = DICT; + /* fall through */ case DICT: if (state->havedict == 0) { RESTORE(); @@ -389,8 +390,10 @@ int zlib_inflate(z_streamp strm, int flush) } strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = TYPE; + /* fall through */ case TYPE: if (flush == Z_BLOCK) goto inf_leave; + /* fall through */ case TYPEDO: if (state->last) { BYTEBITS(); @@ -428,6 +431,7 @@ int zlib_inflate(z_streamp strm, int flush) state->length = (unsigned)hold & 0xffff; INITBITS(); state->mode = COPY; + /* fall through */ case COPY: copy = state->length; if (copy) { @@ -461,6 +465,7 @@ int zlib_inflate(z_streamp strm, int flush) #endif state->have = 0; state->mode = LENLENS; + /* fall through */ case LENLENS: while (state->have < state->ncode) { NEEDBITS(3); @@ -481,6 +486,7 @@ int zlib_inflate(z_streamp strm, int flush) } state->have = 0; state->mode = CODELENS; + /* fall through */ case CODELENS: while (state->have < state->nlen + state->ndist) { for (;;) { @@ -554,6 +560,7 @@ int zlib_inflate(z_streamp strm, int flush) break; } state->mode = LEN; + /* fall through */ case LEN: if (have >= 6 && left >= 258) { RESTORE(); @@ -593,6 +600,7 @@ int zlib_inflate(z_streamp strm, int flush) } state->extra = (unsigned)(this.op) & 15; state->mode = LENEXT; + /* fall through */ case LENEXT: if (state->extra) { NEEDBITS(state->extra); @@ -600,6 +608,7 @@ int zlib_inflate(z_streamp strm, int flush) DROPBITS(state->extra); } state->mode = DIST; + /* fall through */ case DIST: for (;;) { this = state->distcode[BITS(state->distbits)]; @@ -625,6 +634,7 @@ int zlib_inflate(z_streamp strm, int flush) state->offset = (unsigned)this.val; state->extra = (unsigned)(this.op) & 15; state->mode = DISTEXT; + /* fall through */ case DISTEXT: if (state->extra) { NEEDBITS(state->extra); @@ -644,6 +654,7 @@ int zlib_inflate(z_streamp strm, int flush) break; } state->mode = MATCH; + /* fall through */ case MATCH: if (left == 0) goto inf_leave; copy = out - left; @@ -694,6 +705,7 @@ int zlib_inflate(z_streamp strm, int flush) INITBITS(); } state->mode = DONE; + /* fall through */ case DONE: ret = Z_STREAM_END; goto inf_leave; diff --git a/mm/Kconfig b/mm/Kconfig index de64ea658716..d85e39da47ae 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -127,9 +127,6 @@ config SPARSEMEM_VMEMMAP pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. -config HAVE_MEMBLOCK - bool - config HAVE_MEMBLOCK_NODE_MAP bool @@ -142,9 +139,6 @@ config HAVE_GENERIC_GUP config ARCH_DISCARD_MEMBLOCK bool -config NO_BOOTMEM - bool - config MEMORY_ISOLATION bool @@ -379,7 +373,7 @@ config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE select COMPACTION - select RADIX_TREE_MULTIORDER + select XARRAY_MULTI help Transparent Hugepages allows the kernel to use huge pages and huge tlb transparently to the applications whenever possible. @@ -481,7 +475,7 @@ config FRONTSWAP config CMA bool "Contiguous Memory Allocator" - depends on HAVE_MEMBLOCK && MMU + depends on MMU select MIGRATION select MEMORY_ISOLATION help @@ -634,7 +628,6 @@ config MAX_STACK_SIZE_MB config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" default n - depends on NO_BOOTMEM depends on SPARSEMEM depends on !NEED_PER_CPU_KM depends on 64BIT @@ -671,7 +664,7 @@ config ZONE_DEVICE depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP depends on ARCH_HAS_ZONE_DEVICE - select RADIX_TREE_MULTIORDER + select XARRAY_MULTI help Device memory hotplug support allows for establishing pmem, diff --git a/mm/Makefile b/mm/Makefile index 26ef77a3883b..d210cc9d6f80 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -23,9 +23,9 @@ KCOV_INSTRUMENT_vmstat.o := n mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ - mlock.o mmap.o mprotect.o mremap.o msync.o \ - page_vma_mapped.o pagewalk.o pgtable-generic.o \ - rmap.o vmalloc.o + mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ + msync.o page_vma_mapped.o pagewalk.o \ + pgtable-generic.o rmap.o vmalloc.o ifdef CONFIG_CROSS_MEMORY_ATTACH @@ -42,17 +42,11 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ debug.o $(mmu-y) obj-y += init-mm.o - -ifdef CONFIG_NO_BOOTMEM - obj-y += nobootmem.o -else - obj-y += bootmem.o -endif +obj-y += memblock.o ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o endif -obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o obj-$(CONFIG_FRONTSWAP) += frontswap.o diff --git a/mm/bootmem.c b/mm/bootmem.c deleted file mode 100644 index 97db0e8e362b..000000000000 --- a/mm/bootmem.c +++ /dev/null @@ -1,811 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * bootmem - A boot-time physical memory allocator and configurator - * - * Copyright (C) 1999 Ingo Molnar - * 1999 Kanoj Sarcar, SGI - * 2008 Johannes Weiner - * - * Access to this subsystem has to be serialized externally (which is true - * for the boot process anyway). - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "internal.h" - -/** - * DOC: bootmem overview - * - * Bootmem is a boot-time physical memory allocator and configurator. - * - * It is used early in the boot process before the page allocator is - * set up. - * - * Bootmem is based on the most basic of allocators, a First Fit - * allocator which uses a bitmap to represent memory. If a bit is 1, - * the page is allocated and 0 if unallocated. To satisfy allocations - * of sizes smaller than a page, the allocator records the Page Frame - * Number (PFN) of the last allocation and the offset the allocation - * ended at. Subsequent small allocations are merged together and - * stored on the same page. - * - * The information used by the bootmem allocator is represented by - * :c:type:`struct bootmem_data`. An array to hold up to %MAX_NUMNODES - * such structures is statically allocated and then it is discarded - * when the system initialization completes. Each entry in this array - * corresponds to a node with memory. For UMA systems only entry 0 is - * used. - * - * The bootmem allocator is initialized during early architecture - * specific setup. Each architecture is required to supply a - * :c:func:`setup_arch` function which, among other tasks, is - * responsible for acquiring the necessary parameters to initialise - * the boot memory allocator. These parameters define limits of usable - * physical memory: - * - * * @min_low_pfn - the lowest PFN that is available in the system - * * @max_low_pfn - the highest PFN that may be addressed by low - * memory (%ZONE_NORMAL) - * * @max_pfn - the last PFN available to the system. - * - * After those limits are determined, the :c:func:`init_bootmem` or - * :c:func:`init_bootmem_node` function should be called to initialize - * the bootmem allocator. The UMA case should use the `init_bootmem` - * function. It will initialize ``contig_page_data`` structure that - * represents the only memory node in the system. In the NUMA case the - * `init_bootmem_node` function should be called to initialize the - * bootmem allocator for each node. - * - * Once the allocator is set up, it is possible to use either single - * node or NUMA variant of the allocation APIs. - */ - -#ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { - .bdata = &bootmem_node_data[0] -}; -EXPORT_SYMBOL(contig_page_data); -#endif - -unsigned long max_low_pfn; -unsigned long min_low_pfn; -unsigned long max_pfn; -unsigned long long max_possible_pfn; - -bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; - -static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); - -static int bootmem_debug; - -static int __init bootmem_debug_setup(char *buf) -{ - bootmem_debug = 1; - return 0; -} -early_param("bootmem_debug", bootmem_debug_setup); - -#define bdebug(fmt, args...) ({ \ - if (unlikely(bootmem_debug)) \ - pr_info("bootmem::%s " fmt, \ - __func__, ## args); \ -}) - -static unsigned long __init bootmap_bytes(unsigned long pages) -{ - unsigned long bytes = DIV_ROUND_UP(pages, BITS_PER_BYTE); - - return ALIGN(bytes, sizeof(long)); -} - -/** - * bootmem_bootmap_pages - calculate bitmap size in pages - * @pages: number of pages the bitmap has to represent - * - * Return: the number of pages needed to hold the bitmap. - */ -unsigned long __init bootmem_bootmap_pages(unsigned long pages) -{ - unsigned long bytes = bootmap_bytes(pages); - - return PAGE_ALIGN(bytes) >> PAGE_SHIFT; -} - -/* - * link bdata in order - */ -static void __init link_bootmem(bootmem_data_t *bdata) -{ - bootmem_data_t *ent; - - list_for_each_entry(ent, &bdata_list, list) { - if (bdata->node_min_pfn < ent->node_min_pfn) { - list_add_tail(&bdata->list, &ent->list); - return; - } - } - - list_add_tail(&bdata->list, &bdata_list); -} - -/* - * Called once to set up the allocator itself. - */ -static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, - unsigned long mapstart, unsigned long start, unsigned long end) -{ - unsigned long mapsize; - - mminit_validate_memmodel_limits(&start, &end); - bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); - bdata->node_min_pfn = start; - bdata->node_low_pfn = end; - link_bootmem(bdata); - - /* - * Initially all pages are reserved - setup_arch() has to - * register free RAM areas explicitly. - */ - mapsize = bootmap_bytes(end - start); - memset(bdata->node_bootmem_map, 0xff, mapsize); - - bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n", - bdata - bootmem_node_data, start, mapstart, end, mapsize); - - return mapsize; -} - -/** - * init_bootmem_node - register a node as boot memory - * @pgdat: node to register - * @freepfn: pfn where the bitmap for this node is to be placed - * @startpfn: first pfn on the node - * @endpfn: first pfn after the node - * - * Return: the number of bytes needed to hold the bitmap for this node. - */ -unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, - unsigned long startpfn, unsigned long endpfn) -{ - return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn); -} - -/** - * init_bootmem - register boot memory - * @start: pfn where the bitmap is to be placed - * @pages: number of available physical pages - * - * Return: the number of bytes needed to hold the bitmap. - */ -unsigned long __init init_bootmem(unsigned long start, unsigned long pages) -{ - max_low_pfn = pages; - min_low_pfn = start; - return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); -} - -void __init free_bootmem_late(unsigned long physaddr, unsigned long size) -{ - unsigned long cursor, end; - - kmemleak_free_part_phys(physaddr, size); - - cursor = PFN_UP(physaddr); - end = PFN_DOWN(physaddr + size); - - for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); - totalram_pages++; - } -} - -static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) -{ - struct page *page; - unsigned long *map, start, end, pages, cur, count = 0; - - if (!bdata->node_bootmem_map) - return 0; - - map = bdata->node_bootmem_map; - start = bdata->node_min_pfn; - end = bdata->node_low_pfn; - - bdebug("nid=%td start=%lx end=%lx\n", - bdata - bootmem_node_data, start, end); - - while (start < end) { - unsigned long idx, vec; - unsigned shift; - - idx = start - bdata->node_min_pfn; - shift = idx & (BITS_PER_LONG - 1); - /* - * vec holds at most BITS_PER_LONG map bits, - * bit 0 corresponds to start. - */ - vec = ~map[idx / BITS_PER_LONG]; - - if (shift) { - vec >>= shift; - if (end - start >= BITS_PER_LONG) - vec |= ~map[idx / BITS_PER_LONG + 1] << - (BITS_PER_LONG - shift); - } - /* - * If we have a properly aligned and fully unreserved - * BITS_PER_LONG block of pages in front of us, free - * it in one go. - */ - if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) { - int order = ilog2(BITS_PER_LONG); - - __free_pages_bootmem(pfn_to_page(start), start, order); - count += BITS_PER_LONG; - start += BITS_PER_LONG; - } else { - cur = start; - - start = ALIGN(start + 1, BITS_PER_LONG); - while (vec && cur != start) { - if (vec & 1) { - page = pfn_to_page(cur); - __free_pages_bootmem(page, cur, 0); - count++; - } - vec >>= 1; - ++cur; - } - } - } - - cur = bdata->node_min_pfn; - page = virt_to_page(bdata->node_bootmem_map); - pages = bdata->node_low_pfn - bdata->node_min_pfn; - pages = bootmem_bootmap_pages(pages); - count += pages; - while (pages--) - __free_pages_bootmem(page++, cur++, 0); - bdata->node_bootmem_map = NULL; - - bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); - - return count; -} - -static int reset_managed_pages_done __initdata; - -void reset_node_managed_pages(pg_data_t *pgdat) -{ - struct zone *z; - - for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - z->managed_pages = 0; -} - -void __init reset_all_zones_managed_pages(void) -{ - struct pglist_data *pgdat; - - if (reset_managed_pages_done) - return; - - for_each_online_pgdat(pgdat) - reset_node_managed_pages(pgdat); - - reset_managed_pages_done = 1; -} - -unsigned long __init free_all_bootmem(void) -{ - unsigned long total_pages = 0; - bootmem_data_t *bdata; - - reset_all_zones_managed_pages(); - - list_for_each_entry(bdata, &bdata_list, list) - total_pages += free_all_bootmem_core(bdata); - - totalram_pages += total_pages; - - return total_pages; -} - -static void __init __free(bootmem_data_t *bdata, - unsigned long sidx, unsigned long eidx) -{ - unsigned long idx; - - bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data, - sidx + bdata->node_min_pfn, - eidx + bdata->node_min_pfn); - - if (WARN_ON(bdata->node_bootmem_map == NULL)) - return; - - if (bdata->hint_idx > sidx) - bdata->hint_idx = sidx; - - for (idx = sidx; idx < eidx; idx++) - if (!test_and_clear_bit(idx, bdata->node_bootmem_map)) - BUG(); -} - -static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx, - unsigned long eidx, int flags) -{ - unsigned long idx; - int exclusive = flags & BOOTMEM_EXCLUSIVE; - - bdebug("nid=%td start=%lx end=%lx flags=%x\n", - bdata - bootmem_node_data, - sidx + bdata->node_min_pfn, - eidx + bdata->node_min_pfn, - flags); - - if (WARN_ON(bdata->node_bootmem_map == NULL)) - return 0; - - for (idx = sidx; idx < eidx; idx++) - if (test_and_set_bit(idx, bdata->node_bootmem_map)) { - if (exclusive) { - __free(bdata, sidx, idx); - return -EBUSY; - } - bdebug("silent double reserve of PFN %lx\n", - idx + bdata->node_min_pfn); - } - return 0; -} - -static int __init mark_bootmem_node(bootmem_data_t *bdata, - unsigned long start, unsigned long end, - int reserve, int flags) -{ - unsigned long sidx, eidx; - - bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n", - bdata - bootmem_node_data, start, end, reserve, flags); - - BUG_ON(start < bdata->node_min_pfn); - BUG_ON(end > bdata->node_low_pfn); - - sidx = start - bdata->node_min_pfn; - eidx = end - bdata->node_min_pfn; - - if (reserve) - return __reserve(bdata, sidx, eidx, flags); - else - __free(bdata, sidx, eidx); - return 0; -} - -static int __init mark_bootmem(unsigned long start, unsigned long end, - int reserve, int flags) -{ - unsigned long pos; - bootmem_data_t *bdata; - - pos = start; - list_for_each_entry(bdata, &bdata_list, list) { - int err; - unsigned long max; - - if (pos < bdata->node_min_pfn || - pos >= bdata->node_low_pfn) { - BUG_ON(pos != start); - continue; - } - - max = min(bdata->node_low_pfn, end); - - err = mark_bootmem_node(bdata, pos, max, reserve, flags); - if (reserve && err) { - mark_bootmem(start, pos, 0, 0); - return err; - } - - if (max == end) - return 0; - pos = bdata->node_low_pfn; - } - BUG(); -} - -void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, - unsigned long size) -{ - unsigned long start, end; - - kmemleak_free_part_phys(physaddr, size); - - start = PFN_UP(physaddr); - end = PFN_DOWN(physaddr + size); - - mark_bootmem_node(pgdat->bdata, start, end, 0, 0); -} - -void __init free_bootmem(unsigned long physaddr, unsigned long size) -{ - unsigned long start, end; - - kmemleak_free_part_phys(physaddr, size); - - start = PFN_UP(physaddr); - end = PFN_DOWN(physaddr + size); - - mark_bootmem(start, end, 0, 0); -} - -/** - * reserve_bootmem_node - mark a page range as reserved - * @pgdat: node the range resides on - * @physaddr: starting address of the range - * @size: size of the range in bytes - * @flags: reservation flags (see linux/bootmem.h) - * - * Partial pages will be reserved. - * - * The range must reside completely on the specified node. - * - * Return: 0 on success, -errno on failure. - */ -int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, - unsigned long size, int flags) -{ - unsigned long start, end; - - start = PFN_DOWN(physaddr); - end = PFN_UP(physaddr + size); - - return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); -} - -/** - * reserve_bootmem - mark a page range as reserved - * @addr: starting address of the range - * @size: size of the range in bytes - * @flags: reservation flags (see linux/bootmem.h) - * - * Partial pages will be reserved. - * - * The range must be contiguous but may span node boundaries. - * - * Return: 0 on success, -errno on failure. - */ -int __init reserve_bootmem(unsigned long addr, unsigned long size, - int flags) -{ - unsigned long start, end; - - start = PFN_DOWN(addr); - end = PFN_UP(addr + size); - - return mark_bootmem(start, end, 1, flags); -} - -static unsigned long __init align_idx(struct bootmem_data *bdata, - unsigned long idx, unsigned long step) -{ - unsigned long base = bdata->node_min_pfn; - - /* - * Align the index with respect to the node start so that the - * combination of both satisfies the requested alignment. - */ - - return ALIGN(base + idx, step) - base; -} - -static unsigned long __init align_off(struct bootmem_data *bdata, - unsigned long off, unsigned long align) -{ - unsigned long base = PFN_PHYS(bdata->node_min_pfn); - - /* Same as align_idx for byte offsets */ - - return ALIGN(base + off, align) - base; -} - -static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ - unsigned long fallback = 0; - unsigned long min, max, start, sidx, midx, step; - - bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", - bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, - align, goal, limit); - - BUG_ON(!size); - BUG_ON(align & (align - 1)); - BUG_ON(limit && goal + size > limit); - - if (!bdata->node_bootmem_map) - return NULL; - - min = bdata->node_min_pfn; - max = bdata->node_low_pfn; - - goal >>= PAGE_SHIFT; - limit >>= PAGE_SHIFT; - - if (limit && max > limit) - max = limit; - if (max <= min) - return NULL; - - step = max(align >> PAGE_SHIFT, 1UL); - - if (goal && min < goal && goal < max) - start = ALIGN(goal, step); - else - start = ALIGN(min, step); - - sidx = start - bdata->node_min_pfn; - midx = max - bdata->node_min_pfn; - - if (bdata->hint_idx > sidx) { - /* - * Handle the valid case of sidx being zero and still - * catch the fallback below. - */ - fallback = sidx + 1; - sidx = align_idx(bdata, bdata->hint_idx, step); - } - - while (1) { - int merge; - void *region; - unsigned long eidx, i, start_off, end_off; -find_block: - sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); - sidx = align_idx(bdata, sidx, step); - eidx = sidx + PFN_UP(size); - - if (sidx >= midx || eidx > midx) - break; - - for (i = sidx; i < eidx; i++) - if (test_bit(i, bdata->node_bootmem_map)) { - sidx = align_idx(bdata, i, step); - if (sidx == i) - sidx += step; - goto find_block; - } - - if (bdata->last_end_off & (PAGE_SIZE - 1) && - PFN_DOWN(bdata->last_end_off) + 1 == sidx) - start_off = align_off(bdata, bdata->last_end_off, align); - else - start_off = PFN_PHYS(sidx); - - merge = PFN_DOWN(start_off) < sidx; - end_off = start_off + size; - - bdata->last_end_off = end_off; - bdata->hint_idx = PFN_UP(end_off); - - /* - * Reserve the area now: - */ - if (__reserve(bdata, PFN_DOWN(start_off) + merge, - PFN_UP(end_off), BOOTMEM_EXCLUSIVE)) - BUG(); - - region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + - start_off); - memset(region, 0, size); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. - */ - kmemleak_alloc(region, size, 0, 0); - return region; - } - - if (fallback) { - sidx = align_idx(bdata, fallback - 1, step); - fallback = 0; - goto find_block; - } - - return NULL; -} - -static void * __init alloc_bootmem_core(unsigned long size, - unsigned long align, - unsigned long goal, - unsigned long limit) -{ - bootmem_data_t *bdata; - void *region; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); - - list_for_each_entry(bdata, &bdata_list, list) { - if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) - continue; - if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) - break; - - region = alloc_bootmem_bdata(bdata, size, align, goal, limit); - if (region) - return region; - } - - return NULL; -} - -static void * __init ___alloc_bootmem_nopanic(unsigned long size, - unsigned long align, - unsigned long goal, - unsigned long limit) -{ - void *ptr; - -restart: - ptr = alloc_bootmem_core(size, align, goal, limit); - if (ptr) - return ptr; - if (goal) { - goal = 0; - goto restart; - } - - return NULL; -} - -void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, - unsigned long goal) -{ - unsigned long limit = 0; - - return ___alloc_bootmem_nopanic(size, align, goal, limit); -} - -static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ - void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); - - if (mem) - return mem; - /* - * Whoops, we cannot satisfy the allocation request. - */ - pr_alert("bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; -} - -void * __init __alloc_bootmem(unsigned long size, unsigned long align, - unsigned long goal) -{ - unsigned long limit = 0; - - return ___alloc_bootmem(size, align, goal, limit); -} - -void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ - void *ptr; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -again: - - /* do not panic in alloc_bootmem_bdata() */ - if (limit && goal + size > limit) - limit = 0; - - ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); - if (ptr) - return ptr; - - ptr = alloc_bootmem_core(size, align, goal, limit); - if (ptr) - return ptr; - - if (goal) { - goal = 0; - goto again; - } - - return NULL; -} - -void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); -} - -void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal, - unsigned long limit) -{ - void *ptr; - - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); - if (ptr) - return ptr; - - pr_alert("bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; -} - -void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - return ___alloc_bootmem_node(pgdat, size, align, goal, 0); -} - -void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ -#ifdef MAX_DMA32_PFN - unsigned long end_pfn; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - /* update goal according ...MAX_DMA32_PFN */ - end_pfn = pgdat_end_pfn(pgdat); - - if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && - (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { - void *ptr; - unsigned long new_goal; - - new_goal = MAX_DMA32_PFN << PAGE_SHIFT; - ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, - new_goal, 0); - if (ptr) - return ptr; - } -#endif - - return __alloc_bootmem_node(pgdat, size, align, goal); - -} - -void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, - unsigned long goal) -{ - return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); -} - -void * __init __alloc_bootmem_low_nopanic(unsigned long size, - unsigned long align, - unsigned long goal) -{ - return ___alloc_bootmem_nopanic(size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); -} - -void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - return ___alloc_bootmem_node(pgdat, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); -} diff --git a/mm/compaction.c b/mm/compaction.c index faca45ebe62d..7c607479de4a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "internal.h" #ifdef CONFIG_COMPACTION @@ -2068,11 +2069,15 @@ static int kcompactd(void *p) pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; while (!kthread_should_stop()) { + unsigned long pflags; + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); wait_event_freezable(pgdat->kcompactd_wait, kcompactd_work_requested(pgdat)); + psi_memstall_enter(&pflags); kcompactd_do_work(pgdat); + psi_memstall_leave(&pflags); } return 0; diff --git a/mm/debug.c b/mm/debug.c index bd10aad8539a..cdacba12e09a 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "internal.h" @@ -175,4 +176,49 @@ void dump_mm(const struct mm_struct *mm) ); } +static bool page_init_poisoning __read_mostly = true; + +static int __init setup_vm_debug(char *str) +{ + bool __page_init_poisoning = true; + + /* + * Calling vm_debug with no arguments is equivalent to requesting + * to enable all debugging options we can control. + */ + if (*str++ != '=' || !*str) + goto out; + + __page_init_poisoning = false; + if (*str == '-') + goto out; + + while (*str) { + switch (tolower(*str)) { + case'p': + __page_init_poisoning = true; + break; + default: + pr_err("vm_debug option '%c' unknown. skipped\n", + *str); + } + + str++; + } +out: + if (page_init_poisoning && !__page_init_poisoning) + pr_warn("Page struct poisoning disabled by kernel command line option 'vm_debug'\n"); + + page_init_poisoning = __page_init_poisoning; + + return 1; +} +__setup("vm_debug", setup_vm_debug); + +void page_init_poison(struct page *page, size_t size) +{ + if (page_init_poisoning) + memset(page, PAGE_POISON_PATTERN, size); +} +EXPORT_SYMBOL_GPL(page_init_poison); #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/filemap.c b/mm/filemap.c index 52517f28e6f4..81adec8ee02c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include "internal.h" #define CREATE_TRACE_POINTS @@ -111,60 +113,26 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ -static int page_cache_tree_insert(struct address_space *mapping, - struct page *page, void **shadowp) -{ - struct radix_tree_node *node; - void **slot; - int error; - - error = __radix_tree_create(&mapping->i_pages, page->index, 0, - &node, &slot); - if (error) - return error; - if (*slot) { - void *p; - - p = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (!radix_tree_exceptional_entry(p)) - return -EEXIST; - - mapping->nrexceptional--; - if (shadowp) - *shadowp = p; - } - __radix_tree_replace(&mapping->i_pages, node, slot, page, - workingset_lookup_update(mapping)); - mapping->nrpages++; - return 0; -} - -static void page_cache_tree_delete(struct address_space *mapping, +static void page_cache_delete(struct address_space *mapping, struct page *page, void *shadow) { - int i, nr; + XA_STATE(xas, &mapping->i_pages, page->index); + unsigned int nr = 1; + + mapping_set_update(&xas, mapping); - /* hugetlb pages are represented by one entry in the radix tree */ - nr = PageHuge(page) ? 1 : hpage_nr_pages(page); + /* hugetlb pages are represented by a single entry in the xarray */ + if (!PageHuge(page)) { + xas_set_order(&xas, page->index, compound_order(page)); + nr = 1U << compound_order(page); + } VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(nr != 1 && shadow, page); - for (i = 0; i < nr; i++) { - struct radix_tree_node *node; - void **slot; - - __radix_tree_lookup(&mapping->i_pages, page->index + i, - &node, &slot); - - VM_BUG_ON_PAGE(!node && nr != 1, page); - - radix_tree_clear_tags(&mapping->i_pages, node, slot); - __radix_tree_replace(&mapping->i_pages, node, slot, shadow, - workingset_lookup_update(mapping)); - } + xas_store(&xas, shadow); + xas_init_marks(&xas); page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ @@ -263,7 +231,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) trace_mm_filemap_delete_from_page_cache(page); unaccount_page_cache_page(mapping, page); - page_cache_tree_delete(mapping, page, shadow); + page_cache_delete(mapping, page, shadow); } static void page_cache_free_page(struct address_space *mapping, @@ -306,7 +274,7 @@ void delete_from_page_cache(struct page *page) EXPORT_SYMBOL(delete_from_page_cache); /* - * page_cache_tree_delete_batch - delete several pages from page cache + * page_cache_delete_batch - delete several pages from page cache * @mapping: the mapping to which pages belong * @pvec: pagevec with pages to delete * @@ -319,24 +287,19 @@ EXPORT_SYMBOL(delete_from_page_cache); * * The function expects the i_pages lock to be held. */ -static void -page_cache_tree_delete_batch(struct address_space *mapping, +static void page_cache_delete_batch(struct address_space *mapping, struct pagevec *pvec) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); int total_pages = 0; int i = 0, tail_pages = 0; struct page *page; - pgoff_t start; - start = pvec->pages[0]->index; - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { + mapping_set_update(&xas, mapping); + xas_for_each(&xas, page, ULONG_MAX) { if (i >= pagevec_count(pvec) && !tail_pages) break; - page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; if (!tail_pages) { /* @@ -344,8 +307,11 @@ page_cache_tree_delete_batch(struct address_space *mapping, * have our pages locked so they are protected from * being removed. */ - if (page != pvec->pages[i]) + if (page != pvec->pages[i]) { + VM_BUG_ON_PAGE(page->index > + pvec->pages[i]->index, page); continue; + } WARN_ON_ONCE(!PageLocked(page)); if (PageTransHuge(page) && !PageHuge(page)) tail_pages = HPAGE_PMD_NR - 1; @@ -356,11 +322,11 @@ page_cache_tree_delete_batch(struct address_space *mapping, */ i++; } else { + VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages + != pvec->pages[i]->index, page); tail_pages--; } - radix_tree_clear_tags(&mapping->i_pages, iter.node, slot); - __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL, - workingset_lookup_update(mapping)); + xas_store(&xas, NULL); total_pages++; } mapping->nrpages -= total_pages; @@ -381,7 +347,7 @@ void delete_from_page_cache_batch(struct address_space *mapping, unaccount_page_cache_page(mapping, pvec->pages[i]); } - page_cache_tree_delete_batch(mapping, pvec); + page_cache_delete_batch(mapping, pvec); xa_unlock_irqrestore(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) @@ -491,20 +457,31 @@ EXPORT_SYMBOL(filemap_flush); bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - pgoff_t index = start_byte >> PAGE_SHIFT; - pgoff_t end = end_byte >> PAGE_SHIFT; struct page *page; + XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); + pgoff_t max = end_byte >> PAGE_SHIFT; if (end_byte < start_byte) return false; - if (mapping->nrpages == 0) - return false; + rcu_read_lock(); + for (;;) { + page = xas_find(&xas, max); + if (xas_retry(&xas, page)) + continue; + /* Shadow entries don't count */ + if (xa_is_value(page)) + continue; + /* + * We don't need to try to pin this page; we're about to + * release the RCU lock anyway. It is enough to know that + * there was a page here recently. + */ + break; + } + rcu_read_unlock(); - if (!find_get_pages_range(mapping, &index, end, 1, &page)) - return false; - put_page(page); - return true; + return page != NULL; } EXPORT_SYMBOL(filemap_range_has_page); @@ -775,51 +752,44 @@ EXPORT_SYMBOL(file_write_and_wait_range); * locked. This function does not add the new page to the LRU, the * caller must do that. * - * The remove + add is atomic. The only way this function can fail is - * memory allocation failure. + * The remove + add is atomic. This function cannot fail. */ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { - int error; + struct address_space *mapping = old->mapping; + void (*freepage)(struct page *) = mapping->a_ops->freepage; + pgoff_t offset = old->index; + XA_STATE(xas, &mapping->i_pages, offset); + unsigned long flags; VM_BUG_ON_PAGE(!PageLocked(old), old); VM_BUG_ON_PAGE(!PageLocked(new), new); VM_BUG_ON_PAGE(new->mapping, new); - error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK); - if (!error) { - struct address_space *mapping = old->mapping; - void (*freepage)(struct page *); - unsigned long flags; - - pgoff_t offset = old->index; - freepage = mapping->a_ops->freepage; - - get_page(new); - new->mapping = mapping; - new->index = offset; + get_page(new); + new->mapping = mapping; + new->index = offset; - xa_lock_irqsave(&mapping->i_pages, flags); - __delete_from_page_cache(old, NULL); - error = page_cache_tree_insert(mapping, new, NULL); - BUG_ON(error); + xas_lock_irqsave(&xas, flags); + xas_store(&xas, new); - /* - * hugetlb pages do not participate in page cache accounting. - */ - if (!PageHuge(new)) - __inc_node_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(new)) - __inc_node_page_state(new, NR_SHMEM); - xa_unlock_irqrestore(&mapping->i_pages, flags); - mem_cgroup_migrate(old, new); - radix_tree_preload_end(); - if (freepage) - freepage(old); - put_page(old); - } + old->mapping = NULL; + /* hugetlb pages do not participate in page cache accounting. */ + if (!PageHuge(old)) + __dec_node_page_state(new, NR_FILE_PAGES); + if (!PageHuge(new)) + __inc_node_page_state(new, NR_FILE_PAGES); + if (PageSwapBacked(old)) + __dec_node_page_state(new, NR_SHMEM); + if (PageSwapBacked(new)) + __inc_node_page_state(new, NR_SHMEM); + xas_unlock_irqrestore(&xas, flags); + mem_cgroup_migrate(old, new); + if (freepage) + freepage(old); + put_page(old); - return error; + return 0; } EXPORT_SYMBOL_GPL(replace_page_cache_page); @@ -828,12 +798,15 @@ static int __add_to_page_cache_locked(struct page *page, pgoff_t offset, gfp_t gfp_mask, void **shadowp) { + XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); struct mem_cgroup *memcg; int error; + void *old; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); + mapping_set_update(&xas, mapping); if (!huge) { error = mem_cgroup_try_charge(page, current->mm, @@ -842,39 +815,47 @@ static int __add_to_page_cache_locked(struct page *page, return error; } - error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK); - if (error) { - if (!huge) - mem_cgroup_cancel_charge(page, memcg, false); - return error; - } - get_page(page); page->mapping = mapping; page->index = offset; - xa_lock_irq(&mapping->i_pages); - error = page_cache_tree_insert(mapping, page, shadowp); - radix_tree_preload_end(); - if (unlikely(error)) - goto err_insert; + do { + xas_lock_irq(&xas); + old = xas_load(&xas); + if (old && !xa_is_value(old)) + xas_set_err(&xas, -EEXIST); + xas_store(&xas, page); + if (xas_error(&xas)) + goto unlock; + + if (xa_is_value(old)) { + mapping->nrexceptional--; + if (shadowp) + *shadowp = old; + } + mapping->nrpages++; + + /* hugetlb pages do not participate in page cache accounting */ + if (!huge) + __inc_node_page_state(page, NR_FILE_PAGES); +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); + + if (xas_error(&xas)) + goto error; - /* hugetlb pages do not participate in page cache accounting. */ - if (!huge) - __inc_node_page_state(page, NR_FILE_PAGES); - xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); return 0; -err_insert: +error: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return error; + return xas_error(&xas); } /** @@ -915,12 +896,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, * data from the working set, only to cache data that will * get overwritten with something else, is a waste of memory. */ - if (!(gfp_mask & __GFP_WRITE) && - shadow && workingset_refault(shadow)) { - SetPageActive(page); - workingset_activation(page); - } else - ClearPageActive(page); + WARN_ON_ONCE(PageActive(page)); + if (!(gfp_mask & __GFP_WRITE) && shadow) + workingset_refault(page, shadow); lru_cache_add(page); } return ret; @@ -1076,8 +1054,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; + bool thrashing = false; + unsigned long pflags; int ret = 0; + if (bit_nr == PG_locked && + !PageUptodate(page) && PageWorkingset(page)) { + if (!PageSwapBacked(page)) + delayacct_thrashing_start(); + psi_memstall_enter(&pflags); + thrashing = true; + } + init_wait(wait); wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; @@ -1116,6 +1104,12 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, finish_wait(q, wait); + if (thrashing) { + if (!PageSwapBacked(page)) + delayacct_thrashing_end(); + psi_memstall_leave(&pflags); + } + /* * A signal could leave PageWaiters set. Clearing it here if * !waitqueue_active would be possible (by open-coding finish_wait), @@ -1326,86 +1320,76 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, } /** - * page_cache_next_hole - find the next hole (not-present entry) - * @mapping: mapping - * @index: index - * @max_scan: maximum range to search - * - * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the - * lowest indexed hole. - * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'return - index >= - * max_scan' will be true). In rare cases of index wrap-around, 0 will - * be returned. - * - * page_cache_next_hole may be called under rcu_read_lock. However, - * like radix_tree_gang_lookup, this will not atomically search a - * snapshot of the tree at a single point in time. For example, if a - * hole is created at index 5, then subsequently a hole is created at - * index 10, page_cache_next_hole covering both indexes may return 10 - * if called under rcu_read_lock. + * page_cache_next_miss() - Find the next gap in the page cache. + * @mapping: Mapping. + * @index: Index. + * @max_scan: Maximum range to search. + * + * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the + * gap with the lowest index. + * + * This function may be called under the rcu_read_lock. However, this will + * not atomically search a snapshot of the cache at a single point in time. + * For example, if a gap is created at index 5, then subsequently a gap is + * created at index 10, page_cache_next_miss covering both indices may + * return 10 if called under the rcu_read_lock. + * + * Return: The index of the gap if found, otherwise an index outside the + * range specified (in which case 'return - index >= max_scan' will be true). + * In the rare case of index wrap-around, 0 will be returned. */ -pgoff_t page_cache_next_hole(struct address_space *mapping, +pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { - unsigned long i; + XA_STATE(xas, &mapping->i_pages, index); - for (i = 0; i < max_scan; i++) { - struct page *page; - - page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || radix_tree_exceptional_entry(page)) + while (max_scan--) { + void *entry = xas_next(&xas); + if (!entry || xa_is_value(entry)) break; - index++; - if (index == 0) + if (xas.xa_index == 0) break; } - return index; + return xas.xa_index; } -EXPORT_SYMBOL(page_cache_next_hole); +EXPORT_SYMBOL(page_cache_next_miss); /** - * page_cache_prev_hole - find the prev hole (not-present entry) - * @mapping: mapping - * @index: index - * @max_scan: maximum range to search - * - * Search backwards in the range [max(index-max_scan+1, 0), index] for - * the first hole. - * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'index - return >= - * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX - * will be returned. - * - * page_cache_prev_hole may be called under rcu_read_lock. However, - * like radix_tree_gang_lookup, this will not atomically search a - * snapshot of the tree at a single point in time. For example, if a - * hole is created at index 10, then subsequently a hole is created at - * index 5, page_cache_prev_hole covering both indexes may return 5 if - * called under rcu_read_lock. + * page_cache_prev_miss() - Find the next gap in the page cache. + * @mapping: Mapping. + * @index: Index. + * @max_scan: Maximum range to search. + * + * Search the range [max(index - max_scan + 1, 0), index] for the + * gap with the highest index. + * + * This function may be called under the rcu_read_lock. However, this will + * not atomically search a snapshot of the cache at a single point in time. + * For example, if a gap is created at index 10, then subsequently a gap is + * created at index 5, page_cache_prev_miss() covering both indices may + * return 5 if called under the rcu_read_lock. + * + * Return: The index of the gap if found, otherwise an index outside the + * range specified (in which case 'index - return >= max_scan' will be true). + * In the rare case of wrap-around, ULONG_MAX will be returned. */ -pgoff_t page_cache_prev_hole(struct address_space *mapping, +pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { - unsigned long i; - - for (i = 0; i < max_scan; i++) { - struct page *page; + XA_STATE(xas, &mapping->i_pages, index); - page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || radix_tree_exceptional_entry(page)) + while (max_scan--) { + void *entry = xas_prev(&xas); + if (!entry || xa_is_value(entry)) break; - index--; - if (index == ULONG_MAX) + if (xas.xa_index == ULONG_MAX) break; } - return index; + return xas.xa_index; } -EXPORT_SYMBOL(page_cache_prev_hole); +EXPORT_SYMBOL(page_cache_prev_miss); /** * find_get_entry - find and get a page cache entry @@ -1422,47 +1406,40 @@ EXPORT_SYMBOL(page_cache_prev_hole); */ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { - void **pagep; + XA_STATE(xas, &mapping->i_pages, offset); struct page *head, *page; rcu_read_lock(); repeat: - page = NULL; - pagep = radix_tree_lookup_slot(&mapping->i_pages, offset); - if (pagep) { - page = radix_tree_deref_slot(pagep); - if (unlikely(!page)) - goto out; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto repeat; - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Return - * it without attempting to raise page count. - */ - goto out; - } + xas_reset(&xas); + page = xas_load(&xas); + if (xas_retry(&xas, page)) + goto repeat; + /* + * A shadow entry of a recently evicted page, or a swap entry from + * shmem/tmpfs. Return it without attempting to raise page count. + */ + if (!page || xa_is_value(page)) + goto out; - head = compound_head(page); - if (!page_cache_get_speculative(head)) - goto repeat; + head = compound_head(page); + if (!page_cache_get_speculative(head)) + goto repeat; - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } - /* - * Has the page moved? - * This is part of the lockless pagecache protocol. See - * include/linux/pagemap.h for details. - */ - if (unlikely(page != *pagep)) { - put_page(head); - goto repeat; - } + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != xas_reload(&xas))) { + put_page(head); + goto repeat; } out: rcu_read_unlock(); @@ -1493,7 +1470,7 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) repeat: page = find_get_entry(mapping, offset); - if (page && !radix_tree_exception(page)) { + if (page && !xa_is_value(page)) { lock_page(page); /* Has the page been truncated? */ if (unlikely(page_mapping(page) != mapping)) { @@ -1539,7 +1516,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, repeat: page = find_get_entry(mapping, offset); - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) page = NULL; if (!page) goto no_page; @@ -1625,53 +1602,48 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices) { - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; unsigned int ret = 0; - struct radix_tree_iter iter; if (!nr_entries) return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - struct page *head, *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each(&xas, page, ULONG_MAX) { + struct page *head; + if (xas_retry(&xas, page)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + if (xa_is_value(page)) goto export; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; + export: - indices[ret] = iter.index; + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; @@ -1702,64 +1674,50 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pgoff_t end, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, *start); + struct page *page; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) { - struct page *head, *page; - - if (iter.index > end) - break; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each(&xas, page, end) { + struct page *head; + if (xas_retry(&xas, page)) continue; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Skip - * over it. - */ + /* Skip over shadow, swap and DAX entries */ + if (xa_is_value(page)) continue; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; pages[ret] = page; if (++ret == nr_pages) { - *start = pages[ret - 1]->index + 1; + *start = page->index + 1; goto out; } + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } /* * We come here when there is no page beyond @end. We take care to not * overflow the index @start as it confuses some of the callers. This - * breaks the iteration when there is page at index -1 but that is + * breaks the iteration when there is a page at index -1 but that is * already broken anyway. */ if (end == (pgoff_t)-1) @@ -1787,57 +1745,43 @@ out: unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, index); + struct page *page; unsigned int ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) { - struct page *head, *page; -repeat: - page = radix_tree_deref_slot(slot); - /* The hole, there no reason to continue */ - if (unlikely(!page)) - break; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Stop - * looking for contiguous pages. - */ + for (page = xas_load(&xas); page; page = xas_next(&xas)) { + struct page *head; + if (xas_retry(&xas, page)) + continue; + /* + * If the entry has been swapped out, we can stop looking. + * No current caller is looking for DAX entries. + */ + if (xa_is_value(page)) break; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; /* * must check mapping and index after taking the ref. * otherwise we can get both false positives and false * negatives, which is just confusing to the caller. */ - if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { + if (!page->mapping || page_to_pgoff(page) != xas.xa_index) { put_page(page); break; } @@ -1845,6 +1789,11 @@ repeat: pages[ret] = page; if (++ret == nr_pages) break; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; @@ -1864,74 +1813,58 @@ EXPORT_SYMBOL(find_get_pages_contig); * @tag. We update @index to index the next page for the traversal. */ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, - pgoff_t end, int tag, unsigned int nr_pages, + pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, *index); + struct page *page; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) { - struct page *head, *page; - - if (iter.index > end) - break; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each_marked(&xas, page, end, tag) { + struct page *head; + if (xas_retry(&xas, page)) continue; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page. - * - * Those entries should never be tagged, but - * this tree walk is lockless and the tags are - * looked up in bulk, one radix tree node at a - * time, so there is a sizable window for page - * reclaim to evict a page we saw tagged. - * - * Skip over it. - */ + /* + * Shadow entries should never be tagged, but this iteration + * is lockless so there is a window for page reclaim to evict + * a page we saw tagged. Skip over it. + */ + if (xa_is_value(page)) continue; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; pages[ret] = page; if (++ret == nr_pages) { - *index = pages[ret - 1]->index + 1; + *index = page->index + 1; goto out; } + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } /* - * We come here when we got at @end. We take care to not overflow the + * We come here when we got to @end. We take care to not overflow the * index @index as it confuses some of the callers. This breaks the - * iteration when there is page at index -1 but that is already broken - * anyway. + * iteration when there is a page at index -1 but that is already + * broken anyway. */ if (end == (pgoff_t)-1) *index = (pgoff_t)-1; @@ -1957,57 +1890,51 @@ EXPORT_SYMBOL(find_get_pages_range_tag); * @tag. */ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, - int tag, unsigned int nr_entries, + xa_mark_t tag, unsigned int nr_entries, struct page **entries, pgoff_t *indices) { - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; unsigned int ret = 0; - struct radix_tree_iter iter; if (!nr_entries) return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) { - struct page *head, *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each_marked(&xas, page, ULONG_MAX, tag) { + struct page *head; + if (xas_retry(&xas, page)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + if (xa_is_value(page)) goto export; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; + export: - indices[ret] = iter.index; + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; @@ -2122,7 +2049,7 @@ find_page: !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; /* pipes can't handle partially uptodate pages */ - if (unlikely(iter->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(iter))) goto page_not_up_to_date; if (!trylock_page(page)) goto page_not_up_to_date; @@ -2581,9 +2508,7 @@ no_cached_page: * system is low on memory, or a problem occurs while trying * to schedule I/O. */ - if (error == -ENOMEM) - return VM_FAULT_OOM; - return VM_FAULT_SIGBUS; + return vmf_error(error); page_not_uptodate: /* @@ -2613,45 +2538,31 @@ EXPORT_SYMBOL(filemap_fault); void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { - struct radix_tree_iter iter; - void **slot; struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; + XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *head, *page; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) { - if (iter.index > end_pgoff) - break; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) - goto next; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } + xas_for_each(&xas, page, end_pgoff) { + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) goto next; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto next; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto skip; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto skip; if (!PageUptodate(page) || PageReadahead(page) || @@ -2670,10 +2581,10 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; + vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT; if (vmf->pte) - vmf->pte += iter.index - last_pgoff; - last_pgoff = iter.index; + vmf->pte += xas.xa_index - last_pgoff; + last_pgoff = xas.xa_index; if (alloc_set_pte(vmf, NULL, page)) goto unlock; unlock_page(page); @@ -2686,8 +2597,6 @@ next: /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) break; - if (iter.index == end_pgoff) - break; } rcu_read_unlock(); } @@ -2748,9 +2657,9 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) return generic_file_mmap(file, vma); } #else -int filemap_page_mkwrite(struct vm_fault *vmf) +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { - return -ENOSYS; + return VM_FAULT_SIGBUS; } int generic_file_mmap(struct file * file, struct vm_area_struct * vma) { @@ -2797,7 +2706,7 @@ repeat: put_page(page); if (err == -EEXIST) goto repeat; - /* Presumably ENOMEM for radix tree node */ + /* Presumably ENOMEM for xarray node */ return ERR_PTR(err); } @@ -2915,6 +2824,42 @@ struct page *read_cache_page_gfp(struct address_space *mapping, } EXPORT_SYMBOL(read_cache_page_gfp); +/* + * Don't operate on ranges the page cache doesn't support, and don't exceed the + * LFS limits. If pos is under the limit it becomes a short access. If it + * exceeds the limit we return -EFBIG. + */ +static int generic_access_check_limits(struct file *file, loff_t pos, + loff_t *count) +{ + struct inode *inode = file->f_mapping->host; + loff_t max_size = inode->i_sb->s_maxbytes; + + if (!(file->f_flags & O_LARGEFILE)) + max_size = MAX_NON_LFS; + + if (unlikely(pos >= max_size)) + return -EFBIG; + *count = min(*count, max_size - pos); + return 0; +} + +static int generic_write_check_limits(struct file *file, loff_t pos, + loff_t *count) +{ + loff_t limit = rlimit(RLIMIT_FSIZE); + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + *count = min(*count, limit - pos); + } + + return generic_access_check_limits(file, pos, count); +} + /* * Performs necessary checks before doing a write * @@ -2926,8 +2871,8 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - unsigned long limit = rlimit(RLIMIT_FSIZE); - loff_t pos; + loff_t count; + int ret; if (!iov_iter_count(from)) return 0; @@ -2936,43 +2881,99 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); - pos = iocb->ki_pos; - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) return -EINVAL; - if (limit != RLIM_INFINITY) { - if (iocb->ki_pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - iov_iter_truncate(from, limit - (unsigned long)pos); - } + count = iov_iter_count(from); + ret = generic_write_check_limits(file, iocb->ki_pos, &count); + if (ret) + return ret; + + iov_iter_truncate(from, count); + return iov_iter_count(from); +} +EXPORT_SYMBOL(generic_write_checks); + +/* + * Performs necessary checks before doing a clone. + * + * Can adjust amount of bytes to clone. + * Returns appropriate error code that caller should return or + * zero in case the clone should be allowed. + */ +int generic_remap_checks(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *req_count, unsigned int remap_flags) +{ + struct inode *inode_in = file_in->f_mapping->host; + struct inode *inode_out = file_out->f_mapping->host; + uint64_t count = *req_count; + uint64_t bcount; + loff_t size_in, size_out; + loff_t bs = inode_out->i_sb->s_blocksize; + int ret; + + /* The start of both ranges must be aligned to an fs block. */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) + return -EINVAL; + + /* Ensure offsets don't wrap. */ + if (pos_in + count < pos_in || pos_out + count < pos_out) + return -EINVAL; + + size_in = i_size_read(inode_in); + size_out = i_size_read(inode_out); + + /* Dedupe requires both ranges to be within EOF. */ + if ((remap_flags & REMAP_FILE_DEDUP) && + (pos_in >= size_in || pos_in + count > size_in || + pos_out >= size_out || pos_out + count > size_out)) + return -EINVAL; + + /* Ensure the infile range is within the infile. */ + if (pos_in >= size_in) + return -EINVAL; + count = min(count, size_in - (uint64_t)pos_in); + + ret = generic_access_check_limits(file_in, pos_in, &count); + if (ret) + return ret; + + ret = generic_write_check_limits(file_out, pos_out, &count); + if (ret) + return ret; /* - * LFS rule + * If the user wanted us to link to the infile's EOF, round up to the + * next block boundary for this check. + * + * Otherwise, make sure the count is also block-aligned, having + * already confirmed the starting offsets' block alignment. */ - if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS && - !(file->f_flags & O_LARGEFILE))) { - if (pos >= MAX_NON_LFS) - return -EFBIG; - iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos); + if (pos_in + count == size_in) { + bcount = ALIGN(size_in, bs) - pos_in; + } else { + if (!IS_ALIGNED(count, bs)) + count = ALIGN_DOWN(count, bs); + bcount = count; } + /* Don't allow overlapped cloning within the same file. */ + if (inode_in == inode_out && + pos_out + bcount > pos_in && + pos_out < pos_in + bcount) + return -EINVAL; + /* - * Are we about to exceed the fs block limit ? - * - * If we have written data it becomes a short write. If we have - * exceeded without writing data we send a signal and return EFBIG. - * Linus frestrict idea will clean these up nicely.. + * We shortened the request but the caller can't deal with that, so + * bounce the request back to userspace. */ - if (unlikely(pos >= inode->i_sb->s_maxbytes)) - return -EFBIG; + if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return -EINVAL; - iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); - return iov_iter_count(from); + *req_count = count; + return 0; } -EXPORT_SYMBOL(generic_write_checks); int pagecache_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -3012,7 +3013,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_NOWAIT) { /* If there are pages to writeback, return */ if (filemap_range_has_page(inode->i_mapping, pos, - pos + iov_iter_count(from))) + pos + write_len)) return -EAGAIN; } else { written = filemap_write_and_wait_range(mapping, pos, diff --git a/mm/gup.c b/mm/gup.c index 1abc8b4afff6..f76e77a2d34b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -20,6 +20,11 @@ #include "internal.h" +struct follow_page_context { + struct dev_pagemap *pgmap; + unsigned int page_mask; +}; + static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) { @@ -71,10 +76,10 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) } static struct page *follow_page_pte(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, unsigned int flags) + unsigned long address, pmd_t *pmd, unsigned int flags, + struct dev_pagemap **pgmap) { struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap = NULL; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; @@ -116,8 +121,8 @@ retry: * Only return device mapping pages in the FOLL_GET case since * they are only valid while holding the pgmap reference. */ - pgmap = get_dev_pagemap(pte_pfn(pte), NULL); - if (pgmap) + *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); + if (*pgmap) page = pte_page(pte); else goto no_page; @@ -152,15 +157,8 @@ retry: goto retry; } - if (flags & FOLL_GET) { + if (flags & FOLL_GET) get_page(page); - - /* drop the pgmap reference now that we hold the page */ - if (pgmap) { - put_dev_pagemap(pgmap); - pgmap = NULL; - } - } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -210,7 +208,8 @@ no_page: static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, - unsigned int flags, unsigned int *page_mask) + unsigned int flags, + struct follow_page_context *ctx) { pmd_t *pmd, pmdval; spinlock_t *ptl; @@ -258,13 +257,13 @@ retry: } if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); - page = follow_devmap_pmd(vma, address, pmd, flags); + page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; } if (likely(!pmd_trans_huge(pmdval))) - return follow_page_pte(vma, address, pmd, flags); + return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) return no_page_table(vma, flags); @@ -284,7 +283,7 @@ retry_locked: } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); - return follow_page_pte(vma, address, pmd, flags); + return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } if (flags & FOLL_SPLIT) { int ret; @@ -307,18 +306,18 @@ retry_locked: } return ret ? ERR_PTR(ret) : - follow_page_pte(vma, address, pmd, flags); + follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; + ctx->page_mask = HPAGE_PMD_NR - 1; return page; } - static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned long address, p4d_t *p4dp, - unsigned int flags, unsigned int *page_mask) + unsigned int flags, + struct follow_page_context *ctx) { pud_t *pud; spinlock_t *ptl; @@ -344,7 +343,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, } if (pud_devmap(*pud)) { ptl = pud_lock(mm, pud); - page = follow_devmap_pud(vma, address, pud, flags); + page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; @@ -352,13 +351,13 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); - return follow_pmd_mask(vma, address, pud, flags, page_mask); + return follow_pmd_mask(vma, address, pud, flags, ctx); } - static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned long address, pgd_t *pgdp, - unsigned int flags, unsigned int *page_mask) + unsigned int flags, + struct follow_page_context *ctx) { p4d_t *p4d; struct page *page; @@ -378,7 +377,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } - return follow_pud_mask(vma, address, p4d, flags, page_mask); + return follow_pud_mask(vma, address, p4d, flags, ctx); } /** @@ -396,13 +395,13 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, */ struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, - unsigned int *page_mask) + struct follow_page_context *ctx) { pgd_t *pgd; struct page *page; struct mm_struct *mm = vma->vm_mm; - *page_mask = 0; + ctx->page_mask = 0; /* make this handle hugepd */ page = follow_huge_addr(mm, address, flags & FOLL_WRITE); @@ -431,7 +430,19 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return no_page_table(vma, flags); } - return follow_p4d_mask(vma, address, pgd, flags, page_mask); + return follow_p4d_mask(vma, address, pgd, flags, ctx); +} + +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int foll_flags) +{ + struct follow_page_context ctx = { NULL }; + struct page *page; + + page = follow_page_mask(vma, address, foll_flags, &ctx); + if (ctx.pgmap) + put_dev_pagemap(ctx.pgmap); + return page; } static int get_gate_page(struct mm_struct *mm, unsigned long address, @@ -659,9 +670,9 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) { - long i = 0; - unsigned int page_mask; + long ret = 0, i = 0; struct vm_area_struct *vma = NULL; + struct follow_page_context ctx = { NULL }; if (!nr_pages) return 0; @@ -691,12 +702,14 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pages ? &pages[i] : NULL); if (ret) return i ? : ret; - page_mask = 0; + ctx.page_mask = 0; goto next_page; } - if (!vma || check_vma_flags(vma, gup_flags)) - return i ? : -EFAULT; + if (!vma || check_vma_flags(vma, gup_flags)) { + ret = -EFAULT; + goto out; + } if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, @@ -709,23 +722,26 @@ retry: * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ - if (unlikely(fatal_signal_pending(current))) - return i ? i : -ERESTARTSYS; + if (unlikely(fatal_signal_pending(current))) { + ret = -ERESTARTSYS; + goto out; + } cond_resched(); - page = follow_page_mask(vma, start, foll_flags, &page_mask); + + page = follow_page_mask(vma, start, foll_flags, &ctx); if (!page) { - int ret; ret = faultin_page(tsk, vma, start, &foll_flags, nonblocking); switch (ret) { case 0: goto retry; + case -EBUSY: + ret = 0; + /* FALLTHRU */ case -EFAULT: case -ENOMEM: case -EHWPOISON: - return i ? i : ret; - case -EBUSY: - return i; + goto out; case -ENOENT: goto next_page; } @@ -737,27 +753,31 @@ retry: */ goto next_page; } else if (IS_ERR(page)) { - return i ? i : PTR_ERR(page); + ret = PTR_ERR(page); + goto out; } if (pages) { pages[i] = page; flush_anon_page(vma, page, start); flush_dcache_page(page); - page_mask = 0; + ctx.page_mask = 0; } next_page: if (vmas) { vmas[i] = vma; - page_mask = 0; + ctx.page_mask = 0; } - page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; } while (nr_pages); - return i; +out: + if (ctx.pgmap) + put_dev_pagemap(ctx.pgmap); + return i ? i : ret; } static bool vma_permits_fault(struct vm_area_struct *vma, @@ -1780,12 +1800,11 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write) int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - unsigned long addr, len, end; + unsigned long len, end; unsigned long flags; int nr = 0; start &= PAGE_MASK; - addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; @@ -1798,8 +1817,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * interrupts disabled by get_futex_key. * * With interrupts disabled, we block page table pages from being - * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h - * for more details. + * freed from under us. See struct mmu_table_batch comments in + * include/asm-generic/tlb.h for more details. * * We do not adopt an rcu_read_lock(.) here as we also want to * block IPIs that come from THPs splitting. @@ -1807,7 +1826,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, if (gup_fast_permitted(start, nr_pages, write)) { local_irq_save(flags); - gup_pgd_range(addr, end, write, pages, &nr); + gup_pgd_range(start, end, write, pages, &nr); local_irq_restore(flags); } diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 6a473709e9b6..5b42d3d4b60a 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -6,22 +6,30 @@ #include #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) +#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark) +#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark) struct gup_benchmark { - __u64 delta_usec; + __u64 get_delta_usec; + __u64 put_delta_usec; __u64 addr; __u64 size; __u32 nr_pages_per_call; __u32 flags; + __u64 expansion[10]; /* For future use */ }; static int __gup_benchmark_ioctl(unsigned int cmd, struct gup_benchmark *gup) { ktime_t start_time, end_time; - unsigned long i, nr, nr_pages, addr, next; + unsigned long i, nr_pages, addr, next; + int nr; struct page **pages; + if (gup->size > ULONG_MAX) + return -EINVAL; + nr_pages = gup->size / PAGE_SIZE; pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL); if (!pages) @@ -40,21 +48,40 @@ static int __gup_benchmark_ioctl(unsigned int cmd, nr = (next - addr) / PAGE_SIZE; } - nr = get_user_pages_fast(addr, nr, gup->flags & 1, pages + i); + switch (cmd) { + case GUP_FAST_BENCHMARK: + nr = get_user_pages_fast(addr, nr, gup->flags & 1, + pages + i); + break; + case GUP_LONGTERM_BENCHMARK: + nr = get_user_pages_longterm(addr, nr, gup->flags & 1, + pages + i, NULL); + break; + case GUP_BENCHMARK: + nr = get_user_pages(addr, nr, gup->flags & 1, pages + i, + NULL); + break; + default: + return -1; + } + if (nr <= 0) break; i += nr; } end_time = ktime_get(); - gup->delta_usec = ktime_us_delta(end_time, start_time); + gup->get_delta_usec = ktime_us_delta(end_time, start_time); gup->size = addr - gup->addr; + start_time = ktime_get(); for (i = 0; i < nr_pages; i++) { if (!pages[i]) break; put_page(pages[i]); } + end_time = ktime_get(); + gup->put_delta_usec = ktime_us_delta(end_time, start_time); kvfree(pages); return 0; @@ -66,8 +93,14 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, struct gup_benchmark gup; int ret; - if (cmd != GUP_FAST_BENCHMARK) + switch (cmd) { + case GUP_FAST_BENCHMARK: + case GUP_LONGTERM_BENCHMARK: + case GUP_BENCHMARK: + break; + default: return -EINVAL; + } if (copy_from_user(&gup, (void __user *)arg, sizeof(gup))) return -EFAULT; diff --git a/mm/hmm.c b/mm/hmm.c index c968e49f7a0c..90c34f3d1243 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -11,7 +11,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Authors: Jérôme Glisse + * Authors: Jérôme Glisse */ /* * Refer to include/linux/hmm.h for information about heterogeneous memory @@ -43,7 +43,6 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; * * @mm: mm struct this HMM struct is bound to * @lock: lock protecting ranges list - * @sequence: we track updates to the CPU page table with a sequence number * @ranges: list of range being snapshotted * @mirrors: list of mirrors for this mm * @mmu_notifier: mmu notifier to track updates to CPU page table @@ -52,7 +51,6 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; struct hmm { struct mm_struct *mm; spinlock_t lock; - atomic_t sequence; struct list_head ranges; struct list_head mirrors; struct mmu_notifier mmu_notifier; @@ -85,22 +83,11 @@ static struct hmm *hmm_register(struct mm_struct *mm) return NULL; INIT_LIST_HEAD(&hmm->mirrors); init_rwsem(&hmm->mirrors_sem); - atomic_set(&hmm->sequence, 0); hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(&hmm->ranges); spin_lock_init(&hmm->lock); hmm->mm = mm; - /* - * We should only get here if hold the mmap_sem in write mode ie on - * registration of first mirror through hmm_mirror_register() - */ - hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; - if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { - kfree(hmm); - return NULL; - } - spin_lock(&mm->page_table_lock); if (!mm->hmm) mm->hmm = hmm; @@ -108,12 +95,27 @@ static struct hmm *hmm_register(struct mm_struct *mm) cleanup = true; spin_unlock(&mm->page_table_lock); - if (cleanup) { - mmu_notifier_unregister(&hmm->mmu_notifier, mm); - kfree(hmm); - } + if (cleanup) + goto error; + + /* + * We should only get here if hold the mmap_sem in write mode ie on + * registration of first mirror through hmm_mirror_register() + */ + hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; + if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) + goto error_mm; return mm->hmm; + +error_mm: + spin_lock(&mm->page_table_lock); + if (mm->hmm == hmm) + mm->hmm = NULL; + spin_unlock(&mm->page_table_lock); +error: + kfree(hmm); + return NULL; } void hmm_mm_destroy(struct mm_struct *mm) @@ -121,10 +123,8 @@ void hmm_mm_destroy(struct mm_struct *mm) kfree(mm->hmm); } -static void hmm_invalidate_range(struct hmm *hmm, - enum hmm_update_type action, - unsigned long start, - unsigned long end) +static int hmm_invalidate_range(struct hmm *hmm, bool device, + const struct hmm_update *update) { struct hmm_mirror *mirror; struct hmm_range *range; @@ -133,22 +133,33 @@ static void hmm_invalidate_range(struct hmm *hmm, list_for_each_entry(range, &hmm->ranges, list) { unsigned long addr, idx, npages; - if (end < range->start || start >= range->end) + if (update->end < range->start || update->start >= range->end) continue; range->valid = false; - addr = max(start, range->start); + addr = max(update->start, range->start); idx = (addr - range->start) >> PAGE_SHIFT; - npages = (min(range->end, end) - addr) >> PAGE_SHIFT; + npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT; memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); } spin_unlock(&hmm->lock); + if (!device) + return 0; + down_read(&hmm->mirrors_sem); - list_for_each_entry(mirror, &hmm->mirrors, list) - mirror->ops->sync_cpu_device_pagetables(mirror, action, - start, end); + list_for_each_entry(mirror, &hmm->mirrors, list) { + int ret; + + ret = mirror->ops->sync_cpu_device_pagetables(mirror, update); + if (!update->blockable && ret == -EAGAIN) { + up_read(&hmm->mirrors_sem); + return -EAGAIN; + } + } up_read(&hmm->mirrors_sem); + + return 0; } static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) @@ -178,18 +189,21 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) } static int hmm_invalidate_range_start(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, - unsigned long end, - bool blockable) + struct mm_struct *mm, + unsigned long start, + unsigned long end, + bool blockable) { + struct hmm_update update; struct hmm *hmm = mm->hmm; VM_BUG_ON(!hmm); - atomic_inc(&hmm->sequence); - - return 0; + update.start = start; + update.end = end; + update.event = HMM_UPDATE_INVALIDATE; + update.blockable = blockable; + return hmm_invalidate_range(hmm, true, &update); } static void hmm_invalidate_range_end(struct mmu_notifier *mn, @@ -197,11 +211,16 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn, unsigned long start, unsigned long end) { + struct hmm_update update; struct hmm *hmm = mm->hmm; VM_BUG_ON(!hmm); - hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end); + update.start = start; + update.end = end; + update.event = HMM_UPDATE_INVALIDATE; + update.blockable = true; + hmm_invalidate_range(hmm, false, &update); } static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { @@ -278,12 +297,13 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror) if (!should_unregister || mm == NULL) return; + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); + spin_lock(&mm->page_table_lock); if (mm->hmm == hmm) mm->hmm = NULL; spin_unlock(&mm->page_table_lock); - mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); kfree(hmm); } EXPORT_SYMBOL(hmm_mirror_unregister); @@ -571,22 +591,42 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; uint64_t *pfns = range->pfns; unsigned long addr = start, i; pte_t *ptep; + pmd_t pmd; - i = (addr - range->start) >> PAGE_SHIFT; again: - if (pmd_none(*pmdp)) + pmd = READ_ONCE(*pmdp); + if (pmd_none(pmd)) return hmm_vma_walk_hole(start, end, walk); - if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB)) + if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB)) return hmm_pfns_bad(start, end, walk); - if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { - pmd_t pmd; + if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { + bool fault, write_fault; + unsigned long npages; + uint64_t *pfns; + + i = (addr - range->start) >> PAGE_SHIFT; + npages = (end - addr) >> PAGE_SHIFT; + pfns = &range->pfns[i]; + hmm_range_need_fault(hmm_vma_walk, pfns, npages, + 0, &fault, &write_fault); + if (fault || write_fault) { + hmm_vma_walk->last = addr; + pmd_migration_entry_wait(vma->vm_mm, pmdp); + return -EAGAIN; + } + return 0; + } else if (!pmd_present(pmd)) + return hmm_pfns_bad(start, end, walk); + + if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { /* * No need to take pmd_lock here, even if some other threads * is splitting the huge pmd we will get that event through @@ -601,13 +641,21 @@ again: if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; + i = (addr - range->start) >> PAGE_SHIFT; return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); } - if (pmd_bad(*pmdp)) + /* + * We have handled all the valid case above ie either none, migration, + * huge or transparent huge. At this point either it is a valid pmd + * entry pointing to pte directory or it is a bad pmd that will not + * recover. + */ + if (pmd_bad(pmd)) return hmm_pfns_bad(start, end, walk); ptep = pte_offset_map(pmdp, addr); + i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { int r; @@ -1024,7 +1072,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) resource_size_t key, align_start, align_size, align_end; struct device *device = devmem->device; int ret, nid, is_ram; - unsigned long pfn; align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); align_size = ALIGN(devmem->resource->start + @@ -1109,11 +1156,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) align_size >> PAGE_SHIFT, NULL); mem_hotplug_done(); - for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { - struct page *page = pfn_to_page(pfn); + /* + * Initialization of the pages has been deferred until now in order + * to allow us to do the work while not holding the hotplug lock. + */ + memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, &devmem->pagemap); - page->pgmap = &devmem->pagemap; - } return 0; error_add_memory: diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 533f9b00147d..55478ab3c83b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -629,21 +629,40 @@ release: * available * never: never stall for any thp allocation */ -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + gfp_t this_node = 0; + +#ifdef CONFIG_NUMA + struct mempolicy *pol; + /* + * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not + * specified, to express a general desire to stay on the current + * node for optimistic allocation attempts. If the defrag mode + * and/or madvise hint requires the direct reclaim then we prefer + * to fallback to other node rather than node reclaim because that + * can lead to excessive reclaim even though there is free memory + * on other nodes. We expect that NUMA preferences are specified + * by memory policies. + */ + pol = get_vma_policy(vma, addr); + if (pol->mode != MPOL_BIND) + this_node = __GFP_THISNODE; + mpol_cond_put(pol); +#endif if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node; if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM); + __GFP_KSWAPD_RECLAIM | this_node); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - 0); - return GFP_TRANSHUGE_LIGHT; + this_node); + return GFP_TRANSHUGE_LIGHT | this_node; } /* Caller must hold page table lock. */ @@ -715,8 +734,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); return ret; } - gfp = alloc_hugepage_direct_gfpmask(vma); - page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + gfp = alloc_hugepage_direct_gfpmask(vma, haddr); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -852,11 +871,10 @@ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, int flags) + pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pmd_pfn(*pmd); struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap; struct page *page; assert_spin_locked(pmd_lockptr(mm, pmd)); @@ -886,12 +904,11 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, return ERR_PTR(-EEXIST); pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; - pgmap = get_dev_pagemap(pfn, NULL); - if (!pgmap) + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); get_page(page); - put_dev_pagemap(pgmap); return page; } @@ -1000,11 +1017,10 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr, } struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, int flags) + pud_t *pud, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pud_pfn(*pud); struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap; struct page *page; assert_spin_locked(pud_lockptr(mm, pud)); @@ -1028,12 +1044,11 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, return ERR_PTR(-EEXIST); pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; - pgmap = get_dev_pagemap(pfn, NULL); - if (!pgmap) + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); get_page(page); - put_dev_pagemap(pgmap); return page; } @@ -1290,8 +1305,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_direct_gfpmask(vma); - new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); + huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr); + new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma, + haddr, numa_node_id()); } else new_page = NULL; @@ -1562,8 +1578,20 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) * We are not sure a pending tlb flush here is for a huge page * mapping or not. Hence use the tlb range variant */ - if (mm_tlb_flush_pending(vma->vm_mm)) + if (mm_tlb_flush_pending(vma->vm_mm)) { flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + /* + * change_huge_pmd() released the pmd lock before + * invalidating the secondary MMUs sharing the primary + * MMU pagetables (with ->invalidate_range()). The + * mmu_notifier_invalidate_range_end() (which + * internally calls ->invalidate_range()) in + * change_pmd_range() will run after us, so we can't + * rely on it here and we need an explicit invalidate. + */ + mmu_notifier_invalidate_range(vma->vm_mm, haddr, + haddr + HPAGE_PMD_SIZE); + } /* * Migrate the THP to the requested node, returns with page unlocked @@ -1780,7 +1808,7 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd) bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, - pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) + pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; @@ -1811,7 +1839,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); - if (pmd_present(pmd) && pmd_dirty(pmd)) + if (pmd_present(pmd)) force_flush = true; VM_BUG_ON(!pmd_none(*new_pmd)); @@ -1822,12 +1850,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, } pmd = move_soft_dirty_pmd(pmd); set_pmd_at(mm, new_addr, new_pmd, pmd); - if (new_ptl != old_ptl) - spin_unlock(new_ptl); if (force_flush) flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); - else - *need_flush = true; + if (new_ptl != old_ptl) + spin_unlock(new_ptl); spin_unlock(old_ptl); return true; } @@ -2371,6 +2397,7 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_mlocked) | (1L << PG_uptodate) | (1L << PG_active) | + (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | (1L << PG_dirty))); @@ -2443,13 +2470,13 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageCompound(head); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { - /* Additional pin to radix tree of swap cache */ + /* Additional pin to swap cache */ if (PageSwapCache(head)) page_ref_add(head, 2); else page_ref_inc(head); } else { - /* Additional pin to radix tree */ + /* Additional pin to page cache */ page_ref_add(head, 2); xa_unlock(&head->mapping->i_pages); } @@ -2561,7 +2588,7 @@ bool can_split_huge_page(struct page *page, int *pextra_pins) { int extra_pins; - /* Additional pins from radix tree */ + /* Additional pins from page cache */ if (PageAnon(page)) extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; else @@ -2657,17 +2684,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); if (mapping) { - void **pslot; + XA_STATE(xas, &mapping->i_pages, page_index(head)); - xa_lock(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->i_pages, - page_index(head)); /* - * Check if the head page is present in radix tree. + * Check if the head page is present in page cache. * We assume all tail are present too, if head is there. */ - if (radix_tree_deref_slot_protected(pslot, - &mapping->i_pages.xa_lock) != head) + xa_lock(&mapping->i_pages); + if (xas_load(&xas) != head) goto fail; } @@ -2885,9 +2909,6 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, if (!(pvmw->pmd && !pvmw->pte)) return; - mmu_notifier_invalidate_range_start(mm, address, - address + HPAGE_PMD_SIZE); - flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); pmdval = *pvmw->pmd; pmdp_invalidate(vma, address, pvmw->pmd); @@ -2900,9 +2921,6 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, set_pmd_at(mm, address, pvmw->pmd, pmdswp); page_remove_rmap(page, true); put_page(page); - - mmu_notifier_invalidate_range_end(mm, address, - address + HPAGE_PMD_SIZE); } void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) @@ -2931,7 +2949,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) else page_add_file_rmap(new, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); - if (vma->vm_flags & VM_LOCKED) + if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) mlock_vma_page(new); update_mmu_cache_pmd(vma, address, pvmw->pmd); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3c21775f196b..c007fb5fb8d5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include @@ -2100,9 +2100,9 @@ int __alloc_bootmem_huge_page(struct hstate *h) for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { void *addr; - addr = memblock_virt_alloc_try_nid_raw( + addr = memblock_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), - 0, BOOTMEM_ALLOC_ACCESSIBLE, node); + 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); if (addr) { /* * Use the beginning of the huge page to store the @@ -3326,8 +3326,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - const unsigned long mmun_start = start; /* For mmu_notifiers */ - const unsigned long mmun_end = end; /* For mmu_notifiers */ + unsigned long mmun_start = start; /* For mmu_notifiers */ + unsigned long mmun_end = end; /* For mmu_notifiers */ WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); @@ -3339,6 +3339,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, */ tlb_remove_check_page_size_change(tlb, sz); tlb_start_vma(tlb, vma); + + /* + * If sharing possible, alert mmu notifiers of worst case. + */ + adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; for (; address < end; address += sz) { @@ -3349,6 +3354,10 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) { spin_unlock(ptl); + /* + * We just unmapped a page of PMDs by clearing a PUD. + * The caller's TLB flush range should cover this area. + */ continue; } @@ -3431,12 +3440,23 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, { struct mm_struct *mm; struct mmu_gather tlb; + unsigned long tlb_start = start; + unsigned long tlb_end = end; + + /* + * If shared PMDs were possibly used within this vma range, adjust + * start/end for worst case tlb flushing. + * Note that we can not be sure if PMDs are shared until we try to + * unmap pages. However, we want to make sure TLB flushing covers + * the largest possible range. + */ + adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end); mm = vma->vm_mm; - tlb_gather_mmu(&tlb, mm, start, end); + tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end); __unmap_hugepage_range(&tlb, vma, start, end, ref_page); - tlb_finish_mmu(&tlb, start, end); + tlb_finish_mmu(&tlb, tlb_start, tlb_end); } /* @@ -3670,6 +3690,12 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, return err; ClearPagePrivate(page); + /* + * set page dirty so that it will not be removed from cache/file + * by non-hugetlbfs specific code paths. + */ + set_page_dirty(page); + spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); @@ -4298,11 +4324,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte_t pte; struct hstate *h = hstate_vma(vma); unsigned long pages = 0; + unsigned long f_start = start; + unsigned long f_end = end; + bool shared_pmd = false; + + /* + * In the case of shared PMDs, the area to flush could be beyond + * start/end. Set f_start/f_end to cover the maximum possible + * range if PMD sharing is possible. + */ + adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end); BUG_ON(address >= end); - flush_cache_range(vma, address, end); + flush_cache_range(vma, f_start, f_end); - mmu_notifier_invalidate_range_start(mm, start, end); + mmu_notifier_invalidate_range_start(mm, f_start, f_end); i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; @@ -4313,6 +4349,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, if (huge_pmd_unshare(mm, &address, ptep)) { pages++; spin_unlock(ptl); + shared_pmd = true; continue; } pte = huge_ptep_get(ptep); @@ -4348,9 +4385,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare * may have cleared our pud entry and done put_page on the page table: * once we release i_mmap_rwsem, another task can do the final put_page - * and that page table be reused and filled with junk. + * and that page table be reused and filled with junk. If we actually + * did unshare a page of pmds, flush the range corresponding to the pud. */ - flush_hugetlb_tlb_range(vma, start, end); + if (shared_pmd) + flush_hugetlb_tlb_range(vma, f_start, f_end); + else + flush_hugetlb_tlb_range(vma, start, end); /* * No need to call mmu_notifier_invalidate_range() we are downgrading * page table protection not changing it to point to a new page. @@ -4358,7 +4399,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * See Documentation/vm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); - mmu_notifier_invalidate_range_end(mm, start, end); + mmu_notifier_invalidate_range_end(mm, f_start, f_end); return pages << h->order; } @@ -4545,12 +4586,40 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) /* * check on proper vm_flags and page table alignment */ - if (vma->vm_flags & VM_MAYSHARE && - vma->vm_start <= base && end <= vma->vm_end) + if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) return true; return false; } +/* + * Determine if start,end range within vma could be mapped by shared pmd. + * If yes, adjust start and end to cover range associated with possible + * shared pmd mappings. + */ +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ + unsigned long check_addr = *start; + + if (!(vma->vm_flags & VM_MAYSHARE)) + return; + + for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) { + unsigned long a_start = check_addr & PUD_MASK; + unsigned long a_end = a_start + PUD_SIZE; + + /* + * If sharing is possible, adjust start/end if necessary. + */ + if (range_in_vma(vma, a_start, a_end)) { + if (a_start < *start) + *start = a_start; + if (a_end > *end) + *end = a_end; + } + } +} + /* * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the @@ -4648,6 +4717,11 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { return 0; } + +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ +} #define want_pmd_share() (0) #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ diff --git a/mm/internal.h b/mm/internal.h index 87256ae1bef8..291eb2b6d1d8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -161,7 +161,7 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, } extern int __isolate_free_page(struct page *page, unsigned int order); -extern void __free_pages_bootmem(struct page *page, unsigned long pfn, +extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index 7a2a2f13f86f..c7550eb65922 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -10,11 +10,10 @@ * */ -#include +#include #include #include #include -#include #include #include #include @@ -83,8 +82,8 @@ static inline bool kasan_zero_page_entry(pte_t pte) static __init void *early_alloc(size_t size, int node) { - return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, node); + return memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), + MEMBLOCK_ALLOC_ACCESSIBLE, node); } static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 3a8ddf8baf7d..b209dbaefde8 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -103,7 +103,7 @@ static int quarantine_head; static int quarantine_tail; /* Total size of all objects in global_quarantine across all batches. */ static unsigned long quarantine_size; -static DEFINE_SPINLOCK(quarantine_lock); +static DEFINE_RAW_SPINLOCK(quarantine_lock); DEFINE_STATIC_SRCU(remove_cache_srcu); /* Maximum size of the global queue. */ @@ -190,7 +190,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) { qlist_move_all(q, &temp); - spin_lock(&quarantine_lock); + raw_spin_lock(&quarantine_lock); WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); qlist_move_all(&temp, &global_quarantine[quarantine_tail]); if (global_quarantine[quarantine_tail].bytes >= @@ -203,7 +203,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) if (new_tail != quarantine_head) quarantine_tail = new_tail; } - spin_unlock(&quarantine_lock); + raw_spin_unlock(&quarantine_lock); } local_irq_restore(flags); @@ -230,7 +230,7 @@ void quarantine_reduce(void) * expected case). */ srcu_idx = srcu_read_lock(&remove_cache_srcu); - spin_lock_irqsave(&quarantine_lock, flags); + raw_spin_lock_irqsave(&quarantine_lock, flags); /* * Update quarantine size in case of hotplug. Allocate a fraction of @@ -254,7 +254,7 @@ void quarantine_reduce(void) quarantine_head = 0; } - spin_unlock_irqrestore(&quarantine_lock, flags); + raw_spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, NULL); srcu_read_unlock(&remove_cache_srcu, srcu_idx); @@ -310,17 +310,17 @@ void quarantine_remove_cache(struct kmem_cache *cache) */ on_each_cpu(per_cpu_remove_cache, cache, 1); - spin_lock_irqsave(&quarantine_lock, flags); + raw_spin_lock_irqsave(&quarantine_lock, flags); for (i = 0; i < QUARANTINE_BATCHES; i++) { if (qlist_empty(&global_quarantine[i])) continue; qlist_move_cache(&global_quarantine[i], &to_free, cache); /* Scanning whole quarantine can take a while. */ - spin_unlock_irqrestore(&quarantine_lock, flags); + raw_spin_unlock_irqrestore(&quarantine_lock, flags); cond_resched(); - spin_lock_irqsave(&quarantine_lock, flags); + raw_spin_lock_irqsave(&quarantine_lock, flags); } - spin_unlock_irqrestore(&quarantine_lock, flags); + raw_spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, cache); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a31d740e6cd1..c13625c1ad5e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1288,17 +1288,17 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * * Basic scheme is simple, details are more complex: * - allocate and freeze a new huge page; - * - scan over radix tree replacing old pages the new one + * - scan page cache replacing old pages with the new one * + swap in pages if necessary; * + fill in gaps; - * + keep old pages around in case if rollback is required; - * - if replacing succeed: + * + keep old pages around in case rollback is required; + * - if replacing succeeds: * + copy data over; * + free old pages; * + unfreeze huge page; * - if replacing failed; * + put all pages back and unfreeze them; - * + restore gaps in the radix-tree; + * + restore gaps in the page cache; * + free huge page; */ static void collapse_shmem(struct mm_struct *mm, @@ -1306,12 +1306,11 @@ static void collapse_shmem(struct mm_struct *mm, struct page **hpage, int node) { gfp_t gfp; - struct page *page, *new_page, *tmp; + struct page *new_page; struct mem_cgroup *memcg; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); - struct radix_tree_iter iter; - void **slot; + XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); @@ -1336,48 +1335,49 @@ static void collapse_shmem(struct mm_struct *mm, __SetPageLocked(new_page); BUG_ON(!page_ref_freeze(new_page, 1)); - /* - * At this point the new_page is 'frozen' (page_count() is zero), locked - * and not up-to-date. It's safe to insert it into radix tree, because - * nobody would be able to map it or use it in other way until we - * unfreeze it. + * At this point the new_page is 'frozen' (page_count() is zero), + * locked and not up-to-date. It's safe to insert it into the page + * cache, because nobody would be able to map it or use it in other + * way until we unfreeze it. */ - index = start; - xa_lock_irq(&mapping->i_pages); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - int n = min(iter.index, end) - index; - - /* - * Handle holes in the radix tree: charge it from shmem and - * insert relevant subpage of new_page into the radix-tree. - */ - if (n && !shmem_charge(mapping->host, n)) { - result = SCAN_FAIL; + /* This will be less messy when we use multi-index entries */ + do { + xas_lock_irq(&xas); + xas_create_range(&xas); + if (!xas_error(&xas)) break; - } - nr_none += n; - for (; index < min(iter.index, end); index++) { - radix_tree_insert(&mapping->i_pages, index, - new_page + (index % HPAGE_PMD_NR)); - } + xas_unlock_irq(&xas); + if (!xas_nomem(&xas, GFP_KERNEL)) + goto out; + } while (1); - /* We are done. */ - if (index >= end) - break; + xas_set(&xas, start); + for (index = start; index < end; index++) { + struct page *page = xas_next(&xas); + + VM_BUG_ON(index != xas.xa_index); + if (!page) { + if (!shmem_charge(mapping->host, 1)) { + result = SCAN_FAIL; + break; + } + xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + nr_none++; + continue; + } - page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) { - xa_unlock_irq(&mapping->i_pages); + if (xa_is_value(page) || !PageUptodate(page)) { + xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, SGP_NOHUGE)) { result = SCAN_FAIL; - goto tree_unlocked; + goto xa_unlocked; } - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); + xas_set(&xas, index); } else if (trylock_page(page)) { get_page(page); } else { @@ -1397,7 +1397,7 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_TRUNCATED; goto out_unlock; } - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); if (isolate_lru_page(page)) { result = SCAN_DEL_PAGE_LRU; @@ -1407,17 +1407,16 @@ static void collapse_shmem(struct mm_struct *mm, if (page_mapped(page)) unmap_mapping_pages(mapping, index, 1, false); - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); + xas_set(&xas, index); - slot = radix_tree_lookup_slot(&mapping->i_pages, index); - VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock), page); + VM_BUG_ON_PAGE(page != xas_load(&xas), page); VM_BUG_ON_PAGE(page_mapped(page), page); /* * The page is expected to have page_count() == 3: * - we hold a pin on it; - * - one reference from radix tree; + * - one reference from page cache; * - one from isolate_lru_page; */ if (!page_ref_freeze(page, 3)) { @@ -1432,56 +1431,30 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - radix_tree_replace_slot(&mapping->i_pages, slot, - new_page + (index % HPAGE_PMD_NR)); - - slot = radix_tree_iter_resume(slot, &iter); - index++; + xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); continue; out_lru: - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); putback_lru_page(page); out_isolate_failed: unlock_page(page); put_page(page); - goto tree_unlocked; + goto xa_unlocked; out_unlock: unlock_page(page); put_page(page); break; } + xas_unlock_irq(&xas); - /* - * Handle hole in radix tree at the end of the range. - * This code only triggers if there's nothing in radix tree - * beyond 'end'. - */ - if (result == SCAN_SUCCEED && index < end) { - int n = end - index; - - if (!shmem_charge(mapping->host, n)) { - result = SCAN_FAIL; - goto tree_locked; - } - - for (; index < end; index++) { - radix_tree_insert(&mapping->i_pages, index, - new_page + (index % HPAGE_PMD_NR)); - } - nr_none += n; - } - -tree_locked: - xa_unlock_irq(&mapping->i_pages); -tree_unlocked: - +xa_unlocked: if (result == SCAN_SUCCEED) { - unsigned long flags; + struct page *page, *tmp; struct zone *zone = page_zone(new_page); /* - * Replacing old pages with new one has succeed, now we need to - * copy the content and free old pages. + * Replacing old pages with new one has succeeded, now we + * need to copy the content and free the old pages. */ list_for_each_entry_safe(page, tmp, &pagelist, lru) { copy_highpage(new_page + (page->index % HPAGE_PMD_NR), @@ -1495,16 +1468,16 @@ tree_unlocked: put_page(page); } - local_irq_save(flags); + local_irq_disable(); __inc_node_page_state(new_page, NR_SHMEM_THPS); if (nr_none) { __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); } - local_irq_restore(flags); + local_irq_enable(); /* - * Remove pte page tables, so we can re-faulti + * Remove pte page tables, so we can re-fault * the page as huge. */ retract_page_tables(mapping, start); @@ -1521,37 +1494,37 @@ tree_unlocked: khugepaged_pages_collapsed++; } else { - /* Something went wrong: rollback changes to the radix-tree */ + struct page *page; + /* Something went wrong: roll back page cache changes */ shmem_uncharge(mapping->host, nr_none); - xa_lock_irq(&mapping->i_pages); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - if (iter.index >= end) - break; + xas_lock_irq(&xas); + xas_set(&xas, start); + xas_for_each(&xas, page, end - 1) { page = list_first_entry_or_null(&pagelist, struct page, lru); - if (!page || iter.index < page->index) { + if (!page || xas.xa_index < page->index) { if (!nr_none) break; nr_none--; /* Put holes back where they were */ - radix_tree_delete(&mapping->i_pages, iter.index); + xas_store(&xas, NULL); continue; } - VM_BUG_ON_PAGE(page->index != iter.index, page); + VM_BUG_ON_PAGE(page->index != xas.xa_index, page); /* Unfreeze the page. */ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(&mapping->i_pages, slot, page); - slot = radix_tree_iter_resume(slot, &iter); - xa_unlock_irq(&mapping->i_pages); + xas_store(&xas, page); + xas_pause(&xas); + xas_unlock_irq(&xas); putback_lru_page(page); unlock_page(page); - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); } VM_BUG_ON(nr_none); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); /* Unfreeze new_page, caller would take care about freeing it */ page_ref_unfreeze(new_page, 1); @@ -1569,8 +1542,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, pgoff_t start, struct page **hpage) { struct page *page = NULL; - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, start); int present, swap; int node = NUMA_NO_NODE; int result = SCAN_SUCCEED; @@ -1579,17 +1551,11 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, swap = 0; memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - if (iter.index >= start + HPAGE_PMD_NR) - break; - - page = radix_tree_deref_slot(slot); - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); + xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { + if (xas_retry(&xas, page)) continue; - } - if (radix_tree_exception(page)) { + if (xa_is_value(page)) { if (++swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; break; @@ -1628,7 +1594,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, present++; if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); cond_resched_rcu(); } } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 17dd883198ae..877de4fa0720 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -86,12 +86,13 @@ #include #include #include +#include #include #include #include #include #include -#include +#include #include #include #include @@ -181,6 +182,7 @@ struct kmemleak_object { /* flag set to not scan the object */ #define OBJECT_NO_SCAN (1 << 2) +#define HEX_PREFIX " " /* number of bytes to print per line; must be 16 or 32 */ #define HEX_ROW_SIZE 16 /* number of bytes to print at a time (1, 2, 4, 8) */ @@ -235,6 +237,9 @@ static int kmemleak_skip_disable; /* If there are leaks that can be reported */ static bool kmemleak_found_leaks; +static bool kmemleak_verbose; +module_param_named(verbose, kmemleak_verbose, bool, 0600); + /* * Early object allocation/freeing logging. Kmemleak is initialized after the * kernel allocator. However, both the kernel allocator and kmemleak may @@ -299,6 +304,25 @@ static void kmemleak_disable(void); kmemleak_disable(); \ } while (0) +#define warn_or_seq_printf(seq, fmt, ...) do { \ + if (seq) \ + seq_printf(seq, fmt, ##__VA_ARGS__); \ + else \ + pr_warn(fmt, ##__VA_ARGS__); \ +} while (0) + +static void warn_or_seq_hex_dump(struct seq_file *seq, int prefix_type, + int rowsize, int groupsize, const void *buf, + size_t len, bool ascii) +{ + if (seq) + seq_hex_dump(seq, HEX_PREFIX, prefix_type, rowsize, groupsize, + buf, len, ascii); + else + print_hex_dump(KERN_WARNING, pr_fmt(HEX_PREFIX), prefix_type, + rowsize, groupsize, buf, len, ascii); +} + /* * Printing of the objects hex dump to the seq file. The number of lines to be * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The @@ -314,10 +338,10 @@ static void hex_dump_object(struct seq_file *seq, /* limit the number of lines to HEX_MAX_LINES */ len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE); - seq_printf(seq, " hex dump (first %zu bytes):\n", len); + warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len); kasan_disable_current(); - seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE, - HEX_GROUP_SIZE, ptr, len, HEX_ASCII); + warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE, + HEX_GROUP_SIZE, ptr, len, HEX_ASCII); kasan_enable_current(); } @@ -365,17 +389,17 @@ static void print_unreferenced(struct seq_file *seq, int i; unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); - seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", + warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", object->pointer, object->size); - seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", + warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", object->comm, object->pid, object->jiffies, msecs_age / 1000, msecs_age % 1000); hex_dump_object(seq, object); - seq_printf(seq, " backtrace:\n"); + warn_or_seq_printf(seq, " backtrace:\n"); for (i = 0; i < object->trace_len; i++) { void *ptr = (void *)object->trace[i]; - seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); + warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); } } @@ -1598,6 +1622,10 @@ static void kmemleak_scan(void) if (unreferenced_object(object) && !(object->flags & OBJECT_REPORTED)) { object->flags |= OBJECT_REPORTED; + + if (kmemleak_verbose) + print_unreferenced(NULL, object); + new_leaks++; } spin_unlock_irqrestore(&object->lock, flags); diff --git a/mm/maccess.c b/mm/maccess.c index ec00be51a24f..f3416632e5a4 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -30,8 +30,10 @@ long __probe_kernel_read(void *dst, const void *src, size_t size) set_fs(KERNEL_DS); pagefault_disable(); + current->kernel_uaccess_faults_ok++; ret = __copy_from_user_inatomic(dst, (__force const void __user *)src, size); + current->kernel_uaccess_faults_ok--; pagefault_enable(); set_fs(old_fs); @@ -58,7 +60,9 @@ long __probe_kernel_write(void *dst, const void *src, size_t size) set_fs(KERNEL_DS); pagefault_disable(); + current->kernel_uaccess_faults_ok++; ret = __copy_to_user_inatomic((__force void __user *)dst, src, size); + current->kernel_uaccess_faults_ok--; pagefault_enable(); set_fs(old_fs); @@ -94,11 +98,13 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) set_fs(KERNEL_DS); pagefault_disable(); + current->kernel_uaccess_faults_ok++; do { ret = __get_user(*dst++, (const char __user __force *)src++); } while (dst[-1] && ret == 0 && src - unsafe_addr < count); + current->kernel_uaccess_faults_ok--; dst[-1] = '\0'; pagefault_enable(); set_fs(old_fs); diff --git a/mm/madvise.c b/mm/madvise.c index 972a9eaa898b..6cb1ca93e290 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -96,7 +96,7 @@ static long madvise_behavior(struct vm_area_struct *vma, new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: - if (new_flags & VM_SPECIAL) { + if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) { error = -EINVAL; goto out; } @@ -251,7 +251,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; page = find_get_entry(mapping, index); - if (!radix_tree_exceptional_entry(page)) { + if (!xa_is_value(page)) { if (page) put_page(page); continue; diff --git a/mm/memblock.c b/mm/memblock.c index 237944479d25..7df468c8ebc8 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -82,6 +81,16 @@ * initialization compltes. */ +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data; +EXPORT_SYMBOL(contig_page_data); +#endif + +unsigned long max_low_pfn; +unsigned long min_low_pfn; +unsigned long max_pfn; +unsigned long long max_possible_pfn; + static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -1238,8 +1247,11 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, { phys_addr_t found; - if (!align) + if (!align) { + /* Can't use WARNs this early in boot on powerpc */ + dump_stack(); align = SMP_CACHE_BYTES; + } found = memblock_find_in_range_node(size, align, start, end, nid, flags); @@ -1269,7 +1281,7 @@ phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags); } -phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) +phys_addr_t __init memblock_phys_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { enum memblock_flags flags = choose_memblock_flags(); phys_addr_t ret; @@ -1304,23 +1316,22 @@ phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys return alloc; } -phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) +phys_addr_t __init memblock_phys_alloc(phys_addr_t size, phys_addr_t align) { return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } -phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) +phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) { - phys_addr_t res = memblock_alloc_nid(size, align, nid); + phys_addr_t res = memblock_phys_alloc_nid(size, align, nid); if (res) return res; return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } -#if defined(CONFIG_NO_BOOTMEM) /** - * memblock_virt_alloc_internal - allocate boot memory block + * memblock_alloc_internal - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region to allocate (phys address) @@ -1333,9 +1344,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i * hold the requested memory. * * The allocation is performed from memory region limited by - * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. - * - * The memory block is aligned on %SMP_CACHE_BYTES if @align == 0. + * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE. * * The phys address of allocated boot memory block is converted to virtual and * allocated memory is reset to 0. @@ -1346,7 +1355,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i * Return: * Virtual address of allocated memory block on success, NULL on failure. */ -static void * __init memblock_virt_alloc_internal( +static void * __init memblock_alloc_internal( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1361,13 +1370,15 @@ static void * __init memblock_virt_alloc_internal( /* * Detect any accidental use of these APIs after slab is ready, as at * this moment memblock may be deinitialized already and its - * internal data may be destroyed (after execution of free_all_bootmem) + * internal data may be destroyed (after execution of memblock_free_all) */ if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, nid); - if (!align) + if (!align) { + dump_stack(); align = SMP_CACHE_BYTES; + } if (max_addr > memblock.current_limit) max_addr = memblock.current_limit; @@ -1413,14 +1424,14 @@ done: } /** - * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing + * memblock_alloc_try_nid_raw - allocate boot memory block without zeroing * memory and without panicking * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation * is preferred (phys address) * @max_addr: the upper bound of the memory region from where the allocation - * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @@ -1431,7 +1442,7 @@ done: * Return: * Virtual address of allocated memory block on success, NULL on failure. */ -void * __init memblock_virt_alloc_try_nid_raw( +void * __init memblock_alloc_try_nid_raw( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1442,23 +1453,22 @@ void * __init memblock_virt_alloc_try_nid_raw( __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_virt_alloc_internal(size, align, + ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid); -#ifdef CONFIG_DEBUG_VM if (ptr && size > 0) - memset(ptr, PAGE_POISON_PATTERN, size); -#endif + page_init_poison(ptr, size); + return ptr; } /** - * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block + * memblock_alloc_try_nid_nopanic - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation * is preferred (phys address) * @max_addr: the upper bound of the memory region from where the allocation - * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @@ -1468,7 +1478,7 @@ void * __init memblock_virt_alloc_try_nid_raw( * Return: * Virtual address of allocated memory block on success, NULL on failure. */ -void * __init memblock_virt_alloc_try_nid_nopanic( +void * __init memblock_alloc_try_nid_nopanic( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1479,7 +1489,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic( __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_virt_alloc_internal(size, align, + ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid); if (ptr) memset(ptr, 0, size); @@ -1487,24 +1497,24 @@ void * __init memblock_virt_alloc_try_nid_nopanic( } /** - * memblock_virt_alloc_try_nid - allocate boot memory block with panicking + * memblock_alloc_try_nid - allocate boot memory block with panicking * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation * is preferred (phys address) * @max_addr: the upper bound of the memory region from where the allocation - * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * Public panicking version of memblock_virt_alloc_try_nid_nopanic() + * Public panicking version of memblock_alloc_try_nid_nopanic() * which provides debug information (including caller info), if enabled, * and panics if the request can not be satisfied. * * Return: * Virtual address of allocated memory block on success, NULL on failure. */ -void * __init memblock_virt_alloc_try_nid( +void * __init memblock_alloc_try_nid( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1514,7 +1524,7 @@ void * __init memblock_virt_alloc_try_nid( memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_virt_alloc_internal(size, align, + ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid); if (ptr) { memset(ptr, 0, size); @@ -1525,14 +1535,13 @@ void * __init memblock_virt_alloc_try_nid( __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr); return NULL; } -#endif /** * __memblock_free_early - free boot memory block * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * - * Free boot memory block previously allocated by memblock_virt_alloc_xx() API. + * Free boot memory block previously allocated by memblock_alloc_xx() API. * The freeing memory will not be released to the buddy allocator. */ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) @@ -1566,7 +1575,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) end = PFN_DOWN(base + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); + memblock_free_pages(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } @@ -1880,6 +1889,100 @@ static int __init early_memblock(char *p) } early_param("memblock", early_memblock); +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int order; + + while (start < end) { + order = min(MAX_ORDER - 1UL, __ffs(start)); + + while (start + (1UL << order) > end) + order--; + + memblock_free_pages(pfn_to_page(start), start, order); + + start += (1UL << order); + } +} + +static unsigned long __init __free_memory_core(phys_addr_t start, + phys_addr_t end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = min_t(unsigned long, + PFN_DOWN(end), max_low_pfn); + + if (start_pfn >= end_pfn) + return 0; + + __free_pages_memory(start_pfn, end_pfn); + + return end_pfn - start_pfn; +} + +static unsigned long __init free_low_memory_core_early(void) +{ + unsigned long count = 0; + phys_addr_t start, end; + u64 i; + + memblock_clear_hotplug(0, -1); + + for_each_reserved_mem_region(i, &start, &end) + reserve_bootmem_region(start, end); + + /* + * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id + * because in some case like Node0 doesn't have RAM installed + * low ram will be on Node1 + */ + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, + NULL) + count += __free_memory_core(start, end); + + return count; +} + +static int reset_managed_pages_done __initdata; + +void reset_node_managed_pages(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + z->managed_pages = 0; +} + +void __init reset_all_zones_managed_pages(void) +{ + struct pglist_data *pgdat; + + if (reset_managed_pages_done) + return; + + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + + reset_managed_pages_done = 1; +} + +/** + * memblock_free_all - release free pages to the buddy allocator + * + * Return: the number of pages actually released. + */ +unsigned long __init memblock_free_all(void) +{ + unsigned long pages; + + reset_all_zones_managed_pages(); + + pages = free_low_memory_core_early(); + totalram_pages += pages; + + return pages; +} + #if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK) static int memblock_debug_show(struct seq_file *m, void *private) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e79cb59552d9..6e1469b80cb7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1669,6 +1669,8 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int if (order > PAGE_ALLOC_COSTLY_ORDER) return OOM_SKIPPED; + memcg_memory_event(memcg, MEMCG_OOM); + /* * We are in the middle of the charge context here, so we * don't want to block when potentially sitting on a callstack @@ -2250,8 +2252,6 @@ retry: if (fatal_signal_pending(current)) goto force; - memcg_memory_event(mem_over_limit, MEMCG_OOM); - /* * keep retrying as long as the memcg oom killer is able to make * a forward progress or bypass the charge if the oom killer @@ -2460,7 +2460,7 @@ static void memcg_kmem_cache_create_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. */ -static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, +static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, struct kmem_cache *cachep) { struct memcg_kmem_cache_create_work *cw; @@ -2478,25 +2478,6 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, queue_work(memcg_kmem_cache_wq, &cw->work); } -static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, - struct kmem_cache *cachep) -{ - /* - * We need to stop accounting when we kmalloc, because if the - * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_schedule_kmem_cache_create will recurse. - * - * However, it is better to enclose the whole function. Depending on - * the debugging options enabled, INIT_WORK(), for instance, can - * trigger an allocation. This too, will make us recurse. Because at - * this point we can't allow ourselves back into memcg_kmem_get_cache, - * the safest choice is to do it like this, wrapping the whole function. - */ - current->memcg_kmem_skip_account = 1; - __memcg_schedule_kmem_cache_create(memcg, cachep); - current->memcg_kmem_skip_account = 0; -} - static inline bool memcg_kmem_bypass(void) { if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) @@ -2531,9 +2512,6 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) if (memcg_kmem_bypass()) return cachep; - if (current->memcg_kmem_skip_account) - return cachep; - memcg = get_mem_cgroup_from_current(); kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) @@ -2615,7 +2593,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) struct mem_cgroup *memcg; int ret = 0; - if (memcg_kmem_bypass()) + if (mem_cgroup_disabled() || memcg_kmem_bypass()) return 0; memcg = get_mem_cgroup_from_current(); @@ -4321,14 +4299,12 @@ static void mem_cgroup_id_remove(struct mem_cgroup *memcg) static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { - VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); - atomic_add(n, &memcg->id.ref); + refcount_add(n, &memcg->id.ref); } static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { - VM_BUG_ON(atomic_read(&memcg->id.ref) < n); - if (atomic_sub_and_test(n, &memcg->id.ref)) { + if (refcount_sub_and_test(n, &memcg->id.ref)) { mem_cgroup_id_remove(memcg); /* Memcg ID pins CSS */ @@ -4545,7 +4521,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) } /* Online state pins memcg ID, memcg ID pins CSS */ - atomic_set(&memcg->id.ref, 1); + refcount_set(&memcg->id.ref, 1); css_get(css); return 0; } @@ -4573,6 +4549,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); wb_memcg_offline(memcg); + drain_all_stock(memcg); + mem_cgroup_id_put(memcg); } @@ -4750,7 +4728,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, /* shmem/tmpfs may report page out on swap: account for that too. */ if (shmem_mapping(mapping)) { page = find_get_entry(mapping, pgoff); - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); if (do_memsw_account()) *entry = swp; @@ -5595,6 +5573,13 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); + seq_printf(m, "workingset_refault %lu\n", + acc.stat[WORKINGSET_REFAULT]); + seq_printf(m, "workingset_activate %lu\n", + acc.stat[WORKINGSET_ACTIVATE]); + seq_printf(m, "workingset_nodereclaim %lu\n", + acc.stat[WORKINGSET_NODERECLAIM]); + seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + acc.events[PGSCAN_DIRECT]); @@ -5605,13 +5590,6 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); - seq_printf(m, "workingset_refault %lu\n", - acc.stat[WORKINGSET_REFAULT]); - seq_printf(m, "workingset_activate %lu\n", - acc.stat[WORKINGSET_ACTIVATE]); - seq_printf(m, "workingset_nodereclaim %lu\n", - acc.stat[WORKINGSET_NODERECLAIM]); - return 0; } @@ -6377,7 +6355,7 @@ subsys_initcall(mem_cgroup_init); #ifdef CONFIG_MEMCG_SWAP static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) { - while (!atomic_inc_not_zero(&memcg->id.ref)) { + while (!refcount_inc_not_zero(&memcg->id.ref)) { /* * The root cgroup cannot be destroyed, so it's refcount must * always be >= 1. diff --git a/mm/memfd.c b/mm/memfd.c index 2bb5e257080e..97264c79d2cd 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -21,44 +21,36 @@ #include /* - * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, + * We need a tag: a new tag would expand every xa_node by 8 bytes, * so reuse a tag which we firmly believe is never set or cleared on tmpfs * or hugetlbfs because they are memory only filesystems. */ #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE #define LAST_SCAN 4 /* about 150ms max */ -static void memfd_tag_pins(struct address_space *mapping) +static void memfd_tag_pins(struct xa_state *xas) { - struct radix_tree_iter iter; - void __rcu **slot; - pgoff_t start; struct page *page; + unsigned int tagged = 0; lru_add_drain(); - start = 0; - rcu_read_lock(); - - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - page = radix_tree_deref_slot(slot); - if (!page || radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - } else if (page_count(page) - page_mapcount(page) > 1) { - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_set(&mapping->i_pages, iter.index, - MEMFD_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); - } - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } + xas_lock_irq(xas); + xas_for_each(xas, page, ULONG_MAX) { + if (xa_is_value(page)) + continue; + if (page_count(page) - page_mapcount(page) > 1) + xas_set_mark(xas, MEMFD_TAG_PINNED); + + if (++tagged % XA_CHECK_SCHED) + continue; + + xas_pause(xas); + xas_unlock_irq(xas); + cond_resched(); + xas_lock_irq(xas); } - rcu_read_unlock(); + xas_unlock_irq(xas); } /* @@ -72,17 +64,17 @@ static void memfd_tag_pins(struct address_space *mapping) */ static int memfd_wait_for_pins(struct address_space *mapping) { - struct radix_tree_iter iter; - void __rcu **slot; - pgoff_t start; + XA_STATE(xas, &mapping->i_pages, 0); struct page *page; int error, scan; - memfd_tag_pins(mapping); + memfd_tag_pins(&xas); error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) + unsigned int tagged = 0; + + if (!xas_marked(&xas, MEMFD_TAG_PINNED)) break; if (!scan) @@ -90,45 +82,34 @@ static int memfd_wait_for_pins(struct address_space *mapping) else if (schedule_timeout_killable((HZ << scan) / 200)) scan = LAST_SCAN; - start = 0; - rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, - start, MEMFD_TAG_PINNED) { - - page = radix_tree_deref_slot(slot); - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - page = NULL; - } - - if (page && - page_count(page) - page_mapcount(page) != 1) { - if (scan < LAST_SCAN) - goto continue_resched; - + xas_set(&xas, 0); + xas_lock_irq(&xas); + xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) { + bool clear = true; + if (xa_is_value(page)) + continue; + if (page_count(page) - page_mapcount(page) != 1) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still * found pages pinned. */ - error = -EBUSY; + if (scan == LAST_SCAN) + error = -EBUSY; + else + clear = false; } + if (clear) + xas_clear_mark(&xas, MEMFD_TAG_PINNED); + if (++tagged % XA_CHECK_SCHED) + continue; - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_clear(&mapping->i_pages, - iter.index, MEMFD_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); -continue_resched: - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); } - rcu_read_unlock(); + xas_unlock_irq(&xas); } return error; diff --git a/mm/memory.c b/mm/memory.c index c467102a5cbc..4ad2d293ddc2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -186,253 +186,6 @@ static void check_sync_rss_stat(struct task_struct *task) #endif /* SPLIT_RSS_COUNTING */ -#ifdef HAVE_GENERIC_MMU_GATHER - -static bool tlb_next_batch(struct mmu_gather *tlb) -{ - struct mmu_gather_batch *batch; - - batch = tlb->active; - if (batch->next) { - tlb->active = batch->next; - return true; - } - - if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) - return false; - - batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); - if (!batch) - return false; - - tlb->batch_count++; - batch->next = NULL; - batch->nr = 0; - batch->max = MAX_GATHER_BATCH; - - tlb->active->next = batch; - tlb->active = batch; - - return true; -} - -void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - - /* Is it from 0 to ~0? */ - tlb->fullmm = !(start | (end+1)); - tlb->need_flush_all = 0; - tlb->local.next = NULL; - tlb->local.nr = 0; - tlb->local.max = ARRAY_SIZE(tlb->__pages); - tlb->active = &tlb->local; - tlb->batch_count = 0; - -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - tlb->batch = NULL; -#endif - tlb->page_size = 0; - - __tlb_reset_range(tlb); -} - -static void tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - struct mmu_gather_batch *batch; - -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - tlb_table_flush(tlb); -#endif - for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { - free_pages_and_swap_cache(batch->pages, batch->nr); - batch->nr = 0; - } - tlb->active = &tlb->local; -} - -void tlb_flush_mmu(struct mmu_gather *tlb) -{ - tlb_flush_mmu_tlbonly(tlb); - tlb_flush_mmu_free(tlb); -} - -/* tlb_finish_mmu - * Called at the end of the shootdown operation to free up any resources - * that were required. - */ -void arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - struct mmu_gather_batch *batch, *next; - - if (force) - __tlb_adjust_range(tlb, start, end - start); - - tlb_flush_mmu(tlb); - - /* keep the page table cache within bounds */ - check_pgt_cache(); - - for (batch = tlb->local.next; batch; batch = next) { - next = batch->next; - free_pages((unsigned long)batch, 0); - } - tlb->local.next = NULL; -} - -/* __tlb_remove_page - * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while - * handling the additional races in SMP caused by other CPUs caching valid - * mappings in their TLBs. Returns the number of free page slots left. - * When out of page slots we must call tlb_flush_mmu(). - *returns true if the caller should flush. - */ -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) -{ - struct mmu_gather_batch *batch; - - VM_BUG_ON(!tlb->end); - VM_WARN_ON(tlb->page_size != page_size); - - batch = tlb->active; - /* - * Add the page and check if we are full. If so - * force a flush. - */ - batch->pages[batch->nr++] = page; - if (batch->nr == batch->max) { - if (!tlb_next_batch(tlb)) - return true; - batch = tlb->active; - } - VM_BUG_ON_PAGE(batch->nr > batch->max, page); - - return false; -} - -#endif /* HAVE_GENERIC_MMU_GATHER */ - -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - -/* - * See the comment near struct mmu_table_batch. - */ - -/* - * If we want tlb_remove_table() to imply TLB invalidates. - */ -static inline void tlb_table_invalidate(struct mmu_gather *tlb) -{ -#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE - /* - * Invalidate page-table caches used by hardware walkers. Then we still - * need to RCU-sched wait while freeing the pages because software - * walkers can still be in-flight. - */ - tlb_flush_mmu_tlbonly(tlb); -#endif -} - -static void tlb_remove_table_smp_sync(void *arg) -{ - /* Simply deliver the interrupt */ -} - -static void tlb_remove_table_one(void *table) -{ - /* - * This isn't an RCU grace period and hence the page-tables cannot be - * assumed to be actually RCU-freed. - * - * It is however sufficient for software page-table walkers that rely on - * IRQ disabling. See the comment near struct mmu_table_batch. - */ - smp_call_function(tlb_remove_table_smp_sync, NULL, 1); - __tlb_remove_table(table); -} - -static void tlb_remove_table_rcu(struct rcu_head *head) -{ - struct mmu_table_batch *batch; - int i; - - batch = container_of(head, struct mmu_table_batch, rcu); - - for (i = 0; i < batch->nr; i++) - __tlb_remove_table(batch->tables[i]); - - free_page((unsigned long)batch); -} - -void tlb_table_flush(struct mmu_gather *tlb) -{ - struct mmu_table_batch **batch = &tlb->batch; - - if (*batch) { - tlb_table_invalidate(tlb); - call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); - *batch = NULL; - } -} - -void tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct mmu_table_batch **batch = &tlb->batch; - - if (*batch == NULL) { - *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); - if (*batch == NULL) { - tlb_table_invalidate(tlb); - tlb_remove_table_one(table); - return; - } - (*batch)->nr = 0; - } - - (*batch)->tables[(*batch)->nr++] = table; - if ((*batch)->nr == MAX_TABLE_BATCH) - tlb_table_flush(tlb); -} - -#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ - -/** - * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down - * @tlb: the mmu_gather structure to initialize - * @mm: the mm_struct of the target address space - * @start: start of the region that will be removed from the page-table - * @end: end of the region that will be removed from the page-table - * - * Called to initialize an (on-stack) mmu_gather structure for page-table - * tear-down from @mm. The @start and @end are set to 0 and -1 - * respectively when @mm is without users and we're going to destroy - * the full address space (exit/execve). - */ -void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - arch_tlb_gather_mmu(tlb, mm, start, end); - inc_tlb_flush_pending(tlb->mm); -} - -void tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end) -{ - /* - * If there are parallel threads are doing PTE changes on same range - * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB - * flush by batching, a thread has stable TLB entry can fail to flush - * the TLB by observing pte_none|!pte_dirty, for example so flush TLB - * forcefully if we detect parallel PTE batching threads. - */ - bool force = mm_tlb_flush_nested(tlb->mm); - - arch_tlb_finish_mmu(tlb, start, end, force); - dec_tlb_flush_pending(tlb->mm); -} - /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. @@ -1767,19 +1520,16 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); -static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, +static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; - int retval; pte_t *pte, entry; spinlock_t *ptl; - retval = -ENOMEM; pte = get_locked_pte(mm, addr, &ptl); if (!pte) - goto out; - retval = -EBUSY; + return VM_FAULT_OOM; if (!pte_none(*pte)) { if (mkwrite) { /* @@ -1787,10 +1537,15 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, * in may not match the PFN we have mapped if the * mapped PFN is a writeable COW page. In the mkwrite * case we are creating a writable PTE for a shared - * mapping and we expect the PFNs to match. + * mapping and we expect the PFNs to match. If they + * don't match, we are likely racing with block + * allocation and mapping invalidation so just skip the + * update. */ - if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn))) + if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); goto out_unlock; + } entry = *pte; goto out_mkwrite; } else @@ -1812,56 +1567,32 @@ out_mkwrite: set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ - retval = 0; out_unlock: pte_unmap_unlock(pte, ptl); -out: - return retval; -} - -/** - * vm_insert_pfn - insert single pfn into user vma - * @vma: user vma to map to - * @addr: target user address of this page - * @pfn: source kernel pfn - * - * Similar to vm_insert_page, this allows drivers to insert individual pages - * they've allocated into a user vma. Same comments apply. - * - * This function should only be called from a vm_ops->fault handler, and - * in that case the handler should return NULL. - * - * vma cannot be a COW mapping. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - */ -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn) -{ - return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); + return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL(vm_insert_pfn); /** - * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot + * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * @pgprot: pgprot flags for the inserted page * - * This is exactly like vm_insert_pfn, except that it allows drivers to + * This is exactly like vmf_insert_pfn(), except that it allows drivers to * to override pgprot on a per-page basis. * * This only makes sense for IO mappings, and it makes no sense for - * cow mappings. In general, using multiple vmas is preferable; - * vm_insert_pfn_prot should only be used if using multiple VMAs is + * COW mappings. In general, using multiple vmas is preferable; + * vmf_insert_pfn_prot should only be used if using multiple VMAs is * impractical. + * + * Context: Process context. May allocate using %GFP_KERNEL. + * Return: vm_fault_t value. */ -int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { - int ret; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1875,19 +1606,44 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) - return -EFAULT; + return VM_FAULT_SIGBUS; if (!pfn_modify_allowed(pfn, pgprot)) - return -EACCES; + return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); - ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, + return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, false); +} +EXPORT_SYMBOL(vmf_insert_pfn_prot); - return ret; +/** + * vmf_insert_pfn - insert single pfn into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * + * Similar to vm_insert_page, this allows drivers to insert individual pages + * they've allocated into a user vma. Same comments apply. + * + * This function should only be called from a vm_ops->fault handler, and + * in that case the handler should return the result of this function. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * Context: Process context. May allocate using %GFP_KERNEL. + * Return: vm_fault_t value. + */ +vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); } -EXPORT_SYMBOL(vm_insert_pfn_prot); +EXPORT_SYMBOL(vmf_insert_pfn); static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) { @@ -1903,20 +1659,21 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) return false; } -static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, bool mkwrite) +static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; + int err; BUG_ON(!vm_mixed_ok(vma, pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) - return -EFAULT; + return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, pfn); if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) - return -EACCES; + return VM_FAULT_SIGBUS; /* * If we don't have pte special, then we have to use the pfn_valid() @@ -1935,36 +1692,35 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * result in pfn_t_has_page() == false. */ page = pfn_to_page(pfn_t_to_pfn(pfn)); - return insert_page(vma, addr, page, pgprot); + err = insert_page(vma, addr, page, pgprot); + } else { + return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } - return insert_pfn(vma, addr, pfn, pgprot, mkwrite); + + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; } -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) { return __vm_insert_mixed(vma, addr, pfn, false); - } -EXPORT_SYMBOL(vm_insert_mixed); +EXPORT_SYMBOL(vmf_insert_mixed); /* * If the insertion of PTE failed because someone else already added a * different entry in the mean time, we treat that as success as we assume * the same entry was actually inserted. */ - vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { - int err; - - err = __vm_insert_mixed(vma, addr, pfn, true); - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err < 0 && err != -EBUSY) - return VM_FAULT_SIGBUS; - return VM_FAULT_NOPAGE; + return __vm_insert_mixed(vma, addr, pfn, true); } EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); @@ -3745,10 +3501,36 @@ static vm_fault_t do_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; - /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ - if (!vma->vm_ops->fault) - ret = VM_FAULT_SIGBUS; - else if (!(vmf->flags & FAULT_FLAG_WRITE)) + /* + * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND + */ + if (!vma->vm_ops->fault) { + /* + * If we find a migration pmd entry or a none pmd entry, which + * should never happen, return SIGBUS + */ + if (unlikely(!pmd_present(*vmf->pmd))) + ret = VM_FAULT_SIGBUS; + else { + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, + vmf->pmd, + vmf->address, + &vmf->ptl); + /* + * Make sure this is not a temporary clearing of pte + * by holding ptl and checking again. A R/M/W update + * of pte involves: take ptl, clearing the pte so that + * we don't have concurrent modification by hardware + * followed by an update. + */ + if (unlikely(pte_none(*vmf->pte))) + ret = VM_FAULT_SIGBUS; + else + ret = VM_FAULT_NOPAGE; + + pte_unmap_unlock(vmf->pte, vmf->ptl); + } + } else if (!(vmf->flags & FAULT_FLAG_WRITE)) ret = do_read_fault(vmf); else if (!(vma->vm_flags & VM_SHARED)) ret = do_cow_fault(vmf); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 38d94b703e9d..2b2b3ccbbfb5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include @@ -587,6 +586,7 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; + cond_resched(); ret = __remove_section(zone, __pfn_to_section(pfn), map_offset, altmap); map_offset = 0; @@ -687,62 +687,19 @@ static void node_states_check_changes_online(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) { int nid = zone_to_nid(zone); - enum zone_type zone_last = ZONE_NORMAL; - /* - * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_NORMAL, - * set zone_last to ZONE_NORMAL. - * - * If we don't have HIGHMEM nor movable node, - * node_states[N_NORMAL_MEMORY] contains nodes which have zones of - * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. - */ - if (N_MEMORY == N_NORMAL_MEMORY) - zone_last = ZONE_MOVABLE; + arg->status_change_nid = -1; + arg->status_change_nid_normal = -1; + arg->status_change_nid_high = -1; - /* - * if the memory to be online is in a zone of 0...zone_last, and - * the zones of 0...zone_last don't have memory before online, we will - * need to set the node to node_states[N_NORMAL_MEMORY] after - * the memory is online. - */ - if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) + if (!node_state(nid, N_MEMORY)) + arg->status_change_nid = nid; + if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) arg->status_change_nid_normal = nid; - else - arg->status_change_nid_normal = -1; - #ifdef CONFIG_HIGHMEM - /* - * If we have movable node, node_states[N_HIGH_MEMORY] - * contains nodes which have zones of 0...ZONE_HIGHMEM, - * set zone_last to ZONE_HIGHMEM. - * - * If we don't have movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_MOVABLE, - * set zone_last to ZONE_MOVABLE. - */ - zone_last = ZONE_HIGHMEM; - if (N_MEMORY == N_HIGH_MEMORY) - zone_last = ZONE_MOVABLE; - - if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) + if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY)) arg->status_change_nid_high = nid; - else - arg->status_change_nid_high = -1; -#else - arg->status_change_nid_high = arg->status_change_nid_normal; #endif - - /* - * if the node don't have memory befor online, we will need to - * set the node to node_states[N_MEMORY] after the memory - * is online. - */ - if (!node_state(nid, N_MEMORY)) - arg->status_change_nid = nid; - else - arg->status_change_nid = -1; } static void node_states_set_node(int node, struct memory_notify *arg) @@ -753,7 +710,8 @@ static void node_states_set_node(int node, struct memory_notify *arg) if (arg->status_change_nid_high >= 0) node_set_state(node, N_HIGH_MEMORY); - node_set_state(node, N_MEMORY); + if (arg->status_change_nid >= 0) + node_set_state(node, N_MEMORY); } static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, @@ -881,7 +839,6 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid, return zone; } -/* Must be protected by mem_hotplug_begin() or a device_lock */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long flags; @@ -893,6 +850,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ struct memory_notify arg; struct memory_block *mem; + mem_hotplug_begin(); + /* * We can't use pfn_to_nid() because nid might be stored in struct page * which is not yet initialized. Instead, we find nid from memory block. @@ -957,6 +916,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) memory_notify(MEM_ONLINE, &arg); + mem_hotplug_done(); return 0; failed_addition: @@ -964,6 +924,7 @@ failed_addition: (unsigned long long) pfn << PAGE_SHIFT, (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); + mem_hotplug_done(); return ret; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ @@ -1111,7 +1072,12 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } -/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ +/* + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations (triggered e.g. by sysfs). + * + * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG + */ int __ref add_memory_resource(int nid, struct resource *res, bool online) { u64 start, size; @@ -1163,26 +1129,26 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); + /* device_online() will take the lock when calling online_pages() */ + mem_hotplug_done(); + /* online pages if requested */ if (online) walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, online_memory_block); - goto out; - + return ret; error: /* rollback pgdat allocation and others */ if (new_node) rollback_node_hotadd(nid); memblock_remove(start, size); - -out: mem_hotplug_done(); return ret; } -EXPORT_SYMBOL_GPL(add_memory_resource); -int __ref add_memory(int nid, u64 start, u64 size) +/* requires device_hotplug_lock, see add_memory_resource() */ +int __ref __add_memory(int nid, u64 start, u64 size) { struct resource *res; int ret; @@ -1196,6 +1162,17 @@ int __ref add_memory(int nid, u64 start, u64 size) release_memory_resource(res); return ret; } + +int add_memory(int nid, u64 start, u64 size) +{ + int rc; + + lock_device_hotplug(); + rc = __add_memory(nid, start, size); + unlock_device_hotplug(); + + return rc; +} EXPORT_SYMBOL_GPL(add_memory); #ifdef CONFIG_MEMORY_HOTREMOVE @@ -1505,75 +1482,53 @@ static void node_states_check_changes_offline(unsigned long nr_pages, { struct pglist_data *pgdat = zone->zone_pgdat; unsigned long present_pages = 0; - enum zone_type zt, zone_last = ZONE_NORMAL; + enum zone_type zt; - /* - * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_NORMAL, - * set zone_last to ZONE_NORMAL. - * - * If we don't have HIGHMEM nor movable node, - * node_states[N_NORMAL_MEMORY] contains nodes which have zones of - * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. - */ - if (N_MEMORY == N_NORMAL_MEMORY) - zone_last = ZONE_MOVABLE; + arg->status_change_nid = -1; + arg->status_change_nid_normal = -1; + arg->status_change_nid_high = -1; /* - * check whether node_states[N_NORMAL_MEMORY] will be changed. - * If the memory to be offline is in a zone of 0...zone_last, - * and it is the last present memory, 0...zone_last will - * become empty after offline , thus we can determind we will - * need to clear the node from node_states[N_NORMAL_MEMORY]. + * Check whether node_states[N_NORMAL_MEMORY] will be changed. + * If the memory to be offline is within the range + * [0..ZONE_NORMAL], and it is the last present memory there, + * the zones in that range will become empty after the offlining, + * thus we can determine that we need to clear the node from + * node_states[N_NORMAL_MEMORY]. */ - for (zt = 0; zt <= zone_last; zt++) + for (zt = 0; zt <= ZONE_NORMAL; zt++) present_pages += pgdat->node_zones[zt].present_pages; - if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) arg->status_change_nid_normal = zone_to_nid(zone); - else - arg->status_change_nid_normal = -1; #ifdef CONFIG_HIGHMEM /* - * If we have movable node, node_states[N_HIGH_MEMORY] - * contains nodes which have zones of 0...ZONE_HIGHMEM, - * set zone_last to ZONE_HIGHMEM. - * - * If we don't have movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_MOVABLE, - * set zone_last to ZONE_MOVABLE. + * node_states[N_HIGH_MEMORY] contains nodes which + * have normal memory or high memory. + * Here we add the present_pages belonging to ZONE_HIGHMEM. + * If the zone is within the range of [0..ZONE_HIGHMEM), and + * we determine that the zones in that range become empty, + * we need to clear the node for N_HIGH_MEMORY. */ - zone_last = ZONE_HIGHMEM; - if (N_MEMORY == N_HIGH_MEMORY) - zone_last = ZONE_MOVABLE; - - for (; zt <= zone_last; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages; + if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages) arg->status_change_nid_high = zone_to_nid(zone); - else - arg->status_change_nid_high = -1; -#else - arg->status_change_nid_high = arg->status_change_nid_normal; #endif /* - * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE + * We have accounted the pages from [0..ZONE_NORMAL), and + * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM + * as well. + * Here we count the possible pages from ZONE_MOVABLE. + * If after having accounted all the pages, we see that the nr_pages + * to be offlined is over or equal to the accounted pages, + * we know that the node will become empty, and so, we can clear + * it for N_MEMORY as well. */ - zone_last = ZONE_MOVABLE; + present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages; - /* - * check whether node_states[N_HIGH_MEMORY] will be changed - * If we try to offline the last present @nr_pages from the node, - * we can determind we will need to clear the node from - * node_states[N_HIGH_MEMORY]. - */ - for (; zt <= zone_last; zt++) - present_pages += pgdat->node_zones[zt].present_pages; if (nr_pages >= present_pages) arg->status_change_nid = zone_to_nid(zone); - else - arg->status_change_nid = -1; } static void node_states_clear_node(int node, struct memory_notify *arg) @@ -1581,12 +1536,10 @@ static void node_states_clear_node(int node, struct memory_notify *arg) if (arg->status_change_nid_normal >= 0) node_clear_state(node, N_NORMAL_MEMORY); - if ((N_MEMORY != N_NORMAL_MEMORY) && - (arg->status_change_nid_high >= 0)) + if (arg->status_change_nid_high >= 0) node_clear_state(node, N_HIGH_MEMORY); - if ((N_MEMORY != N_HIGH_MEMORY) && - (arg->status_change_nid >= 0)) + if (arg->status_change_nid >= 0) node_clear_state(node, N_MEMORY); } @@ -1606,10 +1559,16 @@ static int __ref __offline_pages(unsigned long start_pfn, return -EINVAL; if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) return -EINVAL; + + mem_hotplug_begin(); + /* This makes hotplug much easier...and readable. we assume this for now. .*/ - if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) + if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, + &valid_end)) { + mem_hotplug_done(); return -EINVAL; + } zone = page_zone(pfn_to_page(valid_start)); node = zone_to_nid(zone); @@ -1618,8 +1577,10 @@ static int __ref __offline_pages(unsigned long start_pfn, /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, true); - if (ret) + if (ret) { + mem_hotplug_done(); return ret; + } arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; @@ -1690,6 +1651,7 @@ repeat: writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); + mem_hotplug_done(); return 0; failed_removal: @@ -1699,10 +1661,10 @@ failed_removal: memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + mem_hotplug_done(); return ret; } -/* Must be protected by mem_hotplug_begin() or a device_lock */ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { return __offline_pages(start_pfn, start_pfn + nr_pages); @@ -1873,7 +1835,7 @@ EXPORT_SYMBOL(try_offline_node); * and online/offline operations before this call, as required by * try_offline_node(). */ -void __ref remove_memory(int nid, u64 start, u64 size) +void __ref __remove_memory(int nid, u64 start, u64 size) { int ret; @@ -1902,5 +1864,12 @@ void __ref remove_memory(int nid, u64 start, u64 size) mem_hotplug_done(); } + +void remove_memory(int nid, u64 start, u64 size) +{ + lock_device_hotplug(); + __remove_memory(nid, start, size); + unlock_device_hotplug(); +} EXPORT_SYMBOL_GPL(remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index da858f794eb6..5837a067124d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -797,16 +797,19 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) } } -static int lookup_node(unsigned long addr) +static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p; int err; - err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL); + int locked = 1; + err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked); if (err >= 0) { err = page_to_nid(p); put_page(p); } + if (locked) + up_read(&mm->mmap_sem); return err; } @@ -817,7 +820,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; - struct mempolicy *pol = current->mempolicy; + struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) @@ -857,7 +860,16 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { - err = lookup_node(addr); + /* + * Take a refcount on the mpol, lookup_node() + * wil drop the mmap_sem, so after calling + * lookup_node() only "pol" remains valid, "vma" + * is stale. + */ + pol_refcount = pol; + vma = NULL; + mpol_get(pol); + err = lookup_node(mm, addr); if (err < 0) goto out; *policy = err; @@ -892,7 +904,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, out: mpol_cond_put(pol); if (vma) - up_read(¤t->mm->mmap_sem); + up_read(&mm->mmap_sem); + if (pol_refcount) + mpol_put(pol_refcount); return err; } @@ -1102,8 +1116,8 @@ static struct page *new_page(struct page *page, unsigned long start) } else if (PageTransHuge(page)) { struct page *thp; - thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, - HPAGE_PMD_ORDER); + thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, + address, numa_node_id()); if (!thp) return NULL; prep_transhuge_page(thp); @@ -1648,7 +1662,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ -static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = __get_vma_policy(vma, addr); @@ -1997,7 +2011,6 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @node: Which node to prefer for allocation (modulo policy). - * @hugepage: for hugepages try only the preferred node if possible * * This function allocates a page from the kernel page pool and applies * a NUMA policy associated with the VMA or the current process. @@ -2008,7 +2021,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node, bool hugepage) + unsigned long addr, int node) { struct mempolicy *pol; struct page *page; @@ -2026,32 +2039,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, goto out; } - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { - int hpage_node = node; - - /* - * For hugepage allocation and non-interleave policy which - * allows the current node (or other explicitly preferred - * node) we only try to allocate from the current/preferred - * node and don't fall back to other nodes, as the cost of - * remote accesses would likely offset THP benefits. - * - * If the policy is interleave, or does not allow the current - * node in its nodemask, we allocate the standard way. - */ - if (pol->mode == MPOL_PREFERRED && - !(pol->flags & MPOL_F_LOCAL)) - hpage_node = pol->v.preferred_node; - - nmask = policy_nodemask(gfp, pol); - if (!nmask || node_isset(hpage_node, *nmask)) { - mpol_cond_put(pol); - page = __alloc_pages_node(hpage_node, - gfp | __GFP_THISNODE, order); - goto out; - } - } - nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); @@ -2697,12 +2684,11 @@ static const char * const policy_modes[] = int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; - unsigned short mode; unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); - int err = 1; + int err = 1, mode; if (nodelist) { /* NUL-terminate mode or flags string */ @@ -2717,12 +2703,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) if (flags) *flags++ = '\0'; /* terminate mode string */ - for (mode = 0; mode < MPOL_MAX; mode++) { - if (!strcmp(str, policy_modes[mode])) { - break; - } - } - if (mode >= MPOL_MAX) + mode = match_string(policy_modes, MPOL_MAX, str); + if (mode < 0) goto out; switch (mode) { diff --git a/mm/migrate.c b/mm/migrate.c index d6a2e89b086a..f7e4bfdc13b7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -275,6 +275,9 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); + if (PageTransHuge(page) && PageMlocked(page)) + clear_page_mlock(page); + /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); } @@ -323,7 +326,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, page = migration_entry_to_page(entry); /* - * Once radix-tree replacement of page migration started, page_count + * Once page cache replacement of page migration started, page_count * *must* be zero. And, we don't want to call wait_on_page_locked() * against a page without get_page(). * So, we use get_page_unless_zero(), here. Even failed, page fault @@ -438,10 +441,10 @@ int migrate_page_move_mapping(struct address_space *mapping, struct buffer_head *head, enum migrate_mode mode, int extra_count) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); struct zone *oldzone, *newzone; int dirty; int expected_count = 1 + extra_count; - void **pslot; /* * Device public or private pages have an extra refcount as they are @@ -467,21 +470,16 @@ int migrate_page_move_mapping(struct address_space *mapping, oldzone = page_zone(page); newzone = page_zone(newpage); - xa_lock_irq(&mapping->i_pages); - - pslot = radix_tree_lookup_slot(&mapping->i_pages, - page_index(page)); + xas_lock_irq(&xas); expected_count += hpage_nr_pages(page) + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, - &mapping->i_pages.xa_lock) != page) { - xa_unlock_irq(&mapping->i_pages); + if (page_count(page) != expected_count || xas_load(&xas) != page) { + xas_unlock_irq(&xas); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@ -495,7 +493,7 @@ int migrate_page_move_mapping(struct address_space *mapping, if (mode == MIGRATE_ASYNC && head && !buffer_migrate_lock_buffers(head, mode)) { page_ref_unfreeze(page, expected_count); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@ -523,16 +521,13 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + xas_store(&xas, newpage); if (PageTransHuge(page)) { int i; - int index = page_index(page); for (i = 1; i < HPAGE_PMD_NR; i++) { - pslot = radix_tree_lookup_slot(&mapping->i_pages, - index + i); - radix_tree_replace_slot(&mapping->i_pages, pslot, - newpage + i); + xas_next(&xas); + xas_store(&xas, newpage + i); } } @@ -543,7 +538,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); - xa_unlock(&mapping->i_pages); + xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ /* @@ -583,22 +578,18 @@ EXPORT_SYMBOL(migrate_page_move_mapping); int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); int expected_count; - void **pslot; - - xa_lock_irq(&mapping->i_pages); - - pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); + xas_lock_irq(&xas); expected_count = 2 + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { - xa_unlock_irq(&mapping->i_pages); + if (page_count(page) != expected_count || xas_load(&xas) != page) { + xas_unlock_irq(&xas); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@ -607,11 +598,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + xas_store(&xas, newpage); page_ref_unfreeze(page, expected_count - 1); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return MIGRATEPAGE_SUCCESS; } @@ -682,6 +673,8 @@ void migrate_page_states(struct page *newpage, struct page *page) SetPageActive(newpage); } else if (TestClearPageUnevictable(page)) SetPageUnevictable(newpage); + if (PageWorkingset(page)) + SetPageWorkingset(newpage); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) @@ -1411,7 +1404,7 @@ retry: * we encounter them after the rest of the list * is processed. */ - if (PageTransHuge(page)) { + if (PageTransHuge(page) && !PageHuge(page)) { lock_page(page); rc = split_huge_page_to_list(page, from); unlock_page(page); @@ -1855,46 +1848,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page, return newpage; } -/* - * page migration rate limiting control. - * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs - * window of time. Default here says do not migrate more than 1280M per second. - */ -static unsigned int migrate_interval_millisecs __read_mostly = 100; -static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); - -/* Returns true if the node is migrate rate-limited after the update */ -static bool numamigrate_update_ratelimit(pg_data_t *pgdat, - unsigned long nr_pages) -{ - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { - spin_lock(&pgdat->numabalancing_migrate_lock); - pgdat->numabalancing_migrate_nr_pages = 0; - pgdat->numabalancing_migrate_next_window = jiffies + - msecs_to_jiffies(migrate_interval_millisecs); - spin_unlock(&pgdat->numabalancing_migrate_lock); - } - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { - trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, - nr_pages); - return true; - } - - /* - * This is an unlocked non-atomic update so errors are possible. - * The consequences are failing to migrate when we potentiall should - * have which is not severe enough to warrant locking. If it is ever - * a problem, it can be converted to a per-cpu counter. - */ - pgdat->numabalancing_migrate_nr_pages += nr_pages; - return false; -} - static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; @@ -1967,14 +1920,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, if (page_is_file_cache(page) && PageDirty(page)) goto out; - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (numamigrate_update_ratelimit(pgdat, 1)) - goto out; - isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) goto out; @@ -2018,16 +1963,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, int isolated = 0; struct page *new_page = NULL; int page_lru = page_is_file_cache(page); - unsigned long mmun_start = address & HPAGE_PMD_MASK; - unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; - - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) - goto out_dropref; + unsigned long start = address & HPAGE_PMD_MASK; new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), @@ -2050,15 +1986,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* anon mapping, we can simply copy page->mapping to the new page: */ new_page->mapping = page->mapping; new_page->index = page->index; + /* flush the cache before copying using the kernel virtual address */ + flush_cache_range(vma, start, start + HPAGE_PMD_SIZE); migrate_page_copy(new_page, page); WARN_ON(PageLRU(new_page)); /* Recheck the target PMD */ - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); /* Reverse changes made by migrate_page_copy() */ if (TestClearPageActive(new_page)) @@ -2082,16 +2018,26 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); /* - * Clear the old entry under pagetable lock and establish the new PTE. - * Any parallel GUP will either observe the old page blocking on the - * page lock, block on the page table lock or observe the new page. - * The SetPageUptodate on the new page and page_add_new_anon_rmap - * guarantee the copy is visible before the pagetable update. + * Overwrite the old entry under pagetable lock and establish + * the new PTE. Any parallel GUP will either observe the old + * page blocking on the page lock, block on the page table + * lock or observe the new page. The SetPageUptodate on the + * new page and page_add_new_anon_rmap guarantee the copy is + * visible before the pagetable update. + */ + page_add_anon_rmap(new_page, vma, start, true); + /* + * At this point the pmd is numa/protnone (i.e. non present) and the TLB + * has already been flushed globally. So no TLB can be currently + * caching this non present pmd mapping. There's no need to clear the + * pmd before doing set_pmd_at(), nor to flush the TLB after + * set_pmd_at(). Clearing the pmd here would introduce a race + * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the + * mmap_sem for reading. If the pmd is set to NULL at any given time, + * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this + * pmd. */ - flush_cache_range(vma, mmun_start, mmun_end); - page_add_anon_rmap(new_page, vma, mmun_start, true); - pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); - set_pmd_at(mm, mmun_start, pmd, entry); + set_pmd_at(mm, start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); page_ref_unfreeze(page, 2); @@ -2100,11 +2046,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); /* Take an "isolate" reference and put new page on the LRU. */ get_page(new_page); @@ -2125,11 +2066,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, out_fail: count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); -out_dropref: ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, entry)) { entry = pmd_modify(entry, vma->vm_page_prot); - set_pmd_at(mm, mmun_start, pmd, entry); + set_pmd_at(mm, start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); } spin_unlock(ptl); diff --git a/mm/mincore.c b/mm/mincore.c index fc37afe226e6..4985965aa20a 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -66,7 +66,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * shmem/tmpfs may return swap: account for swapcache * page too. */ - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); page = find_get_page(swap_address_space(swp), swp_offset(swp)); diff --git a/mm/mmap.c b/mm/mmap.c index 5f2b2b184c60..6c04292e16a7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -191,16 +191,19 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long retval; - unsigned long newbrk, oldbrk; + unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; struct vm_area_struct *next; unsigned long min_brk; bool populate; + bool downgraded = false; LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; + origbrk = mm->brk; + #ifdef CONFIG_COMPAT_BRK /* * CONFIG_COMPAT_BRK can still be overridden by setting @@ -229,14 +232,32 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); - if (oldbrk == newbrk) - goto set_brk; + if (oldbrk == newbrk) { + mm->brk = brk; + goto success; + } - /* Always allow shrinking brk. */ + /* + * Always allow shrinking brk. + * __do_munmap() may downgrade mmap_sem to read. + */ if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf)) - goto set_brk; - goto out; + int ret; + + /* + * mm->brk must to be protected by write mmap_sem so update it + * before downgrading mmap_sem. When __do_munmap() fails, + * mm->brk will be restored from origbrk. + */ + mm->brk = brk; + ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true); + if (ret < 0) { + mm->brk = origbrk; + goto out; + } else if (ret == 1) { + downgraded = true; + } + goto success; } /* Check against existing mmap mappings. */ @@ -247,18 +268,21 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* Ok, looks good - let it rip. */ if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) goto out; - -set_brk: mm->brk = brk; + +success: populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; - up_write(&mm->mmap_sem); + if (downgraded) + up_read(&mm->mmap_sem); + else + up_write(&mm->mmap_sem); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; out: - retval = mm->brk; + retval = origbrk; up_write(&mm->mmap_sem); return retval; } @@ -1410,7 +1434,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (flags & MAP_FIXED_NOREPLACE) { struct vm_area_struct *vma = find_vma(mm, addr); - if (vma && vma->vm_start <= addr) + if (vma && vma->vm_start < addr + len) return -EEXIST; } @@ -2687,8 +2711,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * work. This now handles partial unmappings. * Jeremy Fitzhardinge */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, - struct list_head *uf) +int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf, bool downgrade) { unsigned long end; struct vm_area_struct *vma, *prev, *last; @@ -2770,25 +2794,38 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, mm->locked_vm -= vma_pages(tmp); munlock_vma_pages_all(tmp); } + tmp = tmp->vm_next; } } - /* - * Remove the vma's, and unmap the actual pages - */ + /* Detach vmas from rbtree */ detach_vmas_to_be_unmapped(mm, vma, prev, end); - unmap_region(mm, vma, prev, start, end); + /* + * mpx unmap needs to be called with mmap_sem held for write. + * It is safe to call it before unmap_region(). + */ arch_unmap(mm, vma, start, end); + if (downgrade) + downgrade_write(&mm->mmap_sem); + + unmap_region(mm, vma, prev, start, end); + /* Fix up all other VM information */ remove_vma_list(mm, vma); - return 0; + return downgrade ? 1 : 0; } -int vm_munmap(unsigned long start, size_t len) +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf) +{ + return __do_munmap(mm, start, len, uf, false); +} + +static int __vm_munmap(unsigned long start, size_t len, bool downgrade) { int ret; struct mm_struct *mm = current->mm; @@ -2797,17 +2834,32 @@ int vm_munmap(unsigned long start, size_t len) if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_munmap(mm, start, len, &uf); - up_write(&mm->mmap_sem); + ret = __do_munmap(mm, start, len, &uf, downgrade); + /* + * Returning 1 indicates mmap_sem is downgraded. + * But 1 is not legal return value of vm_munmap() and munmap(), reset + * it to 0 before return. + */ + if (ret == 1) { + up_read(&mm->mmap_sem); + ret = 0; + } else + up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); return ret; } + +int vm_munmap(unsigned long start, size_t len) +{ + return __vm_munmap(start, len, false); +} EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { profile_munmap(addr); - return vm_munmap(addr, len); + return __vm_munmap(addr, len, true); } diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c new file mode 100644 index 000000000000..2a9fbc4a37d5 --- /dev/null +++ b/mm/mmu_gather.c @@ -0,0 +1,261 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef HAVE_GENERIC_MMU_GATHER + +static bool tlb_next_batch(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch; + + batch = tlb->active; + if (batch->next) { + tlb->active = batch->next; + return true; + } + + if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) + return false; + + batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); + if (!batch) + return false; + + tlb->batch_count++; + batch->next = NULL; + batch->nr = 0; + batch->max = MAX_GATHER_BATCH; + + tlb->active->next = batch; + tlb->active = batch; + + return true; +} + +void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + tlb->mm = mm; + + /* Is it from 0 to ~0? */ + tlb->fullmm = !(start | (end+1)); + tlb->need_flush_all = 0; + tlb->local.next = NULL; + tlb->local.nr = 0; + tlb->local.max = ARRAY_SIZE(tlb->__pages); + tlb->active = &tlb->local; + tlb->batch_count = 0; + +#ifdef CONFIG_HAVE_RCU_TABLE_FREE + tlb->batch = NULL; +#endif + tlb->page_size = 0; + + __tlb_reset_range(tlb); +} + +void tlb_flush_mmu_free(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch; + +#ifdef CONFIG_HAVE_RCU_TABLE_FREE + tlb_table_flush(tlb); +#endif + for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { + free_pages_and_swap_cache(batch->pages, batch->nr); + batch->nr = 0; + } + tlb->active = &tlb->local; +} + +void tlb_flush_mmu(struct mmu_gather *tlb) +{ + tlb_flush_mmu_tlbonly(tlb); + tlb_flush_mmu_free(tlb); +} + +/* tlb_finish_mmu + * Called at the end of the shootdown operation to free up any resources + * that were required. + */ +void arch_tlb_finish_mmu(struct mmu_gather *tlb, + unsigned long start, unsigned long end, bool force) +{ + struct mmu_gather_batch *batch, *next; + + if (force) { + __tlb_reset_range(tlb); + __tlb_adjust_range(tlb, start, end - start); + } + + tlb_flush_mmu(tlb); + + /* keep the page table cache within bounds */ + check_pgt_cache(); + + for (batch = tlb->local.next; batch; batch = next) { + next = batch->next; + free_pages((unsigned long)batch, 0); + } + tlb->local.next = NULL; +} + +/* __tlb_remove_page + * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while + * handling the additional races in SMP caused by other CPUs caching valid + * mappings in their TLBs. Returns the number of free page slots left. + * When out of page slots we must call tlb_flush_mmu(). + *returns true if the caller should flush. + */ +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) +{ + struct mmu_gather_batch *batch; + + VM_BUG_ON(!tlb->end); + VM_WARN_ON(tlb->page_size != page_size); + + batch = tlb->active; + /* + * Add the page and check if we are full. If so + * force a flush. + */ + batch->pages[batch->nr++] = page; + if (batch->nr == batch->max) { + if (!tlb_next_batch(tlb)) + return true; + batch = tlb->active; + } + VM_BUG_ON_PAGE(batch->nr > batch->max, page); + + return false; +} + +#endif /* HAVE_GENERIC_MMU_GATHER */ + +#ifdef CONFIG_HAVE_RCU_TABLE_FREE + +/* + * See the comment near struct mmu_table_batch. + */ + +/* + * If we want tlb_remove_table() to imply TLB invalidates. + */ +static inline void tlb_table_invalidate(struct mmu_gather *tlb) +{ +#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE + /* + * Invalidate page-table caches used by hardware walkers. Then we still + * need to RCU-sched wait while freeing the pages because software + * walkers can still be in-flight. + */ + tlb_flush_mmu_tlbonly(tlb); +#endif +} + +static void tlb_remove_table_smp_sync(void *arg) +{ + /* Simply deliver the interrupt */ +} + +static void tlb_remove_table_one(void *table) +{ + /* + * This isn't an RCU grace period and hence the page-tables cannot be + * assumed to be actually RCU-freed. + * + * It is however sufficient for software page-table walkers that rely on + * IRQ disabling. See the comment near struct mmu_table_batch. + */ + smp_call_function(tlb_remove_table_smp_sync, NULL, 1); + __tlb_remove_table(table); +} + +static void tlb_remove_table_rcu(struct rcu_head *head) +{ + struct mmu_table_batch *batch; + int i; + + batch = container_of(head, struct mmu_table_batch, rcu); + + for (i = 0; i < batch->nr; i++) + __tlb_remove_table(batch->tables[i]); + + free_page((unsigned long)batch); +} + +void tlb_table_flush(struct mmu_gather *tlb) +{ + struct mmu_table_batch **batch = &tlb->batch; + + if (*batch) { + tlb_table_invalidate(tlb); + call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); + *batch = NULL; + } +} + +void tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + struct mmu_table_batch **batch = &tlb->batch; + + if (*batch == NULL) { + *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); + if (*batch == NULL) { + tlb_table_invalidate(tlb); + tlb_remove_table_one(table); + return; + } + (*batch)->nr = 0; + } + + (*batch)->tables[(*batch)->nr++] = table; + if ((*batch)->nr == MAX_TABLE_BATCH) + tlb_table_flush(tlb); +} + +#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ + +/** + * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down + * @tlb: the mmu_gather structure to initialize + * @mm: the mm_struct of the target address space + * @start: start of the region that will be removed from the page-table + * @end: end of the region that will be removed from the page-table + * + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. The @start and @end are set to 0 and -1 + * respectively when @mm is without users and we're going to destroy + * the full address space (exit/execve). + */ +void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + arch_tlb_gather_mmu(tlb, mm, start, end); + inc_tlb_flush_pending(tlb->mm); +} + +void tlb_finish_mmu(struct mmu_gather *tlb, + unsigned long start, unsigned long end) +{ + /* + * If there are parallel threads are doing PTE changes on same range + * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB + * flush by batching, a thread has stable TLB entry can fail to flush + * the TLB by observing pte_none|!pte_dirty, for example so flush TLB + * forcefully if we detect parallel PTE batching threads. + */ + bool force = mm_tlb_flush_nested(tlb->mm); + + arch_tlb_finish_mmu(tlb, start, end, force); + dec_tlb_flush_pending(tlb->mm); +} diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 82bb1a939c0e..5119ff846769 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -247,37 +247,6 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); -/* - * Must be called while holding mm->mmap_sem for either read or write. - * The result is guaranteed to be valid until mm->mmap_sem is dropped. - */ -bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) -{ - struct mmu_notifier *mn; - int id; - bool ret = false; - - WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); - - if (!mm_has_notifiers(mm)) - return ret; - - id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (!mn->ops->invalidate_range && - !mn->ops->invalidate_range_start && - !mn->ops->invalidate_range_end) - continue; - - if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) { - ret = true; - break; - } - } - srcu_read_unlock(&srcu, id); - return ret; -} - static int do_mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm, int take_mmap_sem) diff --git a/mm/mremap.c b/mm/mremap.c index 5c2e18505f75..7f9f9180e401 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -115,7 +115,7 @@ static pte_t move_soft_dirty_pte(pte_t pte) static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long old_addr, unsigned long old_end, struct vm_area_struct *new_vma, pmd_t *new_pmd, - unsigned long new_addr, bool need_rmap_locks, bool *need_flush) + unsigned long new_addr, bool need_rmap_locks) { struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; @@ -163,15 +163,17 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, pte = ptep_get_and_clear(mm, old_addr, old_pte); /* - * If we are remapping a dirty PTE, make sure + * If we are remapping a valid PTE, make sure * to flush TLB before we drop the PTL for the - * old PTE or we may race with page_mkclean(). + * PTE. * - * This check has to be done after we removed the - * old PTE from page tables or another thread may - * dirty it after the check and before the removal. + * NOTE! Both old and new PTL matter: the old one + * for racing with page_mkclean(), the new one to + * make sure the physical page stays valid until + * the TLB entry for the old mapping has been + * flushed. */ - if (pte_present(pte) && pte_dirty(pte)) + if (pte_present(pte)) force_flush = true; pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); pte = move_soft_dirty_pte(pte); @@ -179,13 +181,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, } arch_leave_lazy_mmu_mode(); + if (force_flush) + flush_tlb_range(vma, old_end - len, old_end); if (new_ptl != old_ptl) spin_unlock(new_ptl); pte_unmap(new_pte - 1); - if (force_flush) - flush_tlb_range(vma, old_end - len, old_end); - else - *need_flush = true; pte_unmap_unlock(old_pte - 1, old_ptl); if (need_rmap_locks) drop_rmap_locks(vma); @@ -198,7 +198,6 @@ unsigned long move_page_tables(struct vm_area_struct *vma, { unsigned long extent, next, old_end; pmd_t *old_pmd, *new_pmd; - bool need_flush = false; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -229,8 +228,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (need_rmap_locks) take_rmap_locks(vma); moved = move_huge_pmd(vma, old_addr, new_addr, - old_end, old_pmd, new_pmd, - &need_flush); + old_end, old_pmd, new_pmd); if (need_rmap_locks) drop_rmap_locks(vma); if (moved) @@ -246,10 +244,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (extent > next - new_addr) extent = next - new_addr; move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma, - new_pmd, new_addr, need_rmap_locks, &need_flush); + new_pmd, new_addr, need_rmap_locks); } - if (need_flush) - flush_tlb_range(vma, old_end-len, old_addr); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); @@ -525,6 +521,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long ret = -EINVAL; unsigned long charged = 0; bool locked = false; + bool downgraded = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); @@ -561,12 +558,20 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. - * do_munmap does all the needed commit accounting + * __do_munmap does all the needed commit accounting, and + * downgrades mmap_sem to read if so directed. */ if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap); - if (ret && old_len != new_len) + int retval; + + retval = __do_munmap(mm, addr+new_len, old_len - new_len, + &uf_unmap, true); + if (retval < 0 && old_len != new_len) { + ret = retval; goto out; + /* Returning 1 indicates mmap_sem is downgraded to read. */ + } else if (retval == 1) + downgraded = true; ret = addr; goto out; } @@ -631,7 +636,10 @@ out: vm_unacct_memory(charged); locked = 0; } - up_write(¤t->mm->mmap_sem); + if (downgraded) + up_read(¤t->mm->mmap_sem); + else + up_write(¤t->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); userfaultfd_unmap_complete(mm, &uf_unmap_early); diff --git a/mm/nobootmem.c b/mm/nobootmem.c deleted file mode 100644 index 439af3b765a7..000000000000 --- a/mm/nobootmem.c +++ /dev/null @@ -1,445 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * bootmem - A boot-time physical memory allocator and configurator - * - * Copyright (C) 1999 Ingo Molnar - * 1999 Kanoj Sarcar, SGI - * 2008 Johannes Weiner - * - * Access to this subsystem has to be serialized externally (which is true - * for the boot process anyway). - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "internal.h" - -#ifndef CONFIG_HAVE_MEMBLOCK -#error CONFIG_HAVE_MEMBLOCK not defined -#endif - -#ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data; -EXPORT_SYMBOL(contig_page_data); -#endif - -unsigned long max_low_pfn; -unsigned long min_low_pfn; -unsigned long max_pfn; -unsigned long long max_possible_pfn; - -static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, - u64 goal, u64 limit) -{ - void *ptr; - u64 addr; - enum memblock_flags flags = choose_memblock_flags(); - - if (limit > memblock.current_limit) - limit = memblock.current_limit; - -again: - addr = memblock_find_in_range_node(size, align, goal, limit, nid, - flags); - if (!addr && (flags & MEMBLOCK_MIRROR)) { - flags &= ~MEMBLOCK_MIRROR; - pr_warn("Could not allocate %pap bytes of mirrored memory\n", - &size); - goto again; - } - if (!addr) - return NULL; - - if (memblock_reserve(addr, size)) - return NULL; - - ptr = phys_to_virt(addr); - memset(ptr, 0, size); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. - */ - kmemleak_alloc(ptr, size, 0, 0); - return ptr; -} - -/** - * free_bootmem_late - free bootmem pages directly to page allocator - * @addr: starting address of the range - * @size: size of the range in bytes - * - * This is only useful when the bootmem allocator has already been torn - * down, but we are still initializing the system. Pages are given directly - * to the page allocator, no bootmem metadata is updated because it is gone. - */ -void __init free_bootmem_late(unsigned long addr, unsigned long size) -{ - unsigned long cursor, end; - - kmemleak_free_part_phys(addr, size); - - cursor = PFN_UP(addr); - end = PFN_DOWN(addr + size); - - for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); - totalram_pages++; - } -} - -static void __init __free_pages_memory(unsigned long start, unsigned long end) -{ - int order; - - while (start < end) { - order = min(MAX_ORDER - 1UL, __ffs(start)); - - while (start + (1UL << order) > end) - order--; - - __free_pages_bootmem(pfn_to_page(start), start, order); - - start += (1UL << order); - } -} - -static unsigned long __init __free_memory_core(phys_addr_t start, - phys_addr_t end) -{ - unsigned long start_pfn = PFN_UP(start); - unsigned long end_pfn = min_t(unsigned long, - PFN_DOWN(end), max_low_pfn); - - if (start_pfn >= end_pfn) - return 0; - - __free_pages_memory(start_pfn, end_pfn); - - return end_pfn - start_pfn; -} - -static unsigned long __init free_low_memory_core_early(void) -{ - unsigned long count = 0; - phys_addr_t start, end; - u64 i; - - memblock_clear_hotplug(0, -1); - - for_each_reserved_mem_region(i, &start, &end) - reserve_bootmem_region(start, end); - - /* - * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id - * because in some case like Node0 doesn't have RAM installed - * low ram will be on Node1 - */ - for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, - NULL) - count += __free_memory_core(start, end); - - return count; -} - -static int reset_managed_pages_done __initdata; - -void reset_node_managed_pages(pg_data_t *pgdat) -{ - struct zone *z; - - for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - z->managed_pages = 0; -} - -void __init reset_all_zones_managed_pages(void) -{ - struct pglist_data *pgdat; - - if (reset_managed_pages_done) - return; - - for_each_online_pgdat(pgdat) - reset_node_managed_pages(pgdat); - - reset_managed_pages_done = 1; -} - -/** - * free_all_bootmem - release free pages to the buddy allocator - * - * Return: the number of pages actually released. - */ -unsigned long __init free_all_bootmem(void) -{ - unsigned long pages; - - reset_all_zones_managed_pages(); - - pages = free_low_memory_core_early(); - totalram_pages += pages; - - return pages; -} - -/** - * free_bootmem_node - mark a page range as usable - * @pgdat: node the range resides on - * @physaddr: starting physical address of the range - * @size: size of the range in bytes - * - * Partial pages will be considered reserved and left as they are. - * - * The range must reside completely on the specified node. - */ -void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, - unsigned long size) -{ - memblock_free(physaddr, size); -} - -/** - * free_bootmem - mark a page range as usable - * @addr: starting physical address of the range - * @size: size of the range in bytes - * - * Partial pages will be considered reserved and left as they are. - * - * The range must be contiguous but may span node boundaries. - */ -void __init free_bootmem(unsigned long addr, unsigned long size) -{ - memblock_free(addr, size); -} - -static void * __init ___alloc_bootmem_nopanic(unsigned long size, - unsigned long align, - unsigned long goal, - unsigned long limit) -{ - void *ptr; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); - -restart: - - ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); - - if (ptr) - return ptr; - - if (goal != 0) { - goal = 0; - goto restart; - } - - return NULL; -} - -/** - * __alloc_bootmem_nopanic - allocate boot memory without panicking - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * Return: address of the allocated region or %NULL on failure. - */ -void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, - unsigned long goal) -{ - unsigned long limit = -1UL; - - return ___alloc_bootmem_nopanic(size, align, goal, limit); -} - -static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ - void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); - - if (mem) - return mem; - /* - * Whoops, we cannot satisfy the allocation request. - */ - pr_alert("bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; -} - -/** - * __alloc_bootmem - allocate boot memory - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * The function panics if the request can not be satisfied. - * - * Return: address of the allocated region. - */ -void * __init __alloc_bootmem(unsigned long size, unsigned long align, - unsigned long goal) -{ - unsigned long limit = -1UL; - - return ___alloc_bootmem(size, align, goal, limit); -} - -void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, - unsigned long size, - unsigned long align, - unsigned long goal, - unsigned long limit) -{ - void *ptr; - -again: - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, limit); - if (ptr) - return ptr; - - ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, - goal, limit); - if (ptr) - return ptr; - - if (goal) { - goal = 0; - goto again; - } - - return NULL; -} - -void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); -} - -static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal, - unsigned long limit) -{ - void *ptr; - - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit); - if (ptr) - return ptr; - - pr_alert("bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; -} - -/** - * __alloc_bootmem_node - allocate boot memory from a specific node - * @pgdat: node to allocate from - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may fall back to any node in the system if the specified node - * can not hold the requested memory. - * - * The function panics if the request can not be satisfied. - * - * Return: address of the allocated region. - */ -void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - return ___alloc_bootmem_node(pgdat, size, align, goal, 0); -} - -void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - return __alloc_bootmem_node(pgdat, size, align, goal); -} - - -/** - * __alloc_bootmem_low - allocate low boot memory - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * The function panics if the request can not be satisfied. - * - * Return: address of the allocated region. - */ -void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, - unsigned long goal) -{ - return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); -} - -void * __init __alloc_bootmem_low_nopanic(unsigned long size, - unsigned long align, - unsigned long goal) -{ - return ___alloc_bootmem_nopanic(size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); -} - -/** - * __alloc_bootmem_low_node - allocate low boot memory from a specific node - * @pgdat: node to allocate from - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may fall back to any node in the system if the specified node - * can not hold the requested memory. - * - * The function panics if the request can not be satisfied. - * - * Return: address of the allocated region. - */ -void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - return ___alloc_bootmem_node(pgdat, size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); -} diff --git a/mm/nommu.c b/mm/nommu.c index e4aac33216ae..749276beb109 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1709,11 +1709,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, return ret; } -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int foll_flags) { - *page_mask = 0; return NULL; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f10aa5360616..6589f60d5018 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -859,7 +859,7 @@ static void __oom_kill_process(struct task_struct *victim) * in order to prevent the OOM victim from depleting the memory * reserves from the user space under its control. */ - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID); + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); mark_oom_victim(victim); pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), @@ -897,7 +897,7 @@ static void __oom_kill_process(struct task_struct *victim) */ if (unlikely(p->flags & PF_KTHREAD)) continue; - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID); + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID); } rcu_read_unlock(); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 84ae9bf5858a..3f690bae6b78 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2097,34 +2097,25 @@ void __init page_writeback_init(void) * dirty pages in the file (thus it is important for this function to be quick * so that it can tag pages faster than a dirtying process can create them). */ -/* - * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock - * latency. - */ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { -#define WRITEBACK_TAG_BATCH 4096 - unsigned long tagged = 0; - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + unsigned int tagged = 0; + void *page; - xa_lock_irq(&mapping->i_pages); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, - PAGECACHE_TAG_DIRTY) { - if (iter.index > end) - break; - radix_tree_iter_tag_set(&mapping->i_pages, &iter, - PAGECACHE_TAG_TOWRITE); - tagged++; - if ((tagged % WRITEBACK_TAG_BATCH) != 0) + xas_lock_irq(&xas); + xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { + xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); + if (++tagged % XA_CHECK_SCHED) continue; - slot = radix_tree_iter_resume(slot, &iter); - xa_unlock_irq(&mapping->i_pages); + + xas_pause(&xas); + xas_unlock_irq(&xas); cond_resched(); - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); } - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); } EXPORT_SYMBOL(tag_pages_for_writeback); @@ -2149,6 +2140,13 @@ EXPORT_SYMBOL(tag_pages_for_writeback); * not miss some pages (e.g., because some other process has cleared TOWRITE * tag we set). The rule we follow is that TOWRITE tag can be cleared only * by the process clearing the DIRTY tag (and submitting the page for IO). + * + * To avoid deadlocks between range_cyclic writeback and callers that hold + * pages in PageWriteback to aggregate IO until write_cache_pages() returns, + * we do not loop back to the start of the file. Doing so causes a page + * lock/page writeback access order inversion - we should only ever lock + * multiple pages in ascending page->index order, and looping back to the start + * of the file violates that rule and causes deadlocks. */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, @@ -2162,31 +2160,24 @@ int write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; - int cycled; int range_whole = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; - if (index == 0) - cycled = 1; - else - cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - cycled = 1; /* ignore range_cyclic tests */ } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; -retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; @@ -2272,17 +2263,14 @@ continue_unlock: pagevec_release(&pvec); cond_resched(); } - if (!cycled && !done) { - /* - * range_cyclic: - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - cycled = 1; - index = 0; - end = writeback_index - 1; - goto retry; - } + + /* + * If we hit the last page and there is more work to be done: wrap + * back the index back to the start of the file for the next + * time we are called. + */ + if (wbc->range_cyclic && !done) + done_index = 0; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; @@ -2445,7 +2433,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, /* * For address_spaces which do not use buffers. Just tag the page as dirty in - * its radix tree. + * the xarray. * * This is also used when a single buffer is being dirtied: we want to set the * page dirty in that case, but not all the buffers. This is a "bottom-up" @@ -2471,7 +2459,7 @@ int __set_page_dirty_nobuffers(struct page *page) BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->i_pages, page_index(page), + __xa_set_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); @@ -2634,13 +2622,13 @@ EXPORT_SYMBOL(__cancel_dirty_page); * Returns true if the page was previously dirty. * * This is for preparing to put the page under writeout. We leave the page - * tagged as dirty in the radix tree so that a concurrent write-for-sync + * tagged as dirty in the xarray so that a concurrent write-for-sync * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage * implementation will run either set_page_writeback() or set_page_dirty(), - * at which stage we bring the page's dirty flag and radix-tree dirty tag + * at which stage we bring the page's dirty flag and xarray dirty tag * back into sync. * - * This incoherency between the page's dirty flag and radix-tree tag is + * This incoherency between the page's dirty flag and xarray tag is * unfortunate, but it only exists while the page is locked. */ int clear_page_dirty_for_io(struct page *page) @@ -2721,7 +2709,7 @@ int test_clear_page_writeback(struct page *page) xa_lock_irqsave(&mapping->i_pages, flags); ret = TestClearPageWriteback(page); if (ret) { - radix_tree_tag_clear(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); @@ -2761,11 +2749,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write) lock_page_memcg(page); if (mapping && mapping_use_writeback_tags(mapping)) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - xa_lock_irqsave(&mapping->i_pages, flags); + xas_lock_irqsave(&xas, flags); + xas_load(&xas); ret = TestSetPageWriteback(page); if (!ret) { bool on_wblist; @@ -2773,8 +2763,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); - radix_tree_tag_set(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_WRITEBACK); + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); @@ -2787,12 +2776,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) sb_mark_inode_writeback(mapping->host); } if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); + xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) - radix_tree_tag_clear(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_TOWRITE); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); + xas_unlock_irqrestore(&xas, flags); } else { ret = TestSetPageWriteback(page); } @@ -2806,16 +2793,6 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } EXPORT_SYMBOL(__test_set_page_writeback); -/* - * Return true if any of the pages in the mapping are marked with the - * passed tag. - */ -int mapping_tagged(struct address_space *mapping, int tag) -{ - return radix_tree_tagged(&mapping->i_pages, tag); -} -EXPORT_SYMBOL(mapping_tagged); - /** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 89d2a2ab3fe6..a919ba5cb3c8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -66,6 +65,7 @@ #include #include #include +#include #include #include @@ -306,24 +306,33 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn) } /* - * Returns false when the remaining initialisation should be deferred until + * Returns true when the remaining initialisation should be deferred until * later in the boot cycle when it can be parallelised. */ -static inline bool update_defer_init(pg_data_t *pgdat, - unsigned long pfn, unsigned long zone_end, - unsigned long *nr_initialised) +static bool __meminit +defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { + static unsigned long prev_end_pfn, nr_initialised; + + /* + * prev_end_pfn static that contains the end of previous zone + * No need to protect because called very early in boot before smp_init. + */ + if (prev_end_pfn != end_pfn) { + prev_end_pfn = end_pfn; + nr_initialised = 0; + } + /* Always populate low zones for address-constrained allocations */ - if (zone_end < pgdat_end_pfn(pgdat)) - return true; - (*nr_initialised)++; - if ((*nr_initialised > pgdat->static_init_pgcnt) && - (pfn & (PAGES_PER_SECTION - 1)) == 0) { - pgdat->first_deferred_pfn = pfn; + if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) return false; + nr_initialised++; + if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) && + (pfn & (PAGES_PER_SECTION - 1)) == 0) { + NODE_DATA(nid)->first_deferred_pfn = pfn; + return true; } - - return true; + return false; } #else static inline bool early_page_uninitialised(unsigned long pfn) @@ -331,11 +340,9 @@ static inline bool early_page_uninitialised(unsigned long pfn) return false; } -static inline bool update_defer_init(pg_data_t *pgdat, - unsigned long pfn, unsigned long zone_end, - unsigned long *nr_initialised) +static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { - return true; + return false; } #endif @@ -1231,7 +1238,12 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) /* Avoid false-positive PageTail() */ INIT_LIST_HEAD(&page->lru); - SetPageReserved(page); + /* + * no need for atomic set_bit because the struct + * page is not visible yet so nobody should + * access it yet. + */ + __SetPageReserved(page); } } } @@ -1326,7 +1338,7 @@ meminit_pfn_in_nid(unsigned long pfn, int node, #endif -void __init __free_pages_bootmem(struct page *page, unsigned long pfn, +void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { if (early_page_uninitialised(pfn)) @@ -2015,10 +2027,6 @@ static int move_freepages(struct zone *zone, pfn_valid(page_to_pfn(end_page)) && page_zone(start_page) != page_zone(end_page)); #endif - - if (num_movable) - *num_movable = 0; - for (page = start_page; page <= end_page;) { if (!pfn_valid_within(page_to_pfn(page))) { page++; @@ -2058,6 +2066,9 @@ int move_freepages_block(struct zone *zone, struct page *page, unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; + if (num_movable) + *num_movable = 0; + start_pfn = page_to_pfn(page); start_pfn = start_pfn & ~(pageblock_nr_pages-1); start_page = pfn_to_page(start_pfn); @@ -3366,26 +3377,12 @@ try_this_zone: return NULL; } -/* - * Large machines with many possible nodes should not always dump per-node - * meminfo in irq context. - */ -static inline bool should_suppress_show_mem(void) -{ - bool ret = false; - -#if NODES_SHIFT > 8 - ret = in_interrupt(); -#endif - return ret; -} - static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) { unsigned int filter = SHOW_MEM_FILTER_NODES; static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); - if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) + if (!__ratelimit(&show_mem_rs)) return; /* @@ -3549,15 +3546,20 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum compact_priority prio, enum compact_result *compact_result) { struct page *page; + unsigned long pflags; unsigned int noreclaim_flag; if (!order) return NULL; + psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); + *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, prio); + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags); if (*compact_result <= COMPACT_INACTIVE) return NULL; @@ -3756,11 +3758,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct reclaim_state reclaim_state; int progress; unsigned int noreclaim_flag; + unsigned long pflags; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); + psi_memstall_enter(&pflags); fs_reclaim_acquire(gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; @@ -3772,6 +3776,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, current->reclaim_state = NULL; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(gfp_mask); + psi_memstall_leave(&pflags); cond_resched(); @@ -3922,6 +3927,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, { struct zone *zone; struct zoneref *z; + bool ret = false; /* * Costly allocations might have made a progress but this doesn't mean @@ -3985,25 +3991,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, } } - /* - * Memory allocation/reclaim might be called from a WQ - * context and the current implementation of the WQ - * concurrency control doesn't recognize that - * a particular WQ is congested if the worker thread is - * looping without ever sleeping. Therefore we have to - * do a short sleep here rather than calling - * cond_resched(). - */ - if (current->flags & PF_WQ_WORKER) - schedule_timeout_uninterruptible(1); - else - cond_resched(); - - return true; + ret = true; + goto out; } } - return false; +out: + /* + * Memory allocation/reclaim might be called from a WQ context and the + * current implementation of the WQ concurrency control doesn't + * recognize that a particular WQ is congested if the worker thread is + * looping without ever sleeping. Therefore we have to do a short sleep + * here rather than calling cond_resched(). + */ + if (current->flags & PF_WQ_WORKER) + schedule_timeout_uninterruptible(1); + else + cond_resched(); + return ret; } static inline bool @@ -4701,6 +4706,7 @@ long si_mem_available(void) unsigned long pagecache; unsigned long wmark_low = 0; unsigned long pages[NR_LRU_LISTS]; + unsigned long reclaimable; struct zone *zone; int lru; @@ -4726,19 +4732,13 @@ long si_mem_available(void) available += pagecache; /* - * Part of the reclaimable slab consists of items that are in use, - * and cannot be freed. Cap this estimate at the low watermark. - */ - available += global_node_page_state(NR_SLAB_RECLAIMABLE) - - min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, - wmark_low); - - /* - * Part of the kernel memory, which can be released under memory - * pressure. + * Part of the reclaimable slab and other kernel memory consists of + * items that are in use, and cannot be freed. Cap this estimate at the + * low watermark. */ - available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> - PAGE_SHIFT; + reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); + available += reclaimable - min(reclaimable / 2, wmark_low); if (available < 0) available = 0; @@ -5449,76 +5449,151 @@ void __ref build_all_zonelists(pg_data_t *pgdat) #endif } +/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ +static bool __meminit +overlap_memmap_init(unsigned long zone, unsigned long *pfn) +{ +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + static struct memblock_region *r; + + if (mirrored_kernelcore && zone == ZONE_MOVABLE) { + if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { + for_each_memblock(memory, r) { + if (*pfn < memblock_region_memory_end_pfn(r)) + break; + } + } + if (*pfn >= memblock_region_memory_base_pfn(r) && + memblock_is_mirror(r)) { + *pfn = memblock_region_memory_end_pfn(r); + return true; + } + } +#endif + return false; +} + /* * Initially all pages are reserved - free ones are freed - * up by free_all_bootmem() once the early boot process is + * up by memblock_free_all() once the early boot process is * done. Non-atomic initialization, single-pass. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context, struct vmem_altmap *altmap) { - unsigned long end_pfn = start_pfn + size; - pg_data_t *pgdat = NODE_DATA(nid); - unsigned long pfn; - unsigned long nr_initialised = 0; + unsigned long pfn, end_pfn = start_pfn + size; struct page *page; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - struct memblock_region *r = NULL, *tmp; -#endif if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; +#ifdef CONFIG_ZONE_DEVICE /* * Honor reservation requested by the driver for this ZONE_DEVICE - * memory + * memory. We limit the total number of pages to initialize to just + * those that might contain the memory mapping. We will defer the + * ZONE_DEVICE page initialization until after we have released + * the hotplug lock. */ - if (altmap && start_pfn == altmap->base_pfn) - start_pfn += altmap->reserve; + if (zone == ZONE_DEVICE) { + if (!altmap) + return; + + if (start_pfn == altmap->base_pfn) + start_pfn += altmap->reserve; + end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); + } +#endif for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* * There can be holes in boot-time mem_map[]s handed to this * function. They do not exist on hotplugged memory. */ - if (context != MEMMAP_EARLY) - goto not_early; - - if (!early_pfn_valid(pfn)) - continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; - if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) - break; - -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - /* - * Check given memblock attribute by firmware which can affect - * kernel memory layout. If zone==ZONE_MOVABLE but memory is - * mirrored, it's an overlapped memmap init. skip it. - */ - if (mirrored_kernelcore && zone == ZONE_MOVABLE) { - if (!r || pfn >= memblock_region_memory_end_pfn(r)) { - for_each_memblock(memory, tmp) - if (pfn < memblock_region_memory_end_pfn(tmp)) - break; - r = tmp; - } - if (pfn >= memblock_region_memory_base_pfn(r) && - memblock_is_mirror(r)) { - /* already initialized as NORMAL */ - pfn = memblock_region_memory_end_pfn(r); + if (context == MEMMAP_EARLY) { + if (!early_pfn_valid(pfn)) continue; - } + if (!early_pfn_in_nid(pfn, nid)) + continue; + if (overlap_memmap_init(zone, &pfn)) + continue; + if (defer_init(nid, pfn, end_pfn)) + break; } -#endif -not_early: page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); if (context == MEMMAP_HOTPLUG) - SetPageReserved(page); + __SetPageReserved(page); + + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. + * + * bitmap is created for zone's valid pfn range. but memmap + * can be created for invalid pages (for alignment) + * check here not to call set_pageblock_migratetype() against + * pfn out of zone. + */ + if (!(pfn & (pageblock_nr_pages - 1))) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + cond_resched(); + } + } +} + +#ifdef CONFIG_ZONE_DEVICE +void __ref memmap_init_zone_device(struct zone *zone, + unsigned long start_pfn, + unsigned long size, + struct dev_pagemap *pgmap) +{ + unsigned long pfn, end_pfn = start_pfn + size; + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long zone_idx = zone_idx(zone); + unsigned long start = jiffies; + int nid = pgdat->node_id; + + if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone))) + return; + + /* + * The call to memmap_init_zone should have already taken care + * of the pages reserved for the memmap, so we can just jump to + * the end of that region and start processing the device pages. + */ + if (pgmap->altmap_valid) { + struct vmem_altmap *altmap = &pgmap->altmap; + + start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); + size = end_pfn - start_pfn; + } + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_single_page(page, pfn, zone_idx, nid); + + /* + * Mark page reserved as it will need to wait for onlining + * phase for it to be fully associated with a zone. + * + * We can use the non-atomic __set_bit operation for setting + * the flag as we are still initializing the pages. + */ + __SetPageReserved(page); + + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back + * pointer and hmm_data. It is a bug if a ZONE_DEVICE + * page is ever freed or placed on a driver-private list. + */ + page->pgmap = pgmap; + page->hmm_data = 0; /* * Mark the block movable so that blocks are reserved for @@ -5540,8 +5615,12 @@ not_early: cond_resched(); } } + + pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev), + size, jiffies_to_msecs(jiffies - start)); } +#endif static void __meminit zone_init_free_lists(struct zone *zone) { unsigned int order, t; @@ -5551,10 +5630,11 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -#ifndef __HAVE_ARCH_MEMMAP_INIT -#define memmap_init(size, nid, zone, start_pfn) \ - memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL) -#endif +void __meminit __weak memmap_init(unsigned long size, int nid, + unsigned long zone, unsigned long start_pfn) +{ + memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL); +} static int zone_batchsize(struct zone *zone) { @@ -6128,7 +6208,7 @@ static void __ref setup_usemap(struct pglist_data *pgdat, zone->pageblock_flags = NULL; if (usemapsize) zone->pageblock_flags = - memblock_virt_alloc_node_nopanic(usemapsize, + memblock_alloc_node_nopanic(usemapsize, pgdat->node_id); } #else @@ -6193,17 +6273,6 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages, return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; } -#ifdef CONFIG_NUMA_BALANCING -static void pgdat_init_numabalancing(struct pglist_data *pgdat) -{ - spin_lock_init(&pgdat->numabalancing_migrate_lock); - pgdat->numabalancing_migrate_nr_pages = 0; - pgdat->numabalancing_migrate_next_window = jiffies; -} -#else -static void pgdat_init_numabalancing(struct pglist_data *pgdat) {} -#endif - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void pgdat_init_split_queue(struct pglist_data *pgdat) { @@ -6228,7 +6297,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) { pgdat_resize_init(pgdat); - pgdat_init_numabalancing(pgdat); pgdat_init_split_queue(pgdat); pgdat_init_kcompactd(pgdat); @@ -6370,7 +6438,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); + map = memblock_alloc_node_nopanic(size, pgdat->node_id); pgdat->node_mem_map = map + offset; } pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", @@ -6439,48 +6507,67 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } -#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) +#if !defined(CONFIG_FLAT_NODE_MEM_MAP) +/* + * Zero all valid struct pages in range [spfn, epfn), return number of struct + * pages zeroed + */ +static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn) +{ + unsigned long pfn; + u64 pgcnt = 0; + + for (pfn = spfn; pfn < epfn; pfn++) { + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { + pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) + + pageblock_nr_pages - 1; + continue; + } + mm_zero_struct_page(pfn_to_page(pfn)); + pgcnt++; + } + + return pgcnt; +} + /* * Only struct pages that are backed by physical memory are zeroed and * initialized by going through __init_single_page(). But, there are some * struct pages which are reserved in memblock allocator and their fields * may be accessed (for example page_to_pfn() on some configuration accesses * flags). We must explicitly zero those struct pages. + * + * This function also addresses a similar issue where struct pages are left + * uninitialized because the physical address range is not covered by + * memblock.memory or memblock.reserved. That could happen when memblock + * layout is manually configured via memmap=. */ void __init zero_resv_unavail(void) { phys_addr_t start, end; - unsigned long pfn; u64 i, pgcnt; + phys_addr_t next = 0; /* - * Loop through ranges that are reserved, but do not have reported - * physical memory backing. + * Loop through unavailable ranges not covered by memblock.memory. */ pgcnt = 0; - for_each_resv_unavail_range(i, &start, &end) { - for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) - + pageblock_nr_pages - 1; - continue; - } - mm_zero_struct_page(pfn_to_page(pfn)); - pgcnt++; - } + for_each_mem_range(i, &memblock.memory, NULL, + NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) { + if (next < start) + pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start)); + next = end; } + pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn); /* * Struct pages that do not have backing memory. This could be because * firmware is using some of this memory, or for some other reasons. - * Once memblock is changed so such behaviour is not allowed: i.e. - * list of "reserved" memory must be a subset of list of "memory", then - * this code can be removed. */ if (pgcnt) - pr_info("Reserved but unavailable: %lld pages", pgcnt); + pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); } -#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ +#endif /* !CONFIG_FLAT_NODE_MEM_MAP */ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -6815,15 +6902,12 @@ static void check_for_memory(pg_data_t *pgdat, int nid) { enum zone_type zone_type; - if (N_MEMORY == N_NORMAL_MEMORY) - return; - for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; if (populated_zone(zone)) { - node_set_state(nid, N_HIGH_MEMORY); - if (N_NORMAL_MEMORY != N_HIGH_MEMORY && - zone_type <= ZONE_NORMAL) + if (IS_ENABLED(CONFIG_HIGHMEM)) + node_set_state(nid, N_HIGH_MEMORY); + if (zone_type <= ZONE_NORMAL) node_set_state(nid, N_NORMAL_MEMORY); break; } @@ -7626,9 +7710,11 @@ void *__init alloc_large_system_hash(const char *tablename, size = bucketsize << log2qty; if (flags & HASH_EARLY) { if (flags & HASH_ZERO) - table = memblock_virt_alloc_nopanic(size, 0); + table = memblock_alloc_nopanic(size, + SMP_CACHE_BYTES); else - table = memblock_virt_alloc_raw(size, 0); + table = memblock_alloc_raw(size, + SMP_CACHE_BYTES); } else if (hashdist) { table = __vmalloc(size, gfp_flags, PAGE_KERNEL); } else { diff --git a/mm/page_ext.c b/mm/page_ext.c index a9826da84ccb..ae44f7adbe07 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include +#include #include #include #include @@ -161,9 +161,9 @@ static int __init alloc_node_page_ext(int nid) table_size = get_entry_size() * nr_pages; - base = memblock_virt_alloc_try_nid_nopanic( + base = memblock_alloc_try_nid_nopanic( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nid); + MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!base) return -ENOMEM; NODE_DATA(nid)->node_page_ext = base; diff --git a/mm/page_idle.c b/mm/page_idle.c index 6302bc62c27d..b9e4b42b33ab 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include +#include #include #include #include diff --git a/mm/page_io.c b/mm/page_io.c index aafd19ec1db4..d4d1c89bcddd 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -283,7 +283,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, struct swap_info_struct *sis = page_swap_info(page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_FS) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -294,7 +294,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, }; struct iov_iter from; - iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); + iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE); init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); @@ -365,7 +365,7 @@ int swap_readpage(struct page *page, bool synchronous) goto out; } - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_FS) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -423,7 +423,7 @@ int swap_set_page_dirty(struct page *page) { struct swap_info_struct *sis = page_swap_info(page); - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_FS) { struct address_space *mapping = sis->swap_file->f_mapping; VM_BUG_ON_PAGE(!PageSwapCache(page), page); diff --git a/mm/page_owner.c b/mm/page_owner.c index d80adfe702d3..87bc0dfdb52b 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/page_poison.c b/mm/page_poison.c index aa2b3d34e8ea..f0c15e9017c0 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -17,11 +17,16 @@ static int __init early_page_poison_param(char *buf) } early_param("page_poison", early_page_poison_param); +/** + * page_poisoning_enabled - check if page poisoning is enabled + * + * Return true if page poisoning is enabled, or false if not. + */ bool page_poisoning_enabled(void) { /* * Assumes that debug_pagealloc_enabled is set before - * free_all_bootmem. + * memblock_free_all. * Page poisoning is debug page alloc for some arches. If * either of those options are enabled, enable poisoning. */ @@ -29,6 +34,7 @@ bool page_poisoning_enabled(void) (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && debug_pagealloc_enabled())); } +EXPORT_SYMBOL_GPL(page_poisoning_enabled); static void poison_page(struct page *page) { diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index ae3c2a35d61b..11df03e71288 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -21,7 +21,29 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw) if (!is_swap_pte(*pvmw->pte)) return false; } else { - if (!pte_present(*pvmw->pte)) + /* + * We get here when we are trying to unmap a private + * device page from the process address space. Such + * page is not CPU accessible and thus is mapped as + * a special swap entry, nonetheless it still does + * count as a valid regular mapping for the page (and + * is accounted as such in page maps count). + * + * So handle this special case as if it was a normal + * page mapping ie lock CPU page table and returns + * true. + * + * For more details on device private memory see HMM + * (include/linux/hmm.h or mm/hmm.c). + */ + if (is_swap_pte(*pvmw->pte)) { + swp_entry_t entry; + + /* Handle un-addressable ZONE_DEVICE memory */ + entry = pte_to_swp_entry(*pvmw->pte); + if (!is_device_private_entry(entry)) + return false; + } else if (!pte_present(*pvmw->pte)) return false; } } diff --git a/mm/percpu.c b/mm/percpu.c index a749d4d96e3e..db86282fd024 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -65,7 +65,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include +#include #include #include #include @@ -1101,9 +1101,9 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, region_size = ALIGN(start_offset + map_size, lcm_align); /* allocate chunk */ - chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) + - BITS_TO_LONGS(region_size >> PAGE_SHIFT), - 0); + chunk = memblock_alloc(sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(region_size >> PAGE_SHIFT), + SMP_CACHE_BYTES); INIT_LIST_HEAD(&chunk->list); @@ -1114,12 +1114,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, chunk->nr_pages = region_size >> PAGE_SHIFT; region_bits = pcpu_chunk_map_bits(chunk); - chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) * - sizeof(chunk->alloc_map[0]), 0); - chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) * - sizeof(chunk->bound_map[0]), 0); - chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) * - sizeof(chunk->md_blocks[0]), 0); + chunk->alloc_map = memblock_alloc(BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]), + SMP_CACHE_BYTES); + chunk->bound_map = memblock_alloc(BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]), + SMP_CACHE_BYTES); + chunk->md_blocks = memblock_alloc(pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]), + SMP_CACHE_BYTES); pcpu_init_md_blocks(chunk); /* manage populated page bitmap */ @@ -1212,6 +1212,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; + pcpu_mem_free(chunk->md_blocks); pcpu_mem_free(chunk->bound_map); pcpu_mem_free(chunk->alloc_map); pcpu_mem_free(chunk); @@ -1887,7 +1888,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); - ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE); + ptr = memblock_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE); if (!ptr) return NULL; ai = ptr; @@ -2074,12 +2075,14 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ - group_offsets = memblock_virt_alloc(ai->nr_groups * - sizeof(group_offsets[0]), 0); - group_sizes = memblock_virt_alloc(ai->nr_groups * - sizeof(group_sizes[0]), 0); - unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); - unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); + group_offsets = memblock_alloc(ai->nr_groups * sizeof(group_offsets[0]), + SMP_CACHE_BYTES); + group_sizes = memblock_alloc(ai->nr_groups * sizeof(group_sizes[0]), + SMP_CACHE_BYTES); + unit_map = memblock_alloc(nr_cpu_ids * sizeof(unit_map[0]), + SMP_CACHE_BYTES); + unit_off = memblock_alloc(nr_cpu_ids * sizeof(unit_off[0]), + SMP_CACHE_BYTES); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -2143,8 +2146,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * empty chunks. */ pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; - pcpu_slot = memblock_virt_alloc( - pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); + pcpu_slot = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]), + SMP_CACHE_BYTES); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); @@ -2457,7 +2460,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - areas = memblock_virt_alloc_nopanic(areas_size, 0); + areas = memblock_alloc_nopanic(areas_size, SMP_CACHE_BYTES); if (!areas) { rc = -ENOMEM; goto out_free; @@ -2588,7 +2591,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, BUG_ON(ai->nr_groups != 1); upa = ai->alloc_size/ai->unit_size; nr_g0_units = roundup(num_possible_cpus(), upa); - if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) { + if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) { pcpu_free_alloc_info(ai); return -EINVAL; } @@ -2598,7 +2601,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); - pages = memblock_virt_alloc(pages_size, 0); + pages = memblock_alloc(pages_size, SMP_CACHE_BYTES); /* allocate pages */ j = 0; @@ -2687,7 +2690,7 @@ EXPORT_SYMBOL(__per_cpu_offset); static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return memblock_virt_alloc_from_nopanic( + return memblock_alloc_from_nopanic( size, align, __pa(MAX_DMA_ADDRESS)); } @@ -2736,7 +2739,7 @@ void __init setup_per_cpu_areas(void) void *fc; ai = pcpu_alloc_alloc_info(1, 1); - fc = memblock_virt_alloc_from_nopanic(unit_size, + fc = memblock_alloc_from_nopanic(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index cf2af04b34b9..532c29276fce 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -8,6 +8,7 @@ */ #include +#include #include #include diff --git a/mm/readahead.c b/mm/readahead.c index 4e630143a0ba..f3d6f9656a3c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -176,10 +176,8 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping, if (page_offset > end_index) break; - rcu_read_lock(); - page = radix_tree_lookup(&mapping->i_pages, page_offset); - rcu_read_unlock(); - if (page && !radix_tree_exceptional_entry(page)) { + page = xa_load(&mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) { /* * Page already present? Kick off the current batch of * contiguous pages before continuing with the next @@ -336,7 +334,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, pgoff_t head; rcu_read_lock(); - head = page_cache_prev_hole(mapping, offset - 1, max); + head = page_cache_prev_miss(mapping, offset - 1, max); rcu_read_unlock(); return offset - 1 - head; @@ -425,7 +423,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_hole(mapping, offset + 1, max_pages); + start = page_cache_next_miss(mapping, offset + 1, max_pages); rcu_read_unlock(); if (!start || start - offset > max_pages) diff --git a/mm/rmap.c b/mm/rmap.c index eb477809a5c0..1e79fac3186b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1362,11 +1362,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } /* - * We have to assume the worse case ie pmd for invalidation. Note that - * the page can not be free in this function as call of try_to_unmap() - * must hold a reference on the page. + * For THP, we have to assume the worse case ie pmd for invalidation. + * For hugetlb, it could be much worse if we need to do pud + * invalidation in the case of pmd sharing. + * + * Note that the page can not be free in this function as call of + * try_to_unmap() must hold a reference on the page. */ end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + if (PageHuge(page)) { + /* + * If sharing is possible, start and end will be adjusted + * accordingly. + */ + adjust_range_if_pmd_sharing_possible(vma, &start, &end); + } mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); while (page_vma_mapped_walk(&pvmw)) { @@ -1409,6 +1419,32 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); address = pvmw.address; + if (PageHuge(page)) { + if (huge_pmd_unshare(mm, &address, pvmw.pte)) { + /* + * huge_pmd_unshare unmapped an entire PMD + * page. There is no way of knowing exactly + * which PMDs may be cached for this mm, so + * we must flush them all. start/end were + * already adjusted above to cover this range. + */ + flush_cache_range(vma, start, end); + flush_tlb_range(vma, start, end); + mmu_notifier_invalidate_range(mm, start, end); + + /* + * The ref count of the PMD page was dropped + * which is part of the way map counting + * is done for shared PMDs. Return 'true' + * here. When there is no other sharing, + * huge_pmd_unshare returns false and we will + * unmap the actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + } if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && diff --git a/mm/shmem.c b/mm/shmem.c index 446942677cd4..ea26d7a0342d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -322,24 +322,20 @@ void shmem_uncharge(struct inode *inode, long pages) } /* - * Replace item expected in radix tree by a new item, while holding tree lock. + * Replace item expected in xarray by a new item, while holding xa_lock. */ -static int shmem_radix_tree_replace(struct address_space *mapping, +static int shmem_replace_entry(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { - struct radix_tree_node *node; - void __rcu **pslot; + XA_STATE(xas, &mapping->i_pages, index); void *item; VM_BUG_ON(!expected); VM_BUG_ON(!replacement); - item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot); - if (!item) - return -ENOENT; + item = xas_load(&xas); if (item != expected) return -ENOENT; - __radix_tree_replace(&mapping->i_pages, node, pslot, - replacement, NULL); + xas_store(&xas, replacement); return 0; } @@ -353,12 +349,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping, static bool shmem_confirm_swap(struct address_space *mapping, pgoff_t index, swp_entry_t swap) { - void *item; - - rcu_read_lock(); - item = radix_tree_lookup(&mapping->i_pages, index); - rcu_read_unlock(); - return item == swp_to_radix_entry(swap); + return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); } /* @@ -586,9 +577,11 @@ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, void *expected) + pgoff_t index, void *expected, gfp_t gfp) { - int error, nr = hpage_nr_pages(page); + XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); + unsigned long i = 0; + unsigned long nr = 1UL << compound_order(page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(index != round_down(index, nr), page); @@ -600,47 +593,39 @@ static int shmem_add_to_page_cache(struct page *page, page->mapping = mapping; page->index = index; - xa_lock_irq(&mapping->i_pages); - if (PageTransHuge(page)) { - void __rcu **results; - pgoff_t idx; - int i; - - error = 0; - if (radix_tree_gang_lookup_slot(&mapping->i_pages, - &results, &idx, index, 1) && - idx < index + HPAGE_PMD_NR) { - error = -EEXIST; + do { + void *entry; + xas_lock_irq(&xas); + entry = xas_find_conflict(&xas); + if (entry != expected) + xas_set_err(&xas, -EEXIST); + xas_create_range(&xas); + if (xas_error(&xas)) + goto unlock; +next: + xas_store(&xas, page + i); + if (++i < nr) { + xas_next(&xas); + goto next; } - - if (!error) { - for (i = 0; i < HPAGE_PMD_NR; i++) { - error = radix_tree_insert(&mapping->i_pages, - index + i, page + i); - VM_BUG_ON(error); - } + if (PageTransHuge(page)) { count_vm_event(THP_FILE_ALLOC); + __inc_node_page_state(page, NR_SHMEM_THPS); } - } else if (!expected) { - error = radix_tree_insert(&mapping->i_pages, index, page); - } else { - error = shmem_radix_tree_replace(mapping, index, expected, - page); - } - - if (!error) { mapping->nrpages += nr; - if (PageTransHuge(page)) - __inc_node_page_state(page, NR_SHMEM_THPS); __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); - xa_unlock_irq(&mapping->i_pages); - } else { +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp)); + + if (xas_error(&xas)) { page->mapping = NULL; - xa_unlock_irq(&mapping->i_pages); page_ref_sub(page, nr); + return xas_error(&xas); } - return error; + + return 0; } /* @@ -654,7 +639,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) VM_BUG_ON_PAGE(PageCompound(page), page); xa_lock_irq(&mapping->i_pages); - error = shmem_radix_tree_replace(mapping, page->index, page, radswap); + error = shmem_replace_entry(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; __dec_node_page_state(page, NR_FILE_PAGES); @@ -665,7 +650,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) } /* - * Remove swap entry from radix tree, free the swap and its page cache. + * Remove swap entry from page cache, free the swap and its page cache. */ static int shmem_free_swap(struct address_space *mapping, pgoff_t index, void *radswap) @@ -673,7 +658,7 @@ static int shmem_free_swap(struct address_space *mapping, void *old; xa_lock_irq(&mapping->i_pages); - old = radix_tree_delete_item(&mapping->i_pages, index, radswap); + old = __xa_cmpxchg(&mapping->i_pages, index, radswap, NULL, 0); xa_unlock_irq(&mapping->i_pages); if (old != radswap) return -ENOENT; @@ -691,29 +676,19 @@ static int shmem_free_swap(struct address_space *mapping, unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { - struct radix_tree_iter iter; - void __rcu **slot; + XA_STATE(xas, &mapping->i_pages, start); struct page *page; unsigned long swapped = 0; rcu_read_lock(); - - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - if (iter.index >= end) - break; - - page = radix_tree_deref_slot(slot); - - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); + xas_for_each(&xas, page, end - 1) { + if (xas_retry(&xas, page)) continue; - } - - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) swapped++; if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); cond_resched_rcu(); } } @@ -788,7 +763,7 @@ void shmem_unlock_mapping(struct address_space *mapping) } /* - * Remove range of pages and swap entries from radix tree, and free them. + * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, @@ -824,7 +799,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, @@ -921,7 +896,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (unfalloc) continue; if (shmem_free_swap(mapping, index, page)) { @@ -1110,34 +1085,27 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } -static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) +static unsigned long find_swap_entry(struct xarray *xa, void *item) { - struct radix_tree_iter iter; - void __rcu **slot; - unsigned long found = -1; + XA_STATE(xas, xa, 0); unsigned int checked = 0; + void *entry; rcu_read_lock(); - radix_tree_for_each_slot(slot, root, &iter, 0) { - void *entry = radix_tree_deref_slot(slot); - - if (radix_tree_deref_retry(entry)) { - slot = radix_tree_iter_retry(&iter); + xas_for_each(&xas, entry, ULONG_MAX) { + if (xas_retry(&xas, entry)) continue; - } - if (entry == item) { - found = iter.index; + if (entry == item) break; - } checked++; - if ((checked % 4096) != 0) + if ((checked % XA_CHECK_SCHED) != 0) continue; - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); cond_resched_rcu(); } - rcu_read_unlock(); - return found; + + return entry ? xas.xa_index : -1; } /* @@ -1175,10 +1143,10 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, * We needed to drop mutex to make that restrictive page * allocation, but the inode might have been freed while we * dropped it: although a racing shmem_evict_inode() cannot - * complete without emptying the radix_tree, our page lock + * complete without emptying the page cache, our page lock * on this swapcache page is not enough to prevent that - * free_swap_and_cache() of our swap entry will only - * trylock_page(), removing swap from radix_tree whatever. + * trylock_page(), removing swap from page cache whatever. * * We must not proceed to shmem_add_to_page_cache() if the * inode has been freed, but of course we cannot rely on @@ -1200,7 +1168,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, */ if (!error) error = shmem_add_to_page_cache(*pagep, mapping, index, - radswap); + radswap, gfp); if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which @@ -1244,7 +1212,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) &memcg, false); if (error) goto out; - /* No radix_tree_preload: swap entry keeps a place for page in tree */ + /* No memory allocation: swap entry occupies the slot for the page */ error = -EAGAIN; mutex_lock(&shmem_swaplist_mutex); @@ -1453,27 +1421,21 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; - struct inode *inode = &info->vfs_inode; - struct address_space *mapping = inode->i_mapping; - pgoff_t idx, hindex; - void __rcu **results; + struct address_space *mapping = info->vfs_inode.i_mapping; + pgoff_t hindex; struct page *page; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) return NULL; hindex = round_down(index, HPAGE_PMD_NR); - rcu_read_lock(); - if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, - hindex, 1) && idx < hindex + HPAGE_PMD_NR) { - rcu_read_unlock(); + if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, + XA_PRESENT)) return NULL; - } - rcu_read_unlock(); shmem_pseudo_vma_init(&pvma, info, hindex); page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); + HPAGE_PMD_ORDER, &pvma, 0, numa_node_id()); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); @@ -1578,8 +1540,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * a nice clean interface for us to replace oldpage by newpage there. */ xa_lock_irq(&swap_mapping->i_pages); - error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, - newpage); + error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); if (!error) { __inc_node_page_state(newpage, NR_FILE_PAGES); __dec_node_page_state(oldpage, NR_FILE_PAGES); @@ -1643,7 +1604,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, repeat: swap.val = 0; page = find_lock_entry(mapping, index); - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { swap = radix_to_swp_entry(page); page = NULL; } @@ -1718,7 +1679,7 @@ repeat: false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - swp_to_radix_entry(swap)); + swp_to_radix_entry(swap), gfp); /* * We already confirmed swap under page lock, and make * no memory allocation here, so usually no possibility @@ -1824,13 +1785,8 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, PageTransHuge(page)); if (error) goto unacct; - error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK, - compound_order(page)); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, hindex, - NULL); - radix_tree_preload_end(); - } + error = shmem_add_to_page_cache(page, mapping, hindex, + NULL, gfp & GFP_RECLAIM_MASK); if (error) { mem_cgroup_cancel_charge(page, memcg, PageTransHuge(page)); @@ -1931,7 +1887,7 @@ unlock: spin_unlock_irq(&info->lock); goto repeat; } - if (error == -EEXIST) /* from above or from radix_tree_insert */ + if (error == -EEXIST) goto repeat; return error; } @@ -2299,11 +2255,8 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (ret) goto out_release; - ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); - if (!ret) { - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL); - radix_tree_preload_end(); - } + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, + gfp & GFP_RECLAIM_MASK); if (ret) goto out_release_uncharge; @@ -2548,7 +2501,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /* - * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. + * llseek SEEK_DATA or SEEK_HOLE through the page cache. */ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, pgoff_t index, pgoff_t end, int whence) @@ -2578,7 +2531,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, index = indices[i]; } page = pvec.pages[i]; - if (page && !radix_tree_exceptional_entry(page)) { + if (page && !xa_is_value(page)) { if (!PageUptodate(page)) page = NULL; } diff --git a/mm/slab.c b/mm/slab.c index aa76a70e087e..2a5654bb3b3f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1288,7 +1288,7 @@ void __init kmem_cache_init(void) * Initialize the caches that provide memory for the kmem_cache_node * structures first. Without this, further allocations will bug. */ - kmalloc_caches[INDEX_NODE] = create_kmalloc_cache( + kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache( kmalloc_info[INDEX_NODE].name, kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS, 0, kmalloc_size(INDEX_NODE)); @@ -1304,7 +1304,7 @@ void __init kmem_cache_init(void) for_each_online_node(nid) { init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); - init_list(kmalloc_caches[INDEX_NODE], + init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE], &init_kmem_cache_node[SIZE_NODE + nid], nid); } } @@ -3675,6 +3675,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) struct kmem_cache *cachep; void *ret; + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; @@ -3710,6 +3712,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, struct kmem_cache *cachep; void *ret; + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; diff --git a/mm/slab_common.c b/mm/slab_common.c index fea3376f9816..7eb8dc136c1c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -973,14 +973,10 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, return s; } -struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; +struct kmem_cache * +kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init; EXPORT_SYMBOL(kmalloc_caches); -#ifdef CONFIG_ZONE_DMA -struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; -EXPORT_SYMBOL(kmalloc_dma_caches); -#endif - /* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. This is necessary for slabs < 192 since we have non power @@ -1027,25 +1023,20 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) { unsigned int index; - if (unlikely(size > KMALLOC_MAX_SIZE)) { - WARN_ON_ONCE(!(flags & __GFP_NOWARN)); - return NULL; - } - if (size <= 192) { if (!size) return ZERO_SIZE_PTR; index = size_index[size_index_elem(size)]; - } else + } else { + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + WARN_ON(1); + return NULL; + } index = fls(size - 1); + } -#ifdef CONFIG_ZONE_DMA - if (unlikely((flags & GFP_DMA))) - return kmalloc_dma_caches[index]; - -#endif - return kmalloc_caches[index]; + return kmalloc_caches[kmalloc_type(flags)][index]; } /* @@ -1059,15 +1050,15 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = { {"kmalloc-16", 16}, {"kmalloc-32", 32}, {"kmalloc-64", 64}, {"kmalloc-128", 128}, {"kmalloc-256", 256}, {"kmalloc-512", 512}, - {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048}, - {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192}, - {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768}, - {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072}, - {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288}, - {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152}, - {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608}, - {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432}, - {"kmalloc-67108864", 67108864} + {"kmalloc-1k", 1024}, {"kmalloc-2k", 2048}, + {"kmalloc-4k", 4096}, {"kmalloc-8k", 8192}, + {"kmalloc-16k", 16384}, {"kmalloc-32k", 32768}, + {"kmalloc-64k", 65536}, {"kmalloc-128k", 131072}, + {"kmalloc-256k", 262144}, {"kmalloc-512k", 524288}, + {"kmalloc-1M", 1048576}, {"kmalloc-2M", 2097152}, + {"kmalloc-4M", 4194304}, {"kmalloc-8M", 8388608}, + {"kmalloc-16M", 16777216}, {"kmalloc-32M", 33554432}, + {"kmalloc-64M", 67108864} }; /* @@ -1117,9 +1108,36 @@ void __init setup_kmalloc_cache_index_table(void) } } -static void __init new_kmalloc_cache(int idx, slab_flags_t flags) +static const char * +kmalloc_cache_name(const char *prefix, unsigned int size) +{ + + static const char units[3] = "\0kM"; + int idx = 0; + + while (size >= 1024 && (size % 1024 == 0)) { + size /= 1024; + idx++; + } + + return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]); +} + +static void __init +new_kmalloc_cache(int idx, int type, slab_flags_t flags) { - kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, + const char *name; + + if (type == KMALLOC_RECLAIM) { + flags |= SLAB_RECLAIM_ACCOUNT; + name = kmalloc_cache_name("kmalloc-rcl", + kmalloc_info[idx].size); + BUG_ON(!name); + } else { + name = kmalloc_info[idx].name; + } + + kmalloc_caches[type][idx] = create_kmalloc_cache(name, kmalloc_info[idx].size, flags, 0, kmalloc_info[idx].size); } @@ -1131,21 +1149,25 @@ static void __init new_kmalloc_cache(int idx, slab_flags_t flags) */ void __init create_kmalloc_caches(slab_flags_t flags) { - int i; + int i, type; - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[i]) - new_kmalloc_cache(i, flags); + for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) { + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { + if (!kmalloc_caches[type][i]) + new_kmalloc_cache(i, type, flags); - /* - * Caches that are not of the two-to-the-power-of size. - * These have to be created immediately after the - * earlier power of two caches - */ - if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) - new_kmalloc_cache(1, flags); - if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) - new_kmalloc_cache(2, flags); + /* + * Caches that are not of the two-to-the-power-of size. + * These have to be created immediately after the + * earlier power of two caches + */ + if (KMALLOC_MIN_SIZE <= 32 && i == 6 && + !kmalloc_caches[type][1]) + new_kmalloc_cache(1, type, flags); + if (KMALLOC_MIN_SIZE <= 64 && i == 7 && + !kmalloc_caches[type][2]) + new_kmalloc_cache(2, type, flags); + } } /* Kmalloc array is now usable */ @@ -1153,16 +1175,15 @@ void __init create_kmalloc_caches(slab_flags_t flags) #ifdef CONFIG_ZONE_DMA for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache *s = kmalloc_caches[i]; + struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i]; if (s) { unsigned int size = kmalloc_size(i); - char *n = kasprintf(GFP_NOWAIT, - "dma-kmalloc-%u", size); + const char *n = kmalloc_cache_name("dma-kmalloc", size); BUG_ON(!n); - kmalloc_dma_caches[i] = create_kmalloc_cache(n, - size, SLAB_CACHE_DMA | flags, 0, 0); + kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( + n, size, SLAB_CACHE_DMA | flags, 0, 0); } } #endif diff --git a/mm/slub.c b/mm/slub.c index 8da34a8af53d..e3629cd7aff1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1276,16 +1276,54 @@ out: __setup("slub_debug", setup_slub_debug); +/* + * kmem_cache_flags - apply debugging options to the cache + * @object_size: the size of an object without meta data + * @flags: flags to set + * @name: name of the cache + * @ctor: constructor function + * + * Debug option(s) are applied to @flags. In addition to the debug + * option(s), if a slab name (or multiple) is specified i.e. + * slub_debug=,, ... + * then only the select slabs will receive the debug option(s). + */ slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { - /* - * Enable debugging if selected on the kernel commandline. - */ - if (slub_debug && (!slub_debug_slabs || (name && - !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) - flags |= slub_debug; + char *iter; + size_t len; + + /* If slub_debug = 0, it folds into the if conditional. */ + if (!slub_debug_slabs) + return flags | slub_debug; + + len = strlen(name); + iter = slub_debug_slabs; + while (*iter) { + char *end, *glob; + size_t cmplen; + + end = strchr(iter, ','); + if (!end) + end = iter + strlen(iter); + + glob = strnchr(iter, end - iter, '*'); + if (glob) + cmplen = glob - iter; + else + cmplen = max_t(size_t, len, (end - iter)); + + if (!strncmp(name, iter, cmplen)) { + flags |= slub_debug; + break; + } + + if (!*end) + break; + iter = end + 1; + } return flags; } @@ -3621,9 +3659,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects), - sizeof(long), - GFP_ATOMIC); + unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC); if (!map) return; slab_err(s, page, text, s->name); @@ -3638,7 +3674,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, } } slab_unlock(page); - kfree(map); + bitmap_free(map); #endif } @@ -4411,10 +4447,8 @@ static long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; - unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)), - sizeof(unsigned long), - GFP_KERNEL); struct kmem_cache_node *n; + unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); if (!map) return -ENOMEM; @@ -4422,7 +4456,7 @@ static long validate_slab_cache(struct kmem_cache *s) flush_all(s); for_each_kmem_cache_node(s, node, n) count += validate_slab_node(s, n, map); - kfree(map); + bitmap_free(map); return count; } /* @@ -4573,14 +4607,12 @@ static int list_locations(struct kmem_cache *s, char *buf, unsigned long i; struct loc_track t = { 0, 0, NULL }; int node; - unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)), - sizeof(unsigned long), - GFP_KERNEL); struct kmem_cache_node *n; + unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { - kfree(map); + bitmap_free(map); return sprintf(buf, "Out of memory\n"); } /* Push back cpu slabs */ @@ -4646,7 +4678,7 @@ static int list_locations(struct kmem_cache *s, char *buf, } free_loc_track(&t); - kfree(map); + bitmap_free(map); if (!t.count) len += sprintf(buf, "No data\n"); return len; @@ -4657,6 +4689,7 @@ static int list_locations(struct kmem_cache *s, char *buf, static void __init resiliency_test(void) { u8 *p; + int type = KMALLOC_NORMAL; BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); @@ -4669,7 +4702,7 @@ static void __init resiliency_test(void) pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", p + 16); - validate_slab_cache(kmalloc_caches[4]); + validate_slab_cache(kmalloc_caches[type][4]); /* Hmmm... The next two are dangerous */ p = kzalloc(32, GFP_KERNEL); @@ -4678,33 +4711,33 @@ static void __init resiliency_test(void) p); pr_err("If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches[5]); + validate_slab_cache(kmalloc_caches[type][5]); p = kzalloc(64, GFP_KERNEL); p += 64 + (get_cycles() & 0xff) * sizeof(void *); *p = 0x56; pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", p); pr_err("If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches[6]); + validate_slab_cache(kmalloc_caches[type][6]); pr_err("\nB. Corruption after free\n"); p = kzalloc(128, GFP_KERNEL); kfree(p); *p = 0x78; pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[7]); + validate_slab_cache(kmalloc_caches[type][7]); p = kzalloc(256, GFP_KERNEL); kfree(p); p[50] = 0x9a; pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[8]); + validate_slab_cache(kmalloc_caches[type][8]); p = kzalloc(512, GFP_KERNEL); kfree(p); p[512] = 0xab; pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[9]); + validate_slab_cache(kmalloc_caches[type][9]); } #else #ifdef CONFIG_SYSFS diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 8301293331a2..7fec05796796 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -20,7 +20,7 @@ */ #include #include -#include +#include #include #include #include @@ -42,8 +42,8 @@ static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return memblock_virt_alloc_try_nid_raw(size, align, goal, - BOOTMEM_ALLOC_ACCESSIBLE, node); + return memblock_alloc_try_nid_raw(size, align, goal, + MEMBLOCK_ALLOC_ACCESSIBLE, node); } void * __meminit vmemmap_alloc_block(unsigned long size, int node) diff --git a/mm/sparse.c b/mm/sparse.c index 10b07eea9a6e..33307fc05c4d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include @@ -68,7 +68,8 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid) if (slab_is_available()) section = kzalloc_node(array_size, GFP_KERNEL, nid); else - section = memblock_virt_alloc_node(array_size, nid); + section = memblock_alloc_node(array_size, SMP_CACHE_BYTES, + nid); return section; } @@ -216,7 +217,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) size = sizeof(struct mem_section*) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_virt_alloc(size, align); + mem_section = memblock_alloc(size, align); } #endif @@ -306,7 +307,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, limit = goal + (1UL << PA_SECTION_SHIFT); nid = early_pfn_to_nid(goal >> PAGE_SHIFT); again: - p = memblock_virt_alloc_try_nid_nopanic(size, + p = memblock_alloc_try_nid_nopanic(size, SMP_CACHE_BYTES, goal, limit, nid); if (!p && limit) { @@ -362,7 +363,7 @@ static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { - return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); + return memblock_alloc_node_nopanic(size, pgdat->node_id); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -391,9 +392,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, if (map) return map; - map = memblock_virt_alloc_try_nid(size, + map = memblock_alloc_try_nid(size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nid); + MEMBLOCK_ALLOC_ACCESSIBLE, nid); return map; } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ @@ -405,9 +406,9 @@ static void __init sparse_buffer_init(unsigned long size, int nid) { WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */ sparsemap_buf = - memblock_virt_alloc_try_nid_raw(size, PAGE_SIZE, + memblock_alloc_try_nid_raw(size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nid); + MEMBLOCK_ALLOC_ACCESSIBLE, nid); sparsemap_buf_end = sparsemap_buf + size; } @@ -696,13 +697,11 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, goto out; } -#ifdef CONFIG_DEBUG_VM /* * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION); -#endif + page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION); section_mark_present(ms); sparse_init_one_section(ms, section_nr, memmap, usemap); diff --git a/mm/swap.c b/mm/swap.c index 26fc9b5f1b6c..aa483719922e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -965,7 +964,7 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) for (i = 0, j = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (!radix_tree_exceptional_entry(page)) + if (!xa_is_value(page)) pvec->pages[j++] = page; } pvec->nr = j; @@ -1002,7 +1001,7 @@ EXPORT_SYMBOL(pagevec_lookup_range); unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag) + xa_mark_t tag) { pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, PAGEVEC_SIZE, pvec->pages); @@ -1012,7 +1011,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_tag); unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned max_pages) + xa_mark_t tag, unsigned max_pages) { pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); diff --git a/mm/swap_state.c b/mm/swap_state.c index ecee9c6c4cc1..fd2f21e1c60a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -107,14 +107,15 @@ void show_swap_cache_info(void) } /* - * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, + * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -int __add_to_swap_cache(struct page *page, swp_entry_t entry) +int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) { - int error, i, nr = hpage_nr_pages(page); - struct address_space *address_space; + struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); + XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); + unsigned long i, nr = 1UL << compound_order(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); @@ -123,73 +124,52 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) page_ref_add(page, nr); SetPageSwapCache(page); - address_space = swap_address_space(entry); - xa_lock_irq(&address_space->i_pages); - for (i = 0; i < nr; i++) { - set_page_private(page + i, entry.val + i); - error = radix_tree_insert(&address_space->i_pages, - idx + i, page + i); - if (unlikely(error)) - break; - } - if (likely(!error)) { + do { + xas_lock_irq(&xas); + xas_create_range(&xas); + if (xas_error(&xas)) + goto unlock; + for (i = 0; i < nr; i++) { + VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); + set_page_private(page + i, entry.val + i); + xas_store(&xas, page + i); + xas_next(&xas); + } address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); ADD_CACHE_INFO(add_total, nr); - } else { - /* - * Only the context which have set SWAP_HAS_CACHE flag - * would call add_to_swap_cache(). - * So add_to_swap_cache() doesn't returns -EEXIST. - */ - VM_BUG_ON(error == -EEXIST); - set_page_private(page + i, 0UL); - while (i--) { - radix_tree_delete(&address_space->i_pages, idx + i); - set_page_private(page + i, 0UL); - } - ClearPageSwapCache(page); - page_ref_sub(page, nr); - } - xa_unlock_irq(&address_space->i_pages); +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp)); - return error; -} - - -int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) -{ - int error; + if (!xas_error(&xas)) + return 0; - error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page)); - if (!error) { - error = __add_to_swap_cache(page, entry); - radix_tree_preload_end(); - } - return error; + ClearPageSwapCache(page); + page_ref_sub(page, nr); + return xas_error(&xas); } /* * This must be called only on pages that have * been verified to be in the swap cache. */ -void __delete_from_swap_cache(struct page *page) +void __delete_from_swap_cache(struct page *page, swp_entry_t entry) { - struct address_space *address_space; + struct address_space *address_space = swap_address_space(entry); int i, nr = hpage_nr_pages(page); - swp_entry_t entry; - pgoff_t idx; + pgoff_t idx = swp_offset(entry); + XA_STATE(xas, &address_space->i_pages, idx); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(PageWriteback(page), page); - entry.val = page_private(page); - address_space = swap_address_space(entry); - idx = swp_offset(entry); for (i = 0; i < nr; i++) { - radix_tree_delete(&address_space->i_pages, idx + i); + void *entry = xas_store(&xas, NULL); + VM_BUG_ON_PAGE(entry != page + i, entry); set_page_private(page + i, 0); + xas_next(&xas); } ClearPageSwapCache(page); address_space->nrpages -= nr; @@ -217,7 +197,7 @@ int add_to_swap(struct page *page) return 0; /* - * Radix-tree node allocations from PF_MEMALLOC contexts could + * XArray node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC * stops emergency reserves from being allocated. * @@ -229,7 +209,6 @@ int add_to_swap(struct page *page) */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - /* -ENOMEM radix-tree allocation failure */ if (err) /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely @@ -263,14 +242,11 @@ fail: */ void delete_from_swap_cache(struct page *page) { - swp_entry_t entry; - struct address_space *address_space; + swp_entry_t entry = { .val = page_private(page) }; + struct address_space *address_space = swap_address_space(entry); - entry.val = page_private(page); - - address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); - __delete_from_swap_cache(page); + __delete_from_swap_cache(page, entry); xa_unlock_irq(&address_space->i_pages); put_swap_page(page, entry); @@ -413,19 +389,11 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, break; /* Out of memory */ } - /* - * call radix_tree_preload() while we can wait. - */ - err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); - if (err) - break; - /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); if (err == -EEXIST) { - radix_tree_preload_end(); /* * We might race against get_swap_page() and stumble * across a SWAP_HAS_CACHE swap_map entry whose page @@ -433,26 +401,20 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ cond_resched(); continue; - } - if (err) { /* swp entry is obsolete ? */ - radix_tree_preload_end(); + } else if (err) /* swp entry is obsolete ? */ break; - } - /* May fail (-ENOMEM) if radix-tree node allocation failed. */ + /* May fail (-ENOMEM) if XArray node allocation failed. */ __SetPageLocked(new_page); __SetPageSwapBacked(new_page); - err = __add_to_swap_cache(new_page, entry); + err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); if (likely(!err)) { - radix_tree_preload_end(); - /* - * Initiate read into locked page and return. - */ + /* Initiate read into locked page */ + SetPageWorkingset(new_page); lru_cache_add_anon(new_page); *new_page_allocated = true; return new_page; } - radix_tree_preload_end(); __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely @@ -625,7 +587,7 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) return -ENOMEM; for (i = 0; i < nr; i++) { space = spaces + i; - INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN); + xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); atomic_set(&space->i_mmap_writable, 0); space->a_ops = &swap_aops; /* swap cache doesn't use writeback related tags */ diff --git a/mm/swapfile.c b/mm/swapfile.c index d954b71c4f9c..644f746e167a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -103,26 +103,39 @@ static inline unsigned char swap_count(unsigned char ent) return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ } +/* Reclaim the swap entry anyway if possible */ +#define TTRS_ANYWAY 0x1 +/* + * Reclaim the swap entry if there are no more mappings of the + * corresponding page + */ +#define TTRS_UNMAPPED 0x2 +/* Reclaim the swap entry if swap is getting full*/ +#define TTRS_FULL 0x4 + /* returns 1 if swap entry is freed */ -static int -__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) +static int __try_to_reclaim_swap(struct swap_info_struct *si, + unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); struct page *page; int ret = 0; - page = find_get_page(swap_address_space(entry), swp_offset(entry)); + page = find_get_page(swap_address_space(entry), offset); if (!page) return 0; /* - * This function is called from scan_swap_map() and it's called - * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. - * We have to use trylock for avoiding deadlock. This is a special + * When this function is called from scan_swap_map_slots() and it's + * called by vmscan.c at reclaiming pages. So, we hold a lock on a page, + * here. We have to use trylock for avoiding deadlock. This is a special * case and you should use try_to_free_swap() with explicit lock_page() * in usual operations. */ if (trylock_page(page)) { - ret = try_to_free_swap(page); + if ((flags & TTRS_ANYWAY) || + ((flags & TTRS_UNMAPPED) && !page_mapped(page)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(page))) + ret = try_to_free_swap(page); unlock_page(page); } put_page(page); @@ -780,7 +793,7 @@ checks: int swap_was_freed; unlock_cluster(ci); spin_unlock(&si->lock); - swap_was_freed = __try_to_reclaim_swap(si, offset); + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed) @@ -919,6 +932,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) struct swap_cluster_info *ci; ci = lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); cluster_set_count_flag(ci, 0, 0); free_cluster(si, idx); unlock_cluster(ci); @@ -989,7 +1003,7 @@ start_over: goto nextsi; } if (size == SWAPFILE_CLUSTER) { - if (!(si->flags & SWP_FILE)) + if (!(si->flags & SWP_FS)) n_ret = swap_alloc_cluster(si, swp_entries); } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, @@ -1169,6 +1183,8 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, ci = lock_cluster_or_swap_info(p, offset); usage = __swap_entry_free_locked(p, offset, usage); unlock_cluster_or_swap_info(p, ci); + if (!usage) + free_swap_slot(entry); return usage; } @@ -1199,10 +1215,8 @@ void swap_free(swp_entry_t entry) struct swap_info_struct *p; p = _swap_info_get(entry); - if (p) { - if (!__swap_entry_free(p, entry, 1)) - free_swap_slot(entry); - } + if (p) + __swap_entry_free(p, entry, 1); } /* @@ -1237,9 +1251,6 @@ void put_swap_page(struct page *page, swp_entry_t entry) if (free_entries == SWAPFILE_CLUSTER) { unlock_cluster_or_swap_info(si, ci); spin_lock(&si->lock); - ci = lock_cluster(si, offset); - memset(map, 0, SWAPFILE_CLUSTER); - unlock_cluster(ci); mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); swap_free_cluster(si, idx); spin_unlock(&si->lock); @@ -1612,7 +1623,6 @@ int try_to_free_swap(struct page *page) int free_swap_and_cache(swp_entry_t entry) { struct swap_info_struct *p; - struct page *page = NULL; unsigned char count; if (non_swap_entry(entry)) @@ -1622,30 +1632,9 @@ int free_swap_and_cache(swp_entry_t entry) if (p) { count = __swap_entry_free(p, entry, 1); if (count == SWAP_HAS_CACHE && - !swap_page_trans_huge_swapped(p, entry)) { - page = find_get_page(swap_address_space(entry), - swp_offset(entry)); - if (page && !trylock_page(page)) { - put_page(page); - page = NULL; - } - } else if (!count) - free_swap_slot(entry); - } - if (page) { - /* - * Not mapped elsewhere, or swap space full? Free it! - * Also recheck PageSwapCache now page is locked (above). - */ - if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || mem_cgroup_swap_full(page)) && - !swap_page_trans_huge_swapped(p, entry)) { - page = compound_head(page); - delete_from_swap_cache(page); - SetPageDirty(page); - } - unlock_page(page); - put_page(page); + !swap_page_trans_huge_swapped(p, entry)) + __try_to_reclaim_swap(p, swp_offset(entry), + TTRS_UNMAPPED | TTRS_FULL); } return p != NULL; } @@ -2310,12 +2299,13 @@ static void destroy_swap_extents(struct swap_info_struct *sis) kfree(se); } - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_ACTIVATED) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; - sis->flags &= ~SWP_FILE; - mapping->a_ops->swap_deactivate(swap_file); + sis->flags &= ~SWP_ACTIVATED; + if (mapping->a_ops->swap_deactivate) + mapping->a_ops->swap_deactivate(swap_file); } } @@ -2364,6 +2354,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, list_add_tail(&new_se->list, &sis->first_swap_extent.list); return 1; } +EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages @@ -2411,8 +2402,10 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (mapping->a_ops->swap_activate) { ret = mapping->a_ops->swap_activate(sis, swap_file, span); + if (ret >= 0) + sis->flags |= SWP_ACTIVATED; if (!ret) { - sis->flags |= SWP_FILE; + sis->flags |= SWP_FS; ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; } diff --git a/mm/truncate.c b/mm/truncate.c index 1d2fb2dca96f..45d68e90b703 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -33,15 +33,12 @@ static inline void __clear_shadow_entry(struct address_space *mapping, pgoff_t index, void *entry) { - struct radix_tree_node *node; - void **slot; + XA_STATE(xas, &mapping->i_pages, index); - if (!__radix_tree_lookup(&mapping->i_pages, index, &node, &slot)) + xas_set_update(&xas, workingset_update_node); + if (xas_load(&xas) != entry) return; - if (*slot != entry) - return; - __radix_tree_replace(&mapping->i_pages, node, slot, NULL, - workingset_update_node); + xas_store(&xas, NULL); mapping->nrexceptional--; } @@ -70,7 +67,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, return; for (j = 0; j < pagevec_count(pvec); j++) - if (radix_tree_exceptional_entry(pvec->pages[j])) + if (xa_is_value(pvec->pages[j])) break; if (j == pagevec_count(pvec)) @@ -85,7 +82,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, struct page *page = pvec->pages[i]; pgoff_t index = indices[i]; - if (!radix_tree_exceptional_entry(page)) { + if (!xa_is_value(page)) { pvec->pages[j++] = page; continue; } @@ -347,7 +344,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; if (!trylock_page(page)) @@ -442,7 +439,7 @@ void truncate_inode_pages_range(struct address_space *mapping, break; } - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; lock_page(page); @@ -561,7 +558,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (index > end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { invalidate_exceptional_entry(mapping, index, page); continue; @@ -692,7 +689,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (index > end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (!invalidate_exceptional_entry2(mapping, index, page)) ret = -EBUSY; @@ -738,10 +735,10 @@ int invalidate_inode_pages2_range(struct address_space *mapping, index++; } /* - * For DAX we invalidate page tables after invalidating radix tree. We + * For DAX we invalidate page tables after invalidating page cache. We * could invalidate page tables while invalidating each entry however * that would be expensive. And doing range unmapping before doesn't - * work as we have no cheap way to find whether radix tree entry didn't + * work as we have no cheap way to find whether page cache entry didn't * get remapped later. */ if (dax_mapping(mapping)) { diff --git a/mm/util.c b/mm/util.c index 9e3ebd2ef65f..8bf08b5b5760 100644 --- a/mm/util.c +++ b/mm/util.c @@ -15,17 +15,10 @@ #include #include -#include #include #include "internal.h" -static inline int is_kernel_rodata(unsigned long addr) -{ - return addr >= (unsigned long)__start_rodata && - addr < (unsigned long)__end_rodata; -} - /** * kfree_const - conditionally free memory * @x: pointer to the memory @@ -442,7 +435,7 @@ EXPORT_SYMBOL(kvmalloc_node); * It is slightly more efficient to use kfree() or vfree() if you are certain * that you know which one to use. * - * Context: Any context except NMI. + * Context: Either preemptible task context or not-NMI interrupt. */ void kvfree(const void *addr) { @@ -685,8 +678,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * Part of the kernel memory, which can be released * under memory pressure. */ - free += global_node_page_state( - NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT; + free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); /* * Leave reserved pages. The pages are not for anonymous pages. diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a728fc492557..97d4b25d0373 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1577,6 +1577,8 @@ void vfree_atomic(const void *addr) * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling * conventions for vfree() arch-depenedent would be a really bad idea) * + * May sleep if called *not* from interrupt context. + * * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) */ void vfree(const void *addr) @@ -1585,6 +1587,8 @@ void vfree(const void *addr) kmemleak_free(addr); + might_sleep_if(!in_interrupt()); + if (!addr) return; if (unlikely(in_interrupt())) diff --git a/mm/vmscan.c b/mm/vmscan.c index c7ce2c161225..62ac0c488624 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -473,9 +474,18 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); total_scan = nr; - delta = freeable >> priority; - delta *= 4; - do_div(delta, shrinker->seeks); + if (shrinker->seeks) { + delta = freeable >> priority; + delta *= 4; + do_div(delta, shrinker->seeks); + } else { + /* + * These objects don't require any IO to create. Trim + * them aggressively under memory pressure to keep + * them from causing refetches in the IO caches. + */ + delta = freeable / 2; + } /* * Make sure we apply some minimal pressure on default priority @@ -580,8 +590,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { struct memcg_shrinker_map *map; - unsigned long freed = 0; - int ret, i; + unsigned long ret, freed = 0; + int i; if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)) return 0; @@ -677,9 +687,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { + unsigned long ret, freed = 0; struct shrinker *shrinker; - unsigned long freed = 0; - int ret; if (!mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); @@ -742,12 +751,12 @@ static inline int is_page_cache_freeable(struct page *page) { /* * A freeable page cache page is referenced only by the caller - * that isolated the page, the page cache radix tree and - * optional buffer heads at page->private. + * that isolated the page, the page cache and optional buffer + * heads at page->private. */ - int radix_pins = PageTransHuge(page) && PageSwapCache(page) ? + int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ? HPAGE_PMD_NR : 1; - return page_count(page) - page_has_private(page) == 1 + radix_pins; + return page_count(page) - page_has_private(page) == 1 + page_cache_pins; } static int may_write_to_inode(struct inode *inode, struct scan_control *sc) @@ -923,7 +932,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); - __delete_from_swap_cache(page); + __delete_from_swap_cache(page, swap); xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); } else { @@ -2146,6 +2155,7 @@ static void shrink_active_list(unsigned long nr_to_scan, } ClearPageActive(page); /* we are de-activating */ + SetPageWorkingset(page); list_add(&page->lru, &l_inactive); } @@ -2457,9 +2467,11 @@ out: /* * Scan types proportional to swappiness and * their relative recent reclaim efficiency. + * Make sure we don't miss the last page + * because of a round-off error. */ - scan = div64_u64(scan * fraction[file], - denominator); + scan = DIV64_U64_ROUND_UP(scan * fraction[file], + denominator); break; case SCAN_FILE: case SCAN_ANON: @@ -3303,6 +3315,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, { struct zonelist *zonelist; unsigned long nr_reclaimed; + unsigned long pflags; int nid; unsigned int noreclaim_flag; struct scan_control sc = { @@ -3331,9 +3344,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx); + psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -3498,6 +3515,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + unsigned long pflags; struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -3508,6 +3526,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) .may_swap = 1, }; + psi_memstall_enter(&pflags); __fs_reclaim_acquire(); count_vm_event(PAGEOUTRUN); @@ -3609,6 +3628,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) out: snapshot_refaults(NULL, pgdat); __fs_reclaim_release(); + psi_memstall_leave(&pflags); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller diff --git a/mm/vmstat.c b/mm/vmstat.c index 8ba0870ecddd..6038ce593ce3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1143,8 +1143,10 @@ const char * const vmstat_text[] = { "nr_slab_unreclaimable", "nr_isolated_anon", "nr_isolated_file", + "workingset_nodes", "workingset_refault", "workingset_activate", + "workingset_restore", "workingset_nodereclaim", "nr_anon_pages", "nr_mapped", @@ -1161,7 +1163,7 @@ const char * const vmstat_text[] = { "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", - "", /* nr_indirectly_reclaimable */ + "nr_kernel_misc_reclaimable", /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -1275,6 +1277,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", +#else + "", /* nr_tlb_remote_flush */ + "", /* nr_tlb_remote_flush_received */ #endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", @@ -1283,7 +1288,6 @@ const char * const vmstat_text[] = { #ifdef CONFIG_DEBUG_VM_VMACACHE "vmacache_find_calls", "vmacache_find_hits", - "vmacache_full_flushes", #endif #ifdef CONFIG_SWAP "swap_ra", @@ -1661,6 +1665,8 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) stat_items_size += sizeof(struct vm_event_state); #endif + BUILD_BUG_ON(stat_items_size != + ARRAY_SIZE(vmstat_text) * sizeof(unsigned long)); v = kmalloc(stat_items_size, GFP_KERNEL); m->private = v; if (!v) @@ -1704,10 +1710,6 @@ static int vmstat_show(struct seq_file *m, void *arg) unsigned long *l = arg; unsigned long off = l - (unsigned long *)m->private; - /* Skip hidden vmstat items. */ - if (*vmstat_text[off] == '\0') - return 0; - seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); diff --git a/mm/workingset.c b/mm/workingset.c index 4516dd790129..d46f8c92aa2f 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -121,7 +121,7 @@ * the only thing eating into inactive list space is active pages. * * - * Activating refaulting pages + * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given @@ -134,6 +134,10 @@ * used less frequently than the refaulting page - or even not used at * all anymore. * + * That means if inactive cache is refaulting with a suitable refault + * distance, we assume the cache workingset is transitioning and put + * pressure on the current active list. + * * If this is wrong and demotion kicks in, the pages which are truly * used more frequently will be reactivated while the less frequently * used once will be evicted from memory. @@ -141,6 +145,14 @@ * But if this is right, the stale pages will be pushed out of memory * and the used pages get to stay in cache. * + * Refaulting active pages + * + * If on the other hand the refaulting pages have recently been + * deactivated, it means that the active list is no longer protecting + * actively used cache from reclaim. The cache is NOT transitioning to + * a different workingset; the existing workingset is thrashing in the + * space allocated to the page cache. + * * * Implementation * @@ -148,21 +160,20 @@ * and activations is maintained (node->inactive_age). * * On eviction, a snapshot of this counter (along with some bits to - * identify the node) is stored in the now empty page cache radix tree + * identify the node) is stored in the now empty page cache * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible * refault distance will immediately activate the refaulting page. */ -#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ - NODES_SHIFT + \ - MEM_CGROUP_ID_SHIFT) +#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ + 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* * Eviction timestamps need to be able to cover the full range of - * actionable refaults. However, bits are tight in the radix tree + * actionable refaults. However, bits are tight in the xarray * entry, and after storing the identifier for the lruvec there might * not be enough left to represent every single actionable refault. In * that case, we have to sacrifice granularity for distance, and group @@ -170,23 +181,27 @@ */ static unsigned int bucket_order __read_mostly; -static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) +static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) { eviction >>= bucket_order; + eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; - eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); + eviction = (eviction << 1) | workingset; - return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); + return xa_mk_value(eviction); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, - unsigned long *evictionp) + unsigned long *evictionp, bool *workingsetp) { - unsigned long entry = (unsigned long)shadow; + unsigned long entry = xa_to_value(shadow); int memcgid, nid; + bool workingset; - entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; + workingset = entry & 1; + entry >>= 1; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); @@ -195,6 +210,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry << bucket_order; + *workingsetp = workingset; } /** @@ -207,8 +223,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, */ void *workingset_eviction(struct address_space *mapping, struct page *page) { - struct mem_cgroup *memcg = page_memcg(page); struct pglist_data *pgdat = page_pgdat(page); + struct mem_cgroup *memcg = page_memcg(page); int memcgid = mem_cgroup_id(memcg); unsigned long eviction; struct lruvec *lruvec; @@ -220,30 +236,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) lruvec = mem_cgroup_lruvec(pgdat, memcg); eviction = atomic_long_inc_return(&lruvec->inactive_age); - return pack_shadow(memcgid, pgdat, eviction); + return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } /** * workingset_refault - evaluate the refault of a previously evicted page + * @page: the freshly allocated replacement page * @shadow: shadow entry of the evicted page * * Calculates and evaluates the refault distance of the previously * evicted page in the context of the node it was allocated in. - * - * Returns %true if the page should be activated, %false otherwise. */ -bool workingset_refault(void *shadow) +void workingset_refault(struct page *page, void *shadow) { unsigned long refault_distance; + struct pglist_data *pgdat; unsigned long active_file; struct mem_cgroup *memcg; unsigned long eviction; struct lruvec *lruvec; unsigned long refault; - struct pglist_data *pgdat; + bool workingset; int memcgid; - unpack_shadow(shadow, &memcgid, &pgdat, &eviction); + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); rcu_read_lock(); /* @@ -263,41 +279,51 @@ bool workingset_refault(void *shadow) * configurations instead. */ memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && !memcg) { - rcu_read_unlock(); - return false; - } + if (!mem_cgroup_disabled() && !memcg) + goto out; lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); /* - * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. + * Calculate the refault distance * - * There is a special case: usually, shadow entries have a - * short lifetime and are either refaulted or reclaimed along - * with the inode before they get too old. But it is not - * impossible for the inactive_age to lap a shadow entry in - * the field, which can then can result in a false small - * refault distance, leading to a false activation should this - * old entry actually refault again. However, earlier kernels - * used to deactivate unconditionally with *every* reclaim - * invocation for the longest time, so the occasional - * inappropriate activation leading to pressure on the active - * list is not a problem. + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. There is a + * special case: usually, shadow entries have a short lifetime + * and are either refaulted or reclaimed along with the inode + * before they get too old. But it is not impossible for the + * inactive_age to lap a shadow entry in the field, which can + * then result in a false small refault distance, leading to a + * false activation should this old entry actually refault + * again. However, earlier kernels used to deactivate + * unconditionally with *every* reclaim invocation for the + * longest time, so the occasional inappropriate activation + * leading to pressure on the active list is not a problem. */ refault_distance = (refault - eviction) & EVICTION_MASK; inc_lruvec_state(lruvec, WORKINGSET_REFAULT); - if (refault_distance <= active_file) { - inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); - rcu_read_unlock(); - return true; + /* + * Compare the distance to the existing workingset size. We + * don't act on pages that couldn't stay resident even if all + * the memory was available to the page cache. + */ + if (refault_distance > active_file) + goto out; + + SetPageActive(page); + atomic_long_inc(&lruvec->inactive_age); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); + + /* Page was active prior to eviction */ + if (workingset) { + SetPageWorkingset(page); + inc_lruvec_state(lruvec, WORKINGSET_RESTORE); } +out: rcu_read_unlock(); - return false; } /** @@ -340,7 +366,7 @@ out: static struct list_lru shadow_nodes; -void workingset_update_node(struct radix_tree_node *node) +void workingset_update_node(struct xa_node *node) { /* * Track non-empty nodes that contain only shadow entries; @@ -350,12 +376,20 @@ void workingset_update_node(struct radix_tree_node *node) * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ - if (node->count && node->count == node->exceptional) { - if (list_empty(&node->private_list)) + VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ + + if (node->count && node->count == node->nr_values) { + if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); + __inc_lruvec_page_state(virt_to_page(node), + WORKINGSET_NODES); + } } else { - if (!list_empty(&node->private_list)) + if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); + __dec_lruvec_page_state(virt_to_page(node), + WORKINGSET_NODES); + } } } @@ -364,12 +398,12 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, { unsigned long max_nodes; unsigned long nodes; - unsigned long cache; + unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); /* - * Approximate a reasonable limit for the radix tree nodes + * Approximate a reasonable limit for the nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. @@ -384,20 +418,26 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. * - * On 64-bit with 7 radix_tree_nodes per page and 64 slots + * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * - * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE + * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ +#ifdef CONFIG_MEMCG if (sc->memcg) { - cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL_FILE); - } else { - cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + - node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); - } - max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); + struct lruvec *lruvec; + + pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL); + lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); + pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); + pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); + } else +#endif + pages = node_present_pages(sc->nid); + + max_nodes = pages >> (XA_CHUNK_SHIFT - 3); if (!nodes) return SHRINK_EMPTY; @@ -410,11 +450,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, - void *arg) + void *arg) __must_hold(lru_lock) { + struct xa_node *node = container_of(item, struct xa_node, private_list); + XA_STATE(xas, node->array, 0); struct address_space *mapping; - struct radix_tree_node *node; - unsigned int i; int ret; /* @@ -422,15 +462,14 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * the shadow node LRU under the i_pages lock and the * lru_lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any - * address_space that has radix tree nodes on the LRU. + * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. */ - node = container_of(item, struct radix_tree_node, private_list); - mapping = container_of(node->root, struct address_space, i_pages); + mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { @@ -440,6 +479,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); + __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES); + spin_unlock(lru_lock); /* @@ -447,29 +488,21 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - if (WARN_ON_ONCE(!node->exceptional)) - goto out_invalid; - if (WARN_ON_ONCE(node->count != node->exceptional)) + if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - if (node->slots[i]) { - if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) - goto out_invalid; - if (WARN_ON_ONCE(!node->exceptional)) - goto out_invalid; - if (WARN_ON_ONCE(!mapping->nrexceptional)) - goto out_invalid; - node->slots[i] = NULL; - node->exceptional--; - node->count--; - mapping->nrexceptional--; - } - } - if (WARN_ON_ONCE(node->exceptional)) + if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; - inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); - __radix_tree_delete_node(&mapping->i_pages, node, - workingset_lookup_update(mapping)); + mapping->nrexceptional -= node->nr_values; + xas.xa_node = xa_parent_locked(&mapping->i_pages, node); + xas.xa_offset = node->offset; + xas.xa_shift = node->shift + XA_CHUNK_SHIFT; + xas_set_update(&xas, workingset_update_node); + /* + * We could store a shadow entry here which was the minimum of the + * shadow entries we were tracking ... + */ + xas_store(&xas, NULL); + __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); @@ -491,7 +524,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, static struct shrinker workingset_shadow_shrinker = { .count_objects = count_shadow_nodes, .scan_objects = scan_shadow_nodes, - .seeks = DEFAULT_SEEKS, + .seeks = 0, /* ->count reports only fully expendable nodes */ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, }; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9da65552e7ca..0787d33b80d8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -418,7 +418,7 @@ static void *zs_zpool_map(void *pool, unsigned long handle, case ZPOOL_MM_WO: zs_mm = ZS_MM_WO; break; - case ZPOOL_MM_RW: /* fallthru */ + case ZPOOL_MM_RW: /* fall through */ default: zs_mm = ZS_MM_RW; break; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 546af0e73ac3..ff720f1ebf73 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -756,8 +756,7 @@ static void vlan_dev_netpoll_cleanup(struct net_device *dev) return; vlan->netpoll = NULL; - - __netpoll_free_async(netpoll); + __netpoll_free(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ diff --git a/net/9p/Makefile b/net/9p/Makefile index c0486cfc85d9..aa0a5641e5d0 100644 --- a/net/9p/Makefile +++ b/net/9p/Makefile @@ -8,7 +8,6 @@ obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o mod.o \ client.o \ error.o \ - util.o \ protocol.o \ trans_fd.o \ trans_common.o \ diff --git a/net/9p/client.c b/net/9p/client.c index deae53a7dffc..2c9a17b9b46b 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -231,144 +231,170 @@ free_and_return: return ret; } -static struct p9_fcall *p9_fcall_alloc(int alloc_msize) +static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, + int alloc_msize) { - struct p9_fcall *fc; - fc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, GFP_NOFS); - if (!fc) - return NULL; + if (likely(c->fcall_cache) && alloc_msize == c->msize) { + fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); + fc->cache = c->fcall_cache; + } else { + fc->sdata = kmalloc(alloc_msize, GFP_NOFS); + fc->cache = NULL; + } + if (!fc->sdata) + return -ENOMEM; fc->capacity = alloc_msize; - fc->sdata = (char *) fc + sizeof(struct p9_fcall); - return fc; + return 0; +} + +void p9_fcall_fini(struct p9_fcall *fc) +{ + /* sdata can be NULL for interrupted requests in trans_rdma, + * and kmem_cache_free does not do NULL-check for us + */ + if (unlikely(!fc->sdata)) + return; + + if (fc->cache) + kmem_cache_free(fc->cache, fc->sdata); + else + kfree(fc->sdata); } +EXPORT_SYMBOL(p9_fcall_fini); + +static struct kmem_cache *p9_req_cache; /** - * p9_tag_alloc - lookup/allocate a request by tag - * @c: client session to lookup tag within - * @tag: numeric id for transaction - * - * this is a simple array lookup, but will grow the - * request_slots as necessary to accommodate transaction - * ids which did not previously have a slot. - * - * this code relies on the client spinlock to manage locks, its - * possible we should switch to something else, but I'd rather - * stick with something low-overhead for the common case. + * p9_req_alloc - Allocate a new request. + * @c: Client session. + * @type: Transaction type. + * @max_size: Maximum packet size for this request. * + * Context: Process context. + * Return: Pointer to new request. */ - static struct p9_req_t * -p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size) +p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size) { - unsigned long flags; - int row, col; - struct p9_req_t *req; + struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS); int alloc_msize = min(c->msize, max_size); + int tag; - /* This looks up the original request by tag so we know which - * buffer to read the data into */ - tag++; - - if (tag >= c->max_tag) { - spin_lock_irqsave(&c->lock, flags); - /* check again since original check was outside of lock */ - while (tag >= c->max_tag) { - row = (tag / P9_ROW_MAXTAG); - c->reqs[row] = kcalloc(P9_ROW_MAXTAG, - sizeof(struct p9_req_t), GFP_ATOMIC); - - if (!c->reqs[row]) { - pr_err("Couldn't grow tag array\n"); - spin_unlock_irqrestore(&c->lock, flags); - return ERR_PTR(-ENOMEM); - } - for (col = 0; col < P9_ROW_MAXTAG; col++) { - req = &c->reqs[row][col]; - req->status = REQ_STATUS_IDLE; - init_waitqueue_head(&req->wq); - } - c->max_tag += P9_ROW_MAXTAG; - } - spin_unlock_irqrestore(&c->lock, flags); - } - row = tag / P9_ROW_MAXTAG; - col = tag % P9_ROW_MAXTAG; - - req = &c->reqs[row][col]; - if (!req->tc) - req->tc = p9_fcall_alloc(alloc_msize); - if (!req->rc) - req->rc = p9_fcall_alloc(alloc_msize); - if (!req->tc || !req->rc) - goto grow_failed; + if (!req) + return ERR_PTR(-ENOMEM); - p9pdu_reset(req->tc); - p9pdu_reset(req->rc); + if (p9_fcall_init(c, &req->tc, alloc_msize)) + goto free_req; + if (p9_fcall_init(c, &req->rc, alloc_msize)) + goto free; - req->tc->tag = tag-1; + p9pdu_reset(&req->tc); + p9pdu_reset(&req->rc); req->status = REQ_STATUS_ALLOC; + init_waitqueue_head(&req->wq); + INIT_LIST_HEAD(&req->req_list); + + idr_preload(GFP_NOFS); + spin_lock_irq(&c->lock); + if (type == P9_TVERSION) + tag = idr_alloc(&c->reqs, req, P9_NOTAG, P9_NOTAG + 1, + GFP_NOWAIT); + else + tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT); + req->tc.tag = tag; + spin_unlock_irq(&c->lock); + idr_preload_end(); + if (tag < 0) + goto free; + + /* Init ref to two because in the general case there is one ref + * that is put asynchronously by a writer thread, one ref + * temporarily given by p9_tag_lookup and put by p9_client_cb + * in the recv thread, and one ref put by p9_tag_remove in the + * main thread. The only exception is virtio that does not use + * p9_tag_lookup but does not have a writer thread either + * (the write happens synchronously in the request/zc_request + * callback), so p9_client_cb eats the second ref there + * as the pointer is duplicated directly by virtqueue_add_sgs() + */ + refcount_set(&req->refcount.refcount, 2); return req; -grow_failed: - pr_err("Couldn't grow tag array\n"); - kfree(req->tc); - kfree(req->rc); - req->tc = req->rc = NULL; +free: + p9_fcall_fini(&req->tc); + p9_fcall_fini(&req->rc); +free_req: + kmem_cache_free(p9_req_cache, req); return ERR_PTR(-ENOMEM); } /** - * p9_tag_lookup - lookup a request by tag - * @c: client session to lookup tag within - * @tag: numeric id for transaction + * p9_tag_lookup - Look up a request by tag. + * @c: Client session. + * @tag: Transaction ID. * + * Context: Any context. + * Return: A request, or %NULL if there is no request with that tag. */ - struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag) { - int row, col; - - /* This looks up the original request by tag so we know which - * buffer to read the data into */ - tag++; - - if (tag >= c->max_tag) - return NULL; + struct p9_req_t *req; - row = tag / P9_ROW_MAXTAG; - col = tag % P9_ROW_MAXTAG; + rcu_read_lock(); +again: + req = idr_find(&c->reqs, tag); + if (req) { + /* We have to be careful with the req found under rcu_read_lock + * Thanks to SLAB_TYPESAFE_BY_RCU we can safely try to get the + * ref again without corrupting other data, then check again + * that the tag matches once we have the ref + */ + if (!p9_req_try_get(req)) + goto again; + if (req->tc.tag != tag) { + p9_req_put(req); + goto again; + } + } + rcu_read_unlock(); - return &c->reqs[row][col]; + return req; } EXPORT_SYMBOL(p9_tag_lookup); /** - * p9_tag_init - setup tags structure and contents - * @c: v9fs client struct - * - * This initializes the tags structure for each client instance. + * p9_tag_remove - Remove a tag. + * @c: Client session. + * @r: Request of reference. * + * Context: Any context. */ +static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r) +{ + unsigned long flags; + u16 tag = r->tc.tag; + + p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag); + spin_lock_irqsave(&c->lock, flags); + idr_remove(&c->reqs, tag); + spin_unlock_irqrestore(&c->lock, flags); + return p9_req_put(r); +} -static int p9_tag_init(struct p9_client *c) +static void p9_req_free(struct kref *ref) { - int err = 0; + struct p9_req_t *r = container_of(ref, struct p9_req_t, refcount); + p9_fcall_fini(&r->tc); + p9_fcall_fini(&r->rc); + kmem_cache_free(p9_req_cache, r); +} - c->tagpool = p9_idpool_create(); - if (IS_ERR(c->tagpool)) { - err = PTR_ERR(c->tagpool); - goto error; - } - err = p9_idpool_get(c->tagpool); /* reserve tag 0 */ - if (err < 0) { - p9_idpool_destroy(c->tagpool); - goto error; - } - c->max_tag = 0; -error: - return err; +int p9_req_put(struct p9_req_t *r) +{ + return kref_put(&r->refcount, p9_req_free); } +EXPORT_SYMBOL(p9_req_put); /** * p9_tag_cleanup - cleans up tags structure and reclaims resources @@ -379,52 +405,17 @@ error: */ static void p9_tag_cleanup(struct p9_client *c) { - int row, col; - - /* check to insure all requests are idle */ - for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) { - for (col = 0; col < P9_ROW_MAXTAG; col++) { - if (c->reqs[row][col].status != REQ_STATUS_IDLE) { - p9_debug(P9_DEBUG_MUX, - "Attempting to cleanup non-free tag %d,%d\n", - row, col); - /* TODO: delay execution of cleanup */ - return; - } - } - } - - if (c->tagpool) { - p9_idpool_put(0, c->tagpool); /* free reserved tag 0 */ - p9_idpool_destroy(c->tagpool); - } + struct p9_req_t *req; + int id; - /* free requests associated with tags */ - for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) { - for (col = 0; col < P9_ROW_MAXTAG; col++) { - kfree(c->reqs[row][col].tc); - kfree(c->reqs[row][col].rc); - } - kfree(c->reqs[row]); + rcu_read_lock(); + idr_for_each_entry(&c->reqs, req, id) { + pr_info("Tag %d still in use\n", id); + if (p9_tag_remove(c, req) == 0) + pr_warn("Packet with tag %d has still references", + req->tc.tag); } - c->max_tag = 0; -} - -/** - * p9_free_req - free a request and clean-up as necessary - * c: client state - * r: request to release - * - */ - -static void p9_free_req(struct p9_client *c, struct p9_req_t *r) -{ - int tag = r->tc->tag; - p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag); - - r->status = REQ_STATUS_IDLE; - if (tag != P9_NOTAG && p9_idpool_check(tag, c->tagpool)) - p9_idpool_put(tag, c->tagpool); + rcu_read_unlock(); } /** @@ -435,7 +426,7 @@ static void p9_free_req(struct p9_client *c, struct p9_req_t *r) */ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) { - p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc->tag); + p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc.tag); /* * This barrier is needed to make sure any change made to req before @@ -445,7 +436,8 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) req->status = status; wake_up(&req->wq); - p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag); + p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag); + p9_req_put(req); } EXPORT_SYMBOL(p9_client_cb); @@ -516,18 +508,18 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) int err; int ecode; - err = p9_parse_header(req->rc, NULL, &type, NULL, 0); - if (req->rc->size >= c->msize) { + err = p9_parse_header(&req->rc, NULL, &type, NULL, 0); + if (req->rc.size >= c->msize) { p9_debug(P9_DEBUG_ERROR, "requested packet size too big: %d\n", - req->rc->size); + req->rc.size); return -EIO; } /* * dump the response from server * This should be after check errors which poplulate pdu_fcall. */ - trace_9p_protocol_dump(c, req->rc); + trace_9p_protocol_dump(c, &req->rc); if (err) { p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); return err; @@ -537,7 +529,7 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) if (!p9_is_proto_dotl(c)) { char *ename; - err = p9pdu_readf(req->rc, c->proto_version, "s?d", + err = p9pdu_readf(&req->rc, c->proto_version, "s?d", &ename, &ecode); if (err) goto out_err; @@ -553,7 +545,7 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) } kfree(ename); } else { - err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode); + err = p9pdu_readf(&req->rc, c->proto_version, "d", &ecode); err = -ecode; p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); @@ -587,12 +579,12 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, int8_t type; char *ename = NULL; - err = p9_parse_header(req->rc, NULL, &type, NULL, 0); + err = p9_parse_header(&req->rc, NULL, &type, NULL, 0); /* * dump the response from server * This should be after parse_header which poplulate pdu_fcall. */ - trace_9p_protocol_dump(c, req->rc); + trace_9p_protocol_dump(c, &req->rc); if (err) { p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); return err; @@ -607,13 +599,13 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, /* 7 = header size for RERROR; */ int inline_len = in_hdrlen - 7; - len = req->rc->size - req->rc->offset; + len = req->rc.size - req->rc.offset; if (len > (P9_ZC_HDR_SZ - 7)) { err = -EFAULT; goto out_err; } - ename = &req->rc->sdata[req->rc->offset]; + ename = &req->rc.sdata[req->rc.offset]; if (len > inline_len) { /* We have error in external buffer */ if (!copy_from_iter_full(ename + inline_len, @@ -623,7 +615,7 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, } } ename = NULL; - err = p9pdu_readf(req->rc, c->proto_version, "s?d", + err = p9pdu_readf(&req->rc, c->proto_version, "s?d", &ename, &ecode); if (err) goto out_err; @@ -639,7 +631,7 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, } kfree(ename); } else { - err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode); + err = p9pdu_readf(&req->rc, c->proto_version, "d", &ecode); err = -ecode; p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); @@ -672,7 +664,7 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) int16_t oldtag; int err; - err = p9_parse_header(oldreq->tc, NULL, NULL, &oldtag, 1); + err = p9_parse_header(&oldreq->tc, NULL, NULL, &oldtag, 1); if (err) return err; @@ -686,11 +678,12 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) * if we haven't received a response for oldreq, * remove it from the list */ - if (oldreq->status == REQ_STATUS_SENT) + if (oldreq->status == REQ_STATUS_SENT) { if (c->trans_mod->cancelled) c->trans_mod->cancelled(c, oldreq); + } - p9_free_req(c, req); + p9_tag_remove(c, req); return 0; } @@ -698,7 +691,7 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, int8_t type, int req_size, const char *fmt, va_list ap) { - int tag, err; + int err; struct p9_req_t *req; p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type); @@ -711,27 +704,22 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, if ((c->status == BeginDisconnect) && (type != P9_TCLUNK)) return ERR_PTR(-EIO); - tag = P9_NOTAG; - if (type != P9_TVERSION) { - tag = p9_idpool_get(c->tagpool); - if (tag < 0) - return ERR_PTR(-ENOMEM); - } - - req = p9_tag_alloc(c, tag, req_size); + req = p9_tag_alloc(c, type, req_size); if (IS_ERR(req)) return req; /* marshall the data */ - p9pdu_prepare(req->tc, tag, type); - err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap); + p9pdu_prepare(&req->tc, req->tc.tag, type); + err = p9pdu_vwritef(&req->tc, c->proto_version, fmt, ap); if (err) goto reterr; - p9pdu_finalize(c, req->tc); - trace_9p_client_req(c, type, tag); + p9pdu_finalize(c, &req->tc); + trace_9p_client_req(c, type, req->tc.tag); return req; reterr: - p9_free_req(c, req); + p9_tag_remove(c, req); + /* We have to put also the 2nd reference as it won't be used */ + p9_req_put(req); return ERR_PTR(err); } @@ -741,7 +729,7 @@ reterr: * @type: type of request * @fmt: protocol format string (see protocol.c) * - * Returns request structure (which client must free using p9_free_req) + * Returns request structure (which client must free using p9_tag_remove) */ static struct p9_req_t * @@ -766,6 +754,8 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) err = c->trans_mod->request(c, req); if (err < 0) { + /* write won't happen */ + p9_req_put(req); if (err != -ERESTARTSYS && err != -EFAULT) c->status = Disconnected; goto recalc_sigpending; @@ -813,11 +803,11 @@ recalc_sigpending: goto reterr; err = p9_check_errors(c, req); - trace_9p_client_res(c, type, req->rc->tag, err); + trace_9p_client_res(c, type, req->rc.tag, err); if (!err) return req; reterr: - p9_free_req(c, req); + p9_tag_remove(c, req); return ERR_PTR(safe_errno(err)); } @@ -832,7 +822,7 @@ reterr: * @hdrlen: reader header size, This is the size of response protocol data * @fmt: protocol format string (see protocol.c) * - * Returns request structure (which client must free using p9_free_req) + * Returns request structure (which client must free using p9_tag_remove) */ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, struct iov_iter *uidata, @@ -895,11 +885,11 @@ recalc_sigpending: goto reterr; err = p9_check_zc_errors(c, req, uidata, in_hdrlen); - trace_9p_client_res(c, type, req->rc->tag, err); + trace_9p_client_res(c, type, req->rc.tag, err); if (!err) return req; reterr: - p9_free_req(c, req); + p9_tag_remove(c, req); return ERR_PTR(safe_errno(err)); } @@ -978,10 +968,10 @@ static int p9_client_version(struct p9_client *c) if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, c->proto_version, "ds", &msize, &version); + err = p9pdu_readf(&req->rc, c->proto_version, "ds", &msize, &version); if (err) { p9_debug(P9_DEBUG_9P, "version error %d\n", err); - trace_9p_protocol_dump(c, req->rc); + trace_9p_protocol_dump(c, &req->rc); goto error; } @@ -1002,7 +992,7 @@ static int p9_client_version(struct p9_client *c) error: kfree(version); - p9_free_req(c, req); + p9_tag_remove(c, req); return err; } @@ -1020,20 +1010,18 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) clnt->trans_mod = NULL; clnt->trans = NULL; + clnt->fcall_cache = NULL; client_id = utsname()->nodename; memcpy(clnt->name, client_id, strlen(client_id) + 1); spin_lock_init(&clnt->lock); idr_init(&clnt->fids); - - err = p9_tag_init(clnt); - if (err < 0) - goto free_client; + idr_init(&clnt->reqs); err = parse_opts(options, clnt); if (err < 0) - goto destroy_tagpool; + goto free_client; if (!clnt->trans_mod) clnt->trans_mod = v9fs_get_default_trans(); @@ -1042,7 +1030,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) err = -EPROTONOSUPPORT; p9_debug(P9_DEBUG_ERROR, "No transport defined or default transport\n"); - goto destroy_tagpool; + goto free_client; } p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n", @@ -1059,14 +1047,21 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) if (err) goto close_trans; + /* P9_HDRSZ + 4 is the smallest packet header we can have that is + * followed by data accessed from userspace by read + */ + clnt->fcall_cache = + kmem_cache_create_usercopy("9p-fcall-cache", clnt->msize, + 0, 0, P9_HDRSZ + 4, + clnt->msize - (P9_HDRSZ + 4), + NULL); + return clnt; close_trans: clnt->trans_mod->close(clnt); put_trans: v9fs_put_trans(clnt->trans_mod); -destroy_tagpool: - p9_idpool_destroy(clnt->tagpool); free_client: kfree(clnt); return ERR_PTR(err); @@ -1092,6 +1087,7 @@ void p9_client_destroy(struct p9_client *clnt) p9_tag_cleanup(clnt); + kmem_cache_destroy(clnt->fcall_cache); kfree(clnt); } EXPORT_SYMBOL(p9_client_destroy); @@ -1135,10 +1131,10 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", &qid); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto error; } @@ -1147,7 +1143,7 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, memmove(&fid->qid, &qid, sizeof(struct p9_qid)); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return fid; error: @@ -1192,13 +1188,13 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "R", &nwqids, &wqids); + err = p9pdu_readf(&req->rc, clnt->proto_version, "R", &nwqids, &wqids); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto clunk_fid; } - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); p9_debug(P9_DEBUG_9P, "<<< RWALK nwqid %d:\n", nwqids); @@ -1259,9 +1255,9 @@ int p9_client_open(struct p9_fid *fid, int mode) goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } @@ -1273,7 +1269,7 @@ int p9_client_open(struct p9_fid *fid, int mode) fid->iounit = iounit; free_and_error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1303,9 +1299,9 @@ int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", qid, &iounit); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", qid, &iounit); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } @@ -1318,7 +1314,7 @@ int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 ofid->iounit = iounit; free_and_error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1348,9 +1344,9 @@ int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode, goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } @@ -1363,7 +1359,7 @@ int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode, fid->iounit = iounit; free_and_error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1387,9 +1383,9 @@ int p9_client_symlink(struct p9_fid *dfid, const char *name, goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } @@ -1397,7 +1393,7 @@ int p9_client_symlink(struct p9_fid *dfid, const char *name, qid->type, (unsigned long long)qid->path, qid->version); free_and_error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1417,7 +1413,7 @@ int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, const char *newna return PTR_ERR(req); p9_debug(P9_DEBUG_9P, "<<< RLINK\n"); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return 0; } EXPORT_SYMBOL(p9_client_link); @@ -1441,7 +1437,7 @@ int p9_client_fsync(struct p9_fid *fid, int datasync) p9_debug(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; @@ -1476,7 +1472,7 @@ again: p9_debug(P9_DEBUG_9P, "<<< RCLUNK fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: /* * Fid is not valid even after a failed clunk @@ -1510,7 +1506,7 @@ int p9_client_remove(struct p9_fid *fid) p9_debug(P9_DEBUG_9P, "<<< RREMOVE fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: if (err == -ERESTARTSYS) p9_client_clunk(fid); @@ -1537,7 +1533,7 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags) } p9_debug(P9_DEBUG_9P, "<<< RUNLINKAT fid %d %s\n", dfid->fid, name); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1585,11 +1581,11 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) break; } - *err = p9pdu_readf(req->rc, clnt->proto_version, + *err = p9pdu_readf(&req->rc, clnt->proto_version, "D", &count, &dataptr); if (*err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); break; } if (rsize < count) { @@ -1599,7 +1595,7 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); if (!count) { - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); break; } @@ -1609,7 +1605,7 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) offset += n; if (n != count) { *err = -EFAULT; - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); break; } } else { @@ -1617,7 +1613,7 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) total += count; offset += count; } - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); } return total; } @@ -1658,10 +1654,10 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) break; } - *err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count); + *err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &count); if (*err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); break; } if (rsize < count) { @@ -1671,7 +1667,7 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); iov_iter_advance(from, count); total += count; offset += count; @@ -1702,10 +1698,10 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "wS", &ignored, ret); + err = p9pdu_readf(&req->rc, clnt->proto_version, "wS", &ignored, ret); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto error; } @@ -1722,7 +1718,7 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) from_kgid(&init_user_ns, ret->n_gid), from_kuid(&init_user_ns, ret->n_muid)); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return ret; error: @@ -1755,10 +1751,10 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "A", ret); + err = p9pdu_readf(&req->rc, clnt->proto_version, "A", ret); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto error; } @@ -1783,7 +1779,7 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, ret->st_ctime_nsec, ret->st_btime_sec, ret->st_btime_nsec, ret->st_gen, ret->st_data_version); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return ret; error: @@ -1852,7 +1848,7 @@ int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) p9_debug(P9_DEBUG_9P, "<<< RWSTAT fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1884,7 +1880,7 @@ int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr) goto error; } p9_debug(P9_DEBUG_9P, "<<< RSETATTR fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1907,12 +1903,12 @@ int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb) goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type, - &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail, - &sb->files, &sb->ffree, &sb->fsid, &sb->namelen); + err = p9pdu_readf(&req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type, + &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail, + &sb->files, &sb->ffree, &sb->fsid, &sb->namelen); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto error; } @@ -1923,7 +1919,7 @@ int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb) sb->blocks, sb->bfree, sb->bavail, sb->files, sb->ffree, sb->fsid, (long int)sb->namelen); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1951,7 +1947,7 @@ int p9_client_rename(struct p9_fid *fid, p9_debug(P9_DEBUG_9P, "<<< RRENAME fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1981,7 +1977,7 @@ int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name, p9_debug(P9_DEBUG_9P, "<<< RRENAMEAT newdirfid %d new name %s\n", newdirfid->fid, new_name); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -2015,13 +2011,13 @@ struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid, err = PTR_ERR(req); goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "q", attr_size); + err = p9pdu_readf(&req->rc, clnt->proto_version, "q", attr_size); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto clunk_fid; } - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); p9_debug(P9_DEBUG_9P, "<<< RXATTRWALK fid %d size %llu\n", attr_fid->fid, *attr_size); return attr_fid; @@ -2055,7 +2051,7 @@ int p9_client_xattrcreate(struct p9_fid *fid, const char *name, goto error; } p9_debug(P9_DEBUG_9P, "<<< RXATTRCREATE fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -2070,7 +2066,7 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) struct kvec kv = {.iov_base = data, .iov_len = count}; struct iov_iter to; - iov_iter_kvec(&to, READ | ITER_KVEC, &kv, 1, count); + iov_iter_kvec(&to, READ, &kv, 1, count); p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n", fid->fid, (unsigned long long) offset, count); @@ -2103,9 +2099,9 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr); + err = p9pdu_readf(&req->rc, clnt->proto_version, "D", &count, &dataptr); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } if (rsize < count) { @@ -2118,11 +2114,11 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) if (non_zc) memmove(data, dataptr, count); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return count; free_and_error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -2144,16 +2140,16 @@ int p9_client_mknod_dotl(struct p9_fid *fid, const char *name, int mode, if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n", qid->type, (unsigned long long)qid->path, qid->version); error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return err; } @@ -2175,16 +2171,16 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode, if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type, (unsigned long long)qid->path, qid->version); error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return err; } @@ -2210,14 +2206,14 @@ int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status) if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, clnt->proto_version, "b", status); + err = p9pdu_readf(&req->rc, clnt->proto_version, "b", status); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status); error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return err; } @@ -2241,18 +2237,18 @@ int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock) if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, clnt->proto_version, "bqqds", &glock->type, - &glock->start, &glock->length, &glock->proc_id, - &glock->client_id); + err = p9pdu_readf(&req->rc, clnt->proto_version, "bqqds", &glock->type, + &glock->start, &glock->length, &glock->proc_id, + &glock->client_id); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld " "proc_id %d client_id %s\n", glock->type, glock->start, glock->length, glock->proc_id, glock->client_id); error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return err; } EXPORT_SYMBOL(p9_client_getlock_dotl); @@ -2271,14 +2267,25 @@ int p9_client_readlink(struct p9_fid *fid, char **target) if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, clnt->proto_version, "s", target); + err = p9pdu_readf(&req->rc, clnt->proto_version, "s", target); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target); error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return err; } EXPORT_SYMBOL(p9_client_readlink); + +int __init p9_client_init(void) +{ + p9_req_cache = KMEM_CACHE(p9_req_t, SLAB_TYPESAFE_BY_RCU); + return p9_req_cache ? 0 : -ENOMEM; +} + +void __exit p9_client_exit(void) +{ + kmem_cache_destroy(p9_req_cache); +} diff --git a/net/9p/mod.c b/net/9p/mod.c index 253ba824a325..0da56d6af73b 100644 --- a/net/9p/mod.c +++ b/net/9p/mod.c @@ -171,11 +171,17 @@ void v9fs_put_trans(struct p9_trans_module *m) */ static int __init init_p9(void) { + int ret; + + ret = p9_client_init(); + if (ret) + return ret; + p9_error_init(); pr_info("Installing 9P2000 support\n"); p9_trans_fd_init(); - return 0; + return ret; } /** @@ -188,6 +194,7 @@ static void __exit exit_p9(void) pr_info("Unloading 9P2000 support\n"); p9_trans_fd_exit(); + p9_client_exit(); } module_init(init_p9) diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 4a1e1dd30b52..462ba144cb39 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -46,10 +46,15 @@ p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); void p9stat_free(struct p9_wstat *stbuf) { kfree(stbuf->name); + stbuf->name = NULL; kfree(stbuf->uid); + stbuf->uid = NULL; kfree(stbuf->gid); + stbuf->gid = NULL; kfree(stbuf->muid); + stbuf->muid = NULL; kfree(stbuf->extension); + stbuf->extension = NULL; } EXPORT_SYMBOL(p9stat_free); @@ -566,9 +571,10 @@ int p9stat_read(struct p9_client *clnt, char *buf, int len, struct p9_wstat *st) if (ret) { p9_debug(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret); trace_9p_protocol_dump(clnt, &fake_pdu); + return ret; } - return ret; + return fake_pdu.offset; } EXPORT_SYMBOL(p9stat_read); @@ -617,13 +623,19 @@ int p9dirent_read(struct p9_client *clnt, char *buf, int len, if (ret) { p9_debug(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret); trace_9p_protocol_dump(clnt, &fake_pdu); - goto out; + return ret; } - strcpy(dirent->d_name, nameptr); + ret = strscpy(dirent->d_name, nameptr, sizeof(dirent->d_name)); + if (ret < 0) { + p9_debug(P9_DEBUG_ERROR, + "On the wire dirent name too long: %s\n", + nameptr); + kfree(nameptr); + return ret; + } kfree(nameptr); -out: return fake_pdu.offset; } EXPORT_SYMBOL(p9dirent_read); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index e2ef3c782c53..f868cf6fba79 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -131,7 +131,8 @@ struct p9_conn { int err; struct list_head req_list; struct list_head unsent_req_list; - struct p9_req_t *req; + struct p9_req_t *rreq; + struct p9_req_t *wreq; char tmp_buf[7]; struct p9_fcall rc; int wpos; @@ -291,7 +292,6 @@ static void p9_read_work(struct work_struct *work) __poll_t n; int err; struct p9_conn *m; - int status = REQ_STATUS_ERROR; m = container_of(work, struct p9_conn, rq); @@ -322,7 +322,7 @@ static void p9_read_work(struct work_struct *work) m->rc.offset += err; /* header read in */ - if ((!m->req) && (m->rc.offset == m->rc.capacity)) { + if ((!m->rreq) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new header\n"); /* Header size */ @@ -346,23 +346,23 @@ static void p9_read_work(struct work_struct *work) "mux %p pkt: size: %d bytes tag: %d\n", m, m->rc.size, m->rc.tag); - m->req = p9_tag_lookup(m->client, m->rc.tag); - if (!m->req || (m->req->status != REQ_STATUS_SENT)) { + m->rreq = p9_tag_lookup(m->client, m->rc.tag); + if (!m->rreq || (m->rreq->status != REQ_STATUS_SENT)) { p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", m->rc.tag); err = -EIO; goto error; } - if (m->req->rc == NULL) { + if (!m->rreq->rc.sdata) { p9_debug(P9_DEBUG_ERROR, "No recv fcall for tag %d (req %p), disconnecting!\n", - m->rc.tag, m->req); - m->req = NULL; + m->rc.tag, m->rreq); + m->rreq = NULL; err = -EIO; goto error; } - m->rc.sdata = (char *)m->req->rc + sizeof(struct p9_fcall); + m->rc.sdata = m->rreq->rc.sdata; memcpy(m->rc.sdata, m->tmp_buf, m->rc.capacity); m->rc.capacity = m->rc.size; } @@ -370,20 +370,27 @@ static void p9_read_work(struct work_struct *work) /* packet is read in * not an else because some packets (like clunk) have no payload */ - if ((m->req) && (m->rc.offset == m->rc.capacity)) { + if ((m->rreq) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new packet\n"); - m->req->rc->size = m->rc.offset; + m->rreq->rc.size = m->rc.offset; spin_lock(&m->client->lock); - if (m->req->status != REQ_STATUS_ERROR) - status = REQ_STATUS_RCVD; - list_del(&m->req->req_list); - /* update req->status while holding client->lock */ - p9_client_cb(m->client, m->req, status); + if (m->rreq->status == REQ_STATUS_SENT) { + list_del(&m->rreq->req_list); + p9_client_cb(m->client, m->rreq, REQ_STATUS_RCVD); + } else { + spin_unlock(&m->client->lock); + p9_debug(P9_DEBUG_ERROR, + "Request tag %d errored out while we were reading the reply\n", + m->rc.tag); + err = -EIO; + goto error; + } spin_unlock(&m->client->lock); m->rc.sdata = NULL; m->rc.offset = 0; m->rc.capacity = 0; - m->req = NULL; + p9_req_put(m->rreq); + m->rreq = NULL; } end_clear: @@ -469,9 +476,11 @@ static void p9_write_work(struct work_struct *work) p9_debug(P9_DEBUG_TRANS, "move req %p\n", req); list_move_tail(&req->req_list, &m->req_list); - m->wbuf = req->tc->sdata; - m->wsize = req->tc->size; + m->wbuf = req->tc.sdata; + m->wsize = req->tc.size; m->wpos = 0; + p9_req_get(req); + m->wreq = req; spin_unlock(&m->client->lock); } @@ -492,8 +501,11 @@ static void p9_write_work(struct work_struct *work) } m->wpos += err; - if (m->wpos == m->wsize) + if (m->wpos == m->wsize) { m->wpos = m->wsize = 0; + p9_req_put(m->wreq); + m->wreq = NULL; + } end_clear: clear_bit(Wworksched, &m->wsched); @@ -663,7 +675,7 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) struct p9_conn *m = &ts->conn; p9_debug(P9_DEBUG_TRANS, "mux %p task %p tcall %p id %d\n", - m, current, req->tc, req->tc->id); + m, current, &req->tc, req->tc.id); if (m->err < 0) return m->err; @@ -694,6 +706,7 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) if (req->status == REQ_STATUS_UNSENT) { list_del(&req->req_list); req->status = REQ_STATUS_FLSHD; + p9_req_put(req); ret = 0; } spin_unlock(&client->lock); @@ -711,6 +724,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) spin_lock(&client->lock); list_del(&req->req_list); spin_unlock(&client->lock); + p9_req_put(req); return 0; } @@ -862,7 +876,15 @@ static void p9_conn_destroy(struct p9_conn *m) p9_mux_poll_stop(m); cancel_work_sync(&m->rq); + if (m->rreq) { + p9_req_put(m->rreq); + m->rreq = NULL; + } cancel_work_sync(&m->wq); + if (m->wreq) { + p9_req_put(m->wreq); + m->wreq = NULL; + } p9_conn_cancel(m, -ECONNRESET); diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index b513cffeeb3c..119103bfa82e 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -122,7 +122,7 @@ struct p9_rdma_context { dma_addr_t busa; union { struct p9_req_t *req; - struct p9_fcall *rc; + struct p9_fcall rc; }; }; @@ -274,8 +274,7 @@ p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DISCONNECTED: if (rdma) rdma->state = P9_RDMA_CLOSED; - if (c) - c->status = Disconnected; + c->status = Disconnected; break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: @@ -320,8 +319,8 @@ recv_done(struct ib_cq *cq, struct ib_wc *wc) if (wc->status != IB_WC_SUCCESS) goto err_out; - c->rc->size = wc->byte_len; - err = p9_parse_header(c->rc, NULL, NULL, &tag, 1); + c->rc.size = wc->byte_len; + err = p9_parse_header(&c->rc, NULL, NULL, &tag, 1); if (err) goto err_out; @@ -331,12 +330,13 @@ recv_done(struct ib_cq *cq, struct ib_wc *wc) /* Check that we have not yet received a reply for this request. */ - if (unlikely(req->rc)) { + if (unlikely(req->rc.sdata)) { pr_err("Duplicate reply for request %d", tag); goto err_out; } - req->rc = c->rc; + req->rc.size = c->rc.size; + req->rc.sdata = c->rc.sdata; p9_client_cb(client, req, REQ_STATUS_RCVD); out: @@ -361,9 +361,10 @@ send_done(struct ib_cq *cq, struct ib_wc *wc) container_of(wc->wr_cqe, struct p9_rdma_context, cqe); ib_dma_unmap_single(rdma->cm_id->device, - c->busa, c->req->tc->size, + c->busa, c->req->tc.size, DMA_TO_DEVICE); up(&rdma->sq_sem); + p9_req_put(c->req); kfree(c); } @@ -401,7 +402,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c) struct ib_sge sge; c->busa = ib_dma_map_single(rdma->cm_id->device, - c->rc->sdata, client->msize, + c->rc.sdata, client->msize, DMA_FROM_DEVICE); if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) goto error; @@ -443,9 +444,9 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) **/ if (unlikely(atomic_read(&rdma->excess_rc) > 0)) { if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) { - /* Got one ! */ - kfree(req->rc); - req->rc = NULL; + /* Got one! */ + p9_fcall_fini(&req->rc); + req->rc.sdata = NULL; goto dont_need_post_recv; } else { /* We raced and lost. */ @@ -459,7 +460,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) err = -ENOMEM; goto recv_error; } - rpl_context->rc = req->rc; + rpl_context->rc.sdata = req->rc.sdata; /* * Post a receive buffer for this request. We need to ensure @@ -475,11 +476,11 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) err = post_recv(client, rpl_context); if (err) { - p9_debug(P9_DEBUG_FCALL, "POST RECV failed\n"); + p9_debug(P9_DEBUG_ERROR, "POST RECV failed: %d\n", err); goto recv_error; } /* remove posted receive buffer from request structure */ - req->rc = NULL; + req->rc.sdata = NULL; dont_need_post_recv: /* Post the request */ @@ -491,7 +492,7 @@ dont_need_post_recv: c->req = req; c->busa = ib_dma_map_single(rdma->cm_id->device, - c->req->tc->sdata, c->req->tc->size, + c->req->tc.sdata, c->req->tc.size, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) { err = -EIO; @@ -501,7 +502,7 @@ dont_need_post_recv: c->cqe.done = send_done; sge.addr = c->busa; - sge.length = c->req->tc->size; + sge.length = c->req->tc.size; sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; @@ -544,7 +545,7 @@ dont_need_post_recv: recv_error: kfree(rpl_context); spin_lock_irqsave(&rdma->req_lock, flags); - if (rdma->state < P9_RDMA_CLOSING) { + if (err != -EINTR && rdma->state < P9_RDMA_CLOSING) { rdma->state = P9_RDMA_CLOSING; spin_unlock_irqrestore(&rdma->req_lock, flags); rdma_disconnect(rdma->cm_id); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 7728b0acde09..b1d39cabf125 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -155,7 +155,7 @@ static void req_done(struct virtqueue *vq) } if (len) { - req->rc->size = len; + req->rc.size = len; p9_client_cb(chan->client, req, REQ_STATUS_RCVD); } } @@ -207,6 +207,13 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) return 1; } +/* Reply won't come, so drop req ref */ +static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req) +{ + p9_req_put(req); + return 0; +} + /** * pack_sg_list_p - Just like pack_sg_list. Instead of taking a buffer, * this takes a list of pages. @@ -273,12 +280,12 @@ req_retry: out_sgs = in_sgs = 0; /* Handle out VirtIO ring buffers */ out = pack_sg_list(chan->sg, 0, - VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); + VIRTQUEUE_NUM, req->tc.sdata, req->tc.size); if (out) sgs[out_sgs++] = chan->sg; in = pack_sg_list(chan->sg, out, - VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity); + VIRTQUEUE_NUM, req->rc.sdata, req->rc.capacity); if (in) sgs[out_sgs + in_sgs++] = chan->sg + out; @@ -322,7 +329,7 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, if (!iov_iter_count(data)) return 0; - if (!(data->type & ITER_KVEC)) { + if (!iov_iter_is_kvec(data)) { int n; /* * We allow only p9_max_pages pinned. We wait for the @@ -404,6 +411,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, struct scatterlist *sgs[4]; size_t offs; int need_drop = 0; + int kicked = 0; p9_debug(P9_DEBUG_TRANS, "virtio request\n"); @@ -411,29 +419,33 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, __le32 sz; int n = p9_get_mapped_pages(chan, &out_pages, uodata, outlen, &offs, &need_drop); - if (n < 0) - return n; + if (n < 0) { + err = n; + goto err_out; + } out_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE); if (n != outlen) { __le32 v = cpu_to_le32(n); - memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4); + memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4); outlen = n; } /* The size field of the message must include the length of the * header and the length of the data. We didn't actually know * the length of the data until this point so add it in now. */ - sz = cpu_to_le32(req->tc->size + outlen); - memcpy(&req->tc->sdata[0], &sz, sizeof(sz)); + sz = cpu_to_le32(req->tc.size + outlen); + memcpy(&req->tc.sdata[0], &sz, sizeof(sz)); } else if (uidata) { int n = p9_get_mapped_pages(chan, &in_pages, uidata, inlen, &offs, &need_drop); - if (n < 0) - return n; + if (n < 0) { + err = n; + goto err_out; + } in_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE); if (n != inlen) { __le32 v = cpu_to_le32(n); - memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4); + memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4); inlen = n; } } @@ -445,7 +457,7 @@ req_retry_pinned: /* out data */ out = pack_sg_list(chan->sg, 0, - VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); + VIRTQUEUE_NUM, req->tc.sdata, req->tc.size); if (out) sgs[out_sgs++] = chan->sg; @@ -464,7 +476,7 @@ req_retry_pinned: * alloced memory and payload onto the user buffer. */ in = pack_sg_list(chan->sg, out, - VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len); + VIRTQUEUE_NUM, req->rc.sdata, in_hdr_len); if (in) sgs[out_sgs + in_sgs++] = chan->sg + out; @@ -498,6 +510,7 @@ req_retry_pinned: } virtqueue_kick(chan->vq); spin_unlock_irqrestore(&chan->lock, flags); + kicked = 1; p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD); /* @@ -518,6 +531,10 @@ err_out: } kvfree(in_pages); kvfree(out_pages); + if (!kicked) { + /* reply won't come */ + p9_req_put(req); + } return err; } @@ -750,6 +767,7 @@ static struct p9_trans_module p9_virtio_trans = { .request = p9_virtio_request, .zc_request = p9_virtio_zc_request, .cancel = p9_virtio_cancel, + .cancelled = p9_virtio_cancelled, /* * We leave one entry for input and one entry for response * headers. We also skip one more entry to accomodate, address diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index c2d54ac76bfd..e2fbf3677b9b 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -141,7 +141,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) struct xen_9pfs_front_priv *priv = NULL; RING_IDX cons, prod, masked_cons, masked_prod; unsigned long flags; - u32 size = p9_req->tc->size; + u32 size = p9_req->tc.size; struct xen_9pfs_dataring *ring; int num; @@ -154,7 +154,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) if (!priv || priv->client != client) return -EINVAL; - num = p9_req->tc->tag % priv->num_rings; + num = p9_req->tc.tag % priv->num_rings; ring = &priv->rings[num]; again: @@ -176,7 +176,7 @@ again: masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE); masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); - xen_9pfs_write_packet(ring->data.out, p9_req->tc->sdata, size, + xen_9pfs_write_packet(ring->data.out, p9_req->tc.sdata, size, &masked_prod, masked_cons, XEN_9PFS_RING_SIZE); p9_req->status = REQ_STATUS_SENT; @@ -185,6 +185,7 @@ again: ring->intf->out_prod = prod; spin_unlock_irqrestore(&ring->lock, flags); notify_remote_via_irq(ring->irq); + p9_req_put(p9_req); return 0; } @@ -229,12 +230,12 @@ static void p9_xen_response(struct work_struct *work) continue; } - memcpy(req->rc, &h, sizeof(h)); - req->rc->offset = 0; + memcpy(&req->rc, &h, sizeof(h)); + req->rc.offset = 0; masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); /* Then, read the whole packet (including the header) */ - xen_9pfs_read_packet(req->rc->sdata, ring->data.in, h.size, + xen_9pfs_read_packet(req->rc.sdata, ring->data.in, h.size, masked_prod, &masked_cons, XEN_9PFS_RING_SIZE); @@ -391,8 +392,8 @@ static int xen_9pfs_front_probe(struct xenbus_device *dev, unsigned int max_rings, max_ring_order, len = 0; versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len); - if (!len) - return -EINVAL; + if (IS_ERR(versions)) + return PTR_ERR(versions); if (strcmp(versions, "1")) { kfree(versions); return -EINVAL; diff --git a/net/9p/util.c b/net/9p/util.c deleted file mode 100644 index 55ad98277e85..000000000000 --- a/net/9p/util.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * net/9p/util.c - * - * This file contains some helper functions - * - * Copyright (C) 2007 by Latchesar Ionkov - * Copyright (C) 2004 by Eric Van Hensbergen - * Copyright (C) 2002 by Ron Minnich - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/** - * struct p9_idpool - per-connection accounting for tag idpool - * @lock: protects the pool - * @pool: idr to allocate tag id from - * - */ - -struct p9_idpool { - spinlock_t lock; - struct idr pool; -}; - -/** - * p9_idpool_create - create a new per-connection id pool - * - */ - -struct p9_idpool *p9_idpool_create(void) -{ - struct p9_idpool *p; - - p = kmalloc(sizeof(struct p9_idpool), GFP_KERNEL); - if (!p) - return ERR_PTR(-ENOMEM); - - spin_lock_init(&p->lock); - idr_init(&p->pool); - - return p; -} -EXPORT_SYMBOL(p9_idpool_create); - -/** - * p9_idpool_destroy - create a new per-connection id pool - * @p: idpool to destroy - */ - -void p9_idpool_destroy(struct p9_idpool *p) -{ - idr_destroy(&p->pool); - kfree(p); -} -EXPORT_SYMBOL(p9_idpool_destroy); - -/** - * p9_idpool_get - allocate numeric id from pool - * @p: pool to allocate from - * - * Bugs: This seems to be an awful generic function, should it be in idr.c with - * the lock included in struct idr? - */ - -int p9_idpool_get(struct p9_idpool *p) -{ - int i; - unsigned long flags; - - idr_preload(GFP_NOFS); - spin_lock_irqsave(&p->lock, flags); - - /* no need to store exactly p, we just need something non-null */ - i = idr_alloc(&p->pool, p, 0, 0, GFP_NOWAIT); - - spin_unlock_irqrestore(&p->lock, flags); - idr_preload_end(); - if (i < 0) - return -1; - - p9_debug(P9_DEBUG_MUX, " id %d pool %p\n", i, p); - return i; -} -EXPORT_SYMBOL(p9_idpool_get); - -/** - * p9_idpool_put - release numeric id from pool - * @id: numeric id which is being released - * @p: pool to release id into - * - * Bugs: This seems to be an awful generic function, should it be in idr.c with - * the lock included in struct idr? - */ - -void p9_idpool_put(int id, struct p9_idpool *p) -{ - unsigned long flags; - - p9_debug(P9_DEBUG_MUX, " id %d pool %p\n", id, p); - - spin_lock_irqsave(&p->lock, flags); - idr_remove(&p->pool, id); - spin_unlock_irqrestore(&p->lock, flags); -} -EXPORT_SYMBOL(p9_idpool_put); - -/** - * p9_idpool_check - check if the specified id is available - * @id: id to check - * @p: pool to check - */ - -int p9_idpool_check(int id, struct p9_idpool *p) -{ - return idr_find(&p->pool, id) != NULL; -} -EXPORT_SYMBOL(p9_idpool_check); diff --git a/net/Kconfig b/net/Kconfig index 228dfa382eec..f235edb593ba 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -300,8 +300,11 @@ config BPF_JIT config BPF_STREAM_PARSER bool "enable BPF STREAM_PARSER" + depends on INET depends on BPF_SYSCALL + depends on CGROUP_BPF select STREAM_PARSER + select NET_SOCK_MSG ---help--- Enabling this allows a stream parser to be used with BPF_MAP_TYPE_SOCKMAP. @@ -413,6 +416,14 @@ config GRO_CELLS config SOCK_VALIDATE_XMIT bool +config NET_SOCK_MSG + bool + default n + help + The NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or + ULPs (upper layer modules, e.g. TLS) to process L7 application data + with the help of BPF programs. + config NET_DEVLINK tristate "Network physical/parent device Netlink interface" help diff --git a/net/atm/common.c b/net/atm/common.c index 9f8cb0d2e71e..a38c174fc766 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -653,7 +653,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) struct atm_vcc *vcc; __poll_t mask; - sock_poll_wait(file, wait); + sock_poll_wait(file, sock, wait); mask = 0; vcc = ATM_SD(sock); diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index 361116f77cb9..f75816f58107 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -106,3 +106,14 @@ config BATMAN_ADV_DEBUG say N here. This enables compilation of support for outputting debugging information to the kernel log. The output is controlled via the module parameter debug. + +config BATMAN_ADV_TRACING + bool "B.A.T.M.A.N. tracing support" + depends on BATMAN_ADV + depends on EVENT_TRACING + help + This is an option for use by developers; most people should + say N here. Select this option to gather traces like the debug + messages using the generic tracing infrastructure of the kernel. + BATMAN_ADV_DEBUG must also be selected to get trace events for + batadv_dbg. diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index b97ba6fb8353..9b58160fe485 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -42,6 +42,9 @@ batman-adv-y += routing.o batman-adv-y += send.o batman-adv-y += soft-interface.o batman-adv-y += sysfs.o +batman-adv-$(CONFIG_BATMAN_ADV_TRACING) += trace.o batman-adv-y += tp_meter.o batman-adv-y += translation-table.o batman-adv-y += tvlv.o + +CFLAGS_trace.o := -I$(src) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 73bf6a93a3cf..d2227091029f 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -137,169 +137,6 @@ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) return (u8)(sum / count); } -/** - * batadv_iv_ogm_orig_free() - free the private resources allocated for this - * orig_node - * @orig_node: the orig_node for which the resources have to be free'd - */ -static void batadv_iv_ogm_orig_free(struct batadv_orig_node *orig_node) -{ - kfree(orig_node->bat_iv.bcast_own); - kfree(orig_node->bat_iv.bcast_own_sum); -} - -/** - * batadv_iv_ogm_orig_add_if() - change the private structures of the orig_node - * to include the new hard-interface - * @orig_node: the orig_node that has to be changed - * @max_if_num: the current amount of interfaces - * - * Return: 0 on success, a negative error code otherwise. - */ -static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node, - unsigned int max_if_num) -{ - void *data_ptr; - size_t old_size; - int ret = -ENOMEM; - - spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); - - old_size = (max_if_num - 1) * sizeof(unsigned long) * BATADV_NUM_WORDS; - data_ptr = kmalloc_array(max_if_num, - BATADV_NUM_WORDS * sizeof(unsigned long), - GFP_ATOMIC); - if (!data_ptr) - goto unlock; - - memcpy(data_ptr, orig_node->bat_iv.bcast_own, old_size); - kfree(orig_node->bat_iv.bcast_own); - orig_node->bat_iv.bcast_own = data_ptr; - - data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC); - if (!data_ptr) - goto unlock; - - memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum, - (max_if_num - 1) * sizeof(u8)); - kfree(orig_node->bat_iv.bcast_own_sum); - orig_node->bat_iv.bcast_own_sum = data_ptr; - - ret = 0; - -unlock: - spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); - - return ret; -} - -/** - * batadv_iv_ogm_drop_bcast_own_entry() - drop section of bcast_own - * @orig_node: the orig_node that has to be changed - * @max_if_num: the current amount of interfaces - * @del_if_num: the index of the interface being removed - */ -static void -batadv_iv_ogm_drop_bcast_own_entry(struct batadv_orig_node *orig_node, - unsigned int max_if_num, - unsigned int del_if_num) -{ - size_t chunk_size; - size_t if_offset; - void *data_ptr; - - lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock); - - chunk_size = sizeof(unsigned long) * BATADV_NUM_WORDS; - data_ptr = kmalloc_array(max_if_num, chunk_size, GFP_ATOMIC); - if (!data_ptr) - /* use old buffer when new one could not be allocated */ - data_ptr = orig_node->bat_iv.bcast_own; - - /* copy first part */ - memmove(data_ptr, orig_node->bat_iv.bcast_own, del_if_num * chunk_size); - - /* copy second part */ - if_offset = (del_if_num + 1) * chunk_size; - memmove((char *)data_ptr + del_if_num * chunk_size, - (uint8_t *)orig_node->bat_iv.bcast_own + if_offset, - (max_if_num - del_if_num) * chunk_size); - - /* bcast_own was shrunk down in new buffer; free old one */ - if (orig_node->bat_iv.bcast_own != data_ptr) { - kfree(orig_node->bat_iv.bcast_own); - orig_node->bat_iv.bcast_own = data_ptr; - } -} - -/** - * batadv_iv_ogm_drop_bcast_own_sum_entry() - drop section of bcast_own_sum - * @orig_node: the orig_node that has to be changed - * @max_if_num: the current amount of interfaces - * @del_if_num: the index of the interface being removed - */ -static void -batadv_iv_ogm_drop_bcast_own_sum_entry(struct batadv_orig_node *orig_node, - unsigned int max_if_num, - unsigned int del_if_num) -{ - size_t if_offset; - void *data_ptr; - - lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock); - - data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC); - if (!data_ptr) - /* use old buffer when new one could not be allocated */ - data_ptr = orig_node->bat_iv.bcast_own_sum; - - memmove(data_ptr, orig_node->bat_iv.bcast_own_sum, - del_if_num * sizeof(u8)); - - if_offset = (del_if_num + 1) * sizeof(u8); - memmove((char *)data_ptr + del_if_num * sizeof(u8), - orig_node->bat_iv.bcast_own_sum + if_offset, - (max_if_num - del_if_num) * sizeof(u8)); - - /* bcast_own_sum was shrunk down in new buffer; free old one */ - if (orig_node->bat_iv.bcast_own_sum != data_ptr) { - kfree(orig_node->bat_iv.bcast_own_sum); - orig_node->bat_iv.bcast_own_sum = data_ptr; - } -} - -/** - * batadv_iv_ogm_orig_del_if() - change the private structures of the orig_node - * to exclude the removed interface - * @orig_node: the orig_node that has to be changed - * @max_if_num: the current amount of interfaces - * @del_if_num: the index of the interface being removed - * - * Return: 0 on success, a negative error code otherwise. - */ -static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, - unsigned int max_if_num, - unsigned int del_if_num) -{ - spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); - - if (max_if_num == 0) { - kfree(orig_node->bat_iv.bcast_own); - kfree(orig_node->bat_iv.bcast_own_sum); - orig_node->bat_iv.bcast_own = NULL; - orig_node->bat_iv.bcast_own_sum = NULL; - } else { - batadv_iv_ogm_drop_bcast_own_entry(orig_node, max_if_num, - del_if_num); - batadv_iv_ogm_drop_bcast_own_sum_entry(orig_node, max_if_num, - del_if_num); - } - - spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); - - return 0; -} - /** * batadv_iv_ogm_orig_get() - retrieve or create (if does not exist) an * originator @@ -315,7 +152,6 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; int hash_added; - size_t size; orig_node = batadv_orig_hash_find(bat_priv, addr); if (orig_node) @@ -327,16 +163,6 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) spin_lock_init(&orig_node->bat_iv.ogm_cnt_lock); - size = bat_priv->num_ifaces * sizeof(unsigned long) * BATADV_NUM_WORDS; - orig_node->bat_iv.bcast_own = kzalloc(size, GFP_ATOMIC); - if (!orig_node->bat_iv.bcast_own) - goto free_orig_node; - - size = bat_priv->num_ifaces * sizeof(u8); - orig_node->bat_iv.bcast_own_sum = kzalloc(size, GFP_ATOMIC); - if (!orig_node->bat_iv.bcast_own_sum) - goto free_orig_node; - kref_get(&orig_node->refcount); hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, @@ -347,8 +173,9 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) return orig_node; free_orig_node_hash: + /* reference for batadv_hash_add */ batadv_orig_node_put(orig_node); -free_orig_node: + /* reference from batadv_orig_node_new */ batadv_orig_node_put(orig_node); return NULL; @@ -893,26 +720,30 @@ batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node; + struct batadv_orig_ifinfo *orig_ifinfo; unsigned long *word; u32 i; - size_t word_index; u8 *w; - unsigned int if_num; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { - spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); - word_index = hard_iface->if_num * BATADV_NUM_WORDS; - word = &orig_node->bat_iv.bcast_own[word_index]; - - batadv_bit_get_packet(bat_priv, word, 1, 0); - if_num = hard_iface->if_num; - w = &orig_node->bat_iv.bcast_own_sum[if_num]; - *w = bitmap_weight(word, BATADV_TQ_LOCAL_WINDOW_SIZE); - spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); + hlist_for_each_entry_rcu(orig_ifinfo, + &orig_node->ifinfo_list, + list) { + if (orig_ifinfo->if_outgoing != hard_iface) + continue; + + spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); + word = orig_ifinfo->bat_iv.bcast_own; + batadv_bit_get_packet(bat_priv, word, 1, 0); + w = &orig_ifinfo->bat_iv.bcast_own_sum; + *w = bitmap_weight(word, + BATADV_TQ_LOCAL_WINDOW_SIZE); + spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); + } } rcu_read_unlock(); } @@ -999,6 +830,35 @@ out: batadv_hardif_put(primary_if); } +/** + * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over iterface + * @orig_node: originator which reproadcasted the OGMs directly + * @if_outgoing: interface which transmitted the original OGM and received the + * direct rebroadcast + * + * Return: Number of replied (rebroadcasted) OGMs which were transmitted by + * an originator and directly (without intermediate hop) received by a specific + * interface + */ +static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_orig_ifinfo *orig_ifinfo; + u8 sum; + + orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_outgoing); + if (!orig_ifinfo) + return 0; + + spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); + sum = orig_ifinfo->bat_iv.bcast_own_sum; + spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); + + batadv_orig_ifinfo_put(orig_ifinfo); + + return sum; +} + /** * batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an * originator @@ -1026,8 +886,6 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, struct batadv_neigh_node *neigh_node = NULL; struct batadv_neigh_node *tmp_neigh_node = NULL; struct batadv_neigh_node *router = NULL; - struct batadv_orig_node *orig_node_tmp; - unsigned int if_num; u8 sum_orig, sum_neigh; u8 *neigh_addr; u8 tq_avg; @@ -1132,18 +990,10 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, */ if (router_ifinfo && neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) { - orig_node_tmp = router->orig_node; - spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock); - if_num = router->if_incoming->if_num; - sum_orig = orig_node_tmp->bat_iv.bcast_own_sum[if_num]; - spin_unlock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock); - - orig_node_tmp = neigh_node->orig_node; - spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock); - if_num = neigh_node->if_incoming->if_num; - sum_neigh = orig_node_tmp->bat_iv.bcast_own_sum[if_num]; - spin_unlock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock); - + sum_orig = batadv_iv_orig_ifinfo_sum(router->orig_node, + router->if_incoming); + sum_neigh = batadv_iv_orig_ifinfo_sum(neigh_node->orig_node, + neigh_node->if_incoming); if (sum_orig >= sum_neigh) goto out; } @@ -1186,7 +1036,6 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, u8 total_count; u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; - unsigned int if_num; unsigned int tq_asym_penalty, inv_asym_penalty; unsigned int combined_tq; unsigned int tq_iface_penalty; @@ -1227,9 +1076,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, orig_node->last_seen = jiffies; /* find packet count of corresponding one hop neighbor */ - spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); - if_num = if_incoming->if_num; - orig_eq_count = orig_neigh_node->bat_iv.bcast_own_sum[if_num]; + orig_eq_count = batadv_iv_orig_ifinfo_sum(orig_neigh_node, if_incoming); neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (neigh_ifinfo) { neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count; @@ -1237,7 +1084,6 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, } else { neigh_rq_count = 0; } - spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); /* pay attention to not get a value bigger than 100 % */ if (orig_eq_count > neigh_rq_count) @@ -1621,6 +1467,49 @@ out: consume_skb(skb_priv); } +/** + * batadv_iv_ogm_process_reply() - Check OGM for direct reply and process it + * @ogm_packet: rebroadcast OGM packet to process + * @if_incoming: the interface where this packet was received + * @orig_node: originator which reproadcasted the OGMs + * @if_incoming_seqno: OGM sequence number when rebroadcast was received + */ +static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet, + struct batadv_hard_iface *if_incoming, + struct batadv_orig_node *orig_node, + u32 if_incoming_seqno) +{ + struct batadv_orig_ifinfo *orig_ifinfo; + s32 bit_pos; + u8 *weight; + + /* neighbor has to indicate direct link and it has to + * come via the corresponding interface + */ + if (!(ogm_packet->flags & BATADV_DIRECTLINK)) + return; + + if (!batadv_compare_eth(if_incoming->net_dev->dev_addr, + ogm_packet->orig)) + return; + + orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_incoming); + if (!orig_ifinfo) + return; + + /* save packet seqno for bidirectional check */ + spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); + bit_pos = if_incoming_seqno - 2; + bit_pos -= ntohl(ogm_packet->seqno); + batadv_set_bit(orig_ifinfo->bat_iv.bcast_own, bit_pos); + weight = &orig_ifinfo->bat_iv.bcast_own_sum; + *weight = bitmap_weight(orig_ifinfo->bat_iv.bcast_own, + BATADV_TQ_LOCAL_WINDOW_SIZE); + spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); + + batadv_orig_ifinfo_put(orig_ifinfo); +} + /** * batadv_iv_ogm_process() - process an incoming batman iv OGM * @skb: the skb containing the OGM @@ -1705,37 +1594,13 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, } if (is_my_orig) { - unsigned long *word; - size_t offset; - s32 bit_pos; - unsigned int if_num; - u8 *weight; - orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) return; - /* neighbor has to indicate direct link and it has to - * come via the corresponding interface - * save packet seqno for bidirectional check - */ - if (has_directlink_flag && - batadv_compare_eth(if_incoming->net_dev->dev_addr, - ogm_packet->orig)) { - if_num = if_incoming->if_num; - offset = if_num * BATADV_NUM_WORDS; - - spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); - word = &orig_neigh_node->bat_iv.bcast_own[offset]; - bit_pos = if_incoming_seqno - 2; - bit_pos -= ntohl(ogm_packet->seqno); - batadv_set_bit(word, bit_pos); - weight = &orig_neigh_node->bat_iv.bcast_own_sum[if_num]; - *weight = bitmap_weight(word, - BATADV_TQ_LOCAL_WINDOW_SIZE); - spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); - } + batadv_iv_ogm_process_reply(ogm_packet, if_incoming, + orig_neigh_node, if_incoming_seqno); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet from myself (via neighbor)\n"); @@ -2844,9 +2709,6 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .print = batadv_iv_ogm_orig_print, #endif .dump = batadv_iv_ogm_orig_dump, - .free = batadv_iv_ogm_orig_free, - .add_if = batadv_iv_ogm_orig_add_if, - .del_if = batadv_iv_ogm_orig_del_if, }, .gw = { .init_sel_class = batadv_iv_init_sel_class, diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 3cb82378300b..8b608a2e2653 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -47,8 +47,24 @@ static struct dentry *batadv_debugfs; +/** + * batadv_debugfs_deprecated() - Log use of deprecated batadv debugfs access + * @file: file which was accessed + * @alt: explanation what can be used as alternative + */ +void batadv_debugfs_deprecated(struct file *file, const char *alt) +{ + struct dentry *dentry = file_dentry(file); + const char *name = dentry->d_name.name; + + pr_warn_ratelimited(DEPRECATED "%s (pid %d) Use of debugfs file \"%s\".\n%s", + current->comm, task_pid_nr(current), name, alt); +} + static int batadv_algorithms_open(struct inode *inode, struct file *file) { + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_ROUTING_ALGOS instead\n"); return single_open(file, batadv_algo_seq_print_text, NULL); } @@ -56,6 +72,8 @@ static int neighbors_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_NEIGHBORS instead\n"); return single_open(file, batadv_hardif_neigh_seq_print_text, net_dev); } @@ -63,6 +81,8 @@ static int batadv_originators_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_ORIGINATORS instead\n"); return single_open(file, batadv_orig_seq_print_text, net_dev); } @@ -79,6 +99,8 @@ static int batadv_originators_hardif_open(struct inode *inode, { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_HARDIFS instead\n"); return single_open(file, batadv_orig_hardif_seq_print_text, net_dev); } @@ -86,6 +108,8 @@ static int batadv_gateways_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_GATEWAYS instead\n"); return single_open(file, batadv_gw_client_seq_print_text, net_dev); } @@ -93,6 +117,8 @@ static int batadv_transtable_global_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_TRANSTABLE_GLOBAL instead\n"); return single_open(file, batadv_tt_global_seq_print_text, net_dev); } @@ -101,6 +127,8 @@ static int batadv_bla_claim_table_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_BLA_CLAIM instead\n"); return single_open(file, batadv_bla_claim_table_seq_print_text, net_dev); } @@ -110,6 +138,8 @@ static int batadv_bla_backbone_table_open(struct inode *inode, { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_BLA_BACKBONE instead\n"); return single_open(file, batadv_bla_backbone_table_seq_print_text, net_dev); } @@ -128,6 +158,8 @@ static int batadv_dat_cache_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_DAT_CACHE instead\n"); return single_open(file, batadv_dat_cache_seq_print_text, net_dev); } #endif @@ -136,6 +168,8 @@ static int batadv_transtable_local_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_TRANSTABLE_LOCAL instead\n"); return single_open(file, batadv_tt_local_seq_print_text, net_dev); } @@ -149,6 +183,7 @@ static int batadv_nc_nodes_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, ""); return single_open(file, batadv_nc_nodes_seq_print_text, net_dev); } #endif @@ -165,6 +200,8 @@ static int batadv_mcast_flags_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; + batadv_debugfs_deprecated(file, + "Use genl command BATADV_CMD_GET_MCAST_FLAGS instead\n"); return single_open(file, batadv_mcast_flags_seq_print_text, net_dev); } #endif diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 08a592ffbee5..8de018e5c577 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -21,12 +21,14 @@ #include "main.h" +struct file; struct net_device; #define BATADV_DEBUGFS_SUBDIR "batman_adv" #if IS_ENABLED(CONFIG_BATMAN_ADV_DEBUGFS) +void batadv_debugfs_deprecated(struct file *file, const char *alt); void batadv_debugfs_init(void); void batadv_debugfs_destroy(void); int batadv_debugfs_add_meshif(struct net_device *dev); @@ -38,6 +40,10 @@ void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface); #else +static inline void batadv_debugfs_deprecated(struct file *file, const char *alt) +{ +} + static inline void batadv_debugfs_init(void) { } diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 2f0d42f2f913..781c5b6e6e8e 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -763,11 +763,6 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, hard_iface->soft_iface = soft_iface; bat_priv = netdev_priv(hard_iface->soft_iface); - if (bat_priv->num_ifaces >= UINT_MAX) { - ret = -ENOSPC; - goto err_dev; - } - ret = netdev_master_upper_dev_link(hard_iface->net_dev, soft_iface, NULL, NULL, NULL); if (ret) @@ -777,16 +772,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, if (ret < 0) goto err_upper; - hard_iface->if_num = bat_priv->num_ifaces; - bat_priv->num_ifaces++; hard_iface->if_status = BATADV_IF_INACTIVE; - ret = batadv_orig_hash_add_if(hard_iface, bat_priv->num_ifaces); - if (ret < 0) { - bat_priv->algo_ops->iface.disable(hard_iface); - bat_priv->num_ifaces--; - hard_iface->if_status = BATADV_IF_NOT_IN_USE; - goto err_upper; - } kref_get(&hard_iface->refcount); hard_iface->batman_adv_ptype.type = ethertype; @@ -833,6 +819,33 @@ err: return ret; } +/** + * batadv_hardif_cnt() - get number of interfaces enslaved to soft interface + * @soft_iface: soft interface to check + * + * This function is only using RCU for locking - the result can therefore be + * off when another functions is modifying the list at the same time. The + * caller can use the rtnl_lock to make sure that the count is accurate. + * + * Return: number of connected/enslaved hard interfaces + */ +static size_t batadv_hardif_cnt(const struct net_device *soft_iface) +{ + struct batadv_hard_iface *hard_iface; + size_t count = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != soft_iface) + continue; + + count++; + } + rcu_read_unlock(); + + return count; +} + /** * batadv_hardif_disable_interface() - Remove hard interface from soft interface * @hard_iface: hard interface to be removed @@ -855,9 +868,6 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, dev_remove_pack(&hard_iface->batman_adv_ptype); batadv_hardif_put(hard_iface); - bat_priv->num_ifaces--; - batadv_orig_hash_del_if(hard_iface, bat_priv->num_ifaces); - primary_if = batadv_primary_if_get_selected(bat_priv); if (hard_iface == primary_if) { struct batadv_hard_iface *new_if; @@ -881,7 +891,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface); /* nobody uses this interface anymore */ - if (bat_priv->num_ifaces == 0) { + if (batadv_hardif_cnt(hard_iface->soft_iface) <= 1) { batadv_gw_check_client_stop(bat_priv); if (autodel == BATADV_IF_CLEANUP_AUTO) @@ -917,7 +927,6 @@ batadv_hardif_add_interface(struct net_device *net_dev) if (ret) goto free_if; - hard_iface->if_num = 0; hard_iface->net_dev = net_dev; hard_iface->soft_iface = NULL; hard_iface->if_status = BATADV_IF_NOT_IN_USE; diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 55c358ad3331..d70f363c52ae 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -47,6 +47,7 @@ #include #include +#include "debugfs.h" #include "hard-interface.h" #include "log.h" #include "originator.h" @@ -74,6 +75,8 @@ static int batadv_socket_open(struct inode *inode, struct file *file) if (!try_module_get(THIS_MODULE)) return -EBUSY; + batadv_debugfs_deprecated(file, ""); + nonseekable_open(inode, file); socket_client = kmalloc(sizeof(*socket_client), GFP_KERNEL); diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index 853773e45f79..6beb5f067810 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -40,6 +40,9 @@ #include #include +#include "debugfs.h" +#include "trace.h" + #define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1) static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN; @@ -98,13 +101,19 @@ static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log, */ int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) { + struct va_format vaf; va_list args; - char tmp_log_buf[256]; va_start(args, fmt); - vscnprintf(tmp_log_buf, sizeof(tmp_log_buf), fmt, args); - batadv_fdebug_log(bat_priv->debug_log, "[%10u] %s", - jiffies_to_msecs(jiffies), tmp_log_buf); + + vaf.fmt = fmt; + vaf.va = &args; + + batadv_fdebug_log(bat_priv->debug_log, "[%10u] %pV", + jiffies_to_msecs(jiffies), &vaf); + + trace_batadv_dbg(bat_priv, &vaf); + va_end(args); return 0; @@ -115,6 +124,9 @@ static int batadv_log_open(struct inode *inode, struct file *file) if (!try_module_get(THIS_MODULE)) return -EBUSY; + batadv_debugfs_deprecated(file, + "Use tracepoint batadv:batadv_dbg instead\n"); + nonseekable_open(inode, file); file->private_data = inode->i_private; return 0; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 3ccc75ee719c..2002b70e18db 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -25,7 +25,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2018.3" +#define BATADV_SOURCE_VERSION "2018.4" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 1d295da3e342..56a981af5c92 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -904,9 +904,6 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) batadv_frag_purge_orig(orig_node, NULL); - if (orig_node->bat_priv->algo_ops->orig.free) - orig_node->bat_priv->algo_ops->orig.free(orig_node); - kfree(orig_node->tt_buff); kfree(orig_node); } @@ -1555,107 +1552,3 @@ int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb) return ret; } - -/** - * batadv_orig_hash_add_if() - Add interface to originators in orig_hash - * @hard_iface: hard interface to add (already slave of the soft interface) - * @max_if_num: new number of interfaces - * - * Return: 0 on success or negative error number in case of failure - */ -int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, - unsigned int max_if_num) -{ - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - struct batadv_algo_ops *bao = bat_priv->algo_ops; - struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_head *head; - struct batadv_orig_node *orig_node; - u32 i; - int ret; - - /* resize all orig nodes because orig_node->bcast_own(_sum) depend on - * if_num - */ - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) { - ret = 0; - if (bao->orig.add_if) - ret = bao->orig.add_if(orig_node, max_if_num); - if (ret == -ENOMEM) - goto err; - } - rcu_read_unlock(); - } - - return 0; - -err: - rcu_read_unlock(); - return -ENOMEM; -} - -/** - * batadv_orig_hash_del_if() - Remove interface from originators in orig_hash - * @hard_iface: hard interface to remove (still slave of the soft interface) - * @max_if_num: new number of interfaces - * - * Return: 0 on success or negative error number in case of failure - */ -int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, - unsigned int max_if_num) -{ - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_head *head; - struct batadv_hard_iface *hard_iface_tmp; - struct batadv_orig_node *orig_node; - struct batadv_algo_ops *bao = bat_priv->algo_ops; - u32 i; - int ret; - - /* resize all orig nodes because orig_node->bcast_own(_sum) depend on - * if_num - */ - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) { - ret = 0; - if (bao->orig.del_if) - ret = bao->orig.del_if(orig_node, max_if_num, - hard_iface->if_num); - if (ret == -ENOMEM) - goto err; - } - rcu_read_unlock(); - } - - /* renumber remaining batman interfaces _inside_ of orig_hash_lock */ - rcu_read_lock(); - list_for_each_entry_rcu(hard_iface_tmp, &batadv_hardif_list, list) { - if (hard_iface_tmp->if_status == BATADV_IF_NOT_IN_USE) - continue; - - if (hard_iface == hard_iface_tmp) - continue; - - if (hard_iface->soft_iface != hard_iface_tmp->soft_iface) - continue; - - if (hard_iface_tmp->if_num > hard_iface->if_num) - hard_iface_tmp->if_num--; - } - rcu_read_unlock(); - - hard_iface->if_num = -1; - return 0; - -err: - rcu_read_unlock(); - return -ENOMEM; -} diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 3b3f59b881e1..a8b4c7b667ec 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -72,10 +72,6 @@ void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo); int batadv_orig_seq_print_text(struct seq_file *seq, void *offset); int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset); -int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, - unsigned int max_if_num); -int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, - unsigned int max_if_num); struct batadv_orig_node_vlan * batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, unsigned short vid); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 626ddca332db..5db5a0a4c959 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -844,7 +844,6 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->frag_seqno, random_seqno); bat_priv->primary_if = NULL; - bat_priv->num_ifaces = 0; batadv_nc_init_bat_priv(bat_priv); @@ -1062,6 +1061,7 @@ static void batadv_softif_init_early(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = batadv_softif_free; dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_LLTX; dev->priv_flags |= IFF_NO_QUEUE; /* can't call min_mtu, because the needed variables diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c new file mode 100644 index 000000000000..3d57f9981f25 --- /dev/null +++ b/net/batman-adv/trace.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: + * + * Sven Eckelmann + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h new file mode 100644 index 000000000000..3acda26a30ca --- /dev/null +++ b/net/batman-adv/trace.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: + * + * Sven Eckelmann + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#if !defined(_NET_BATMAN_ADV_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) +#define _NET_BATMAN_ADV_TRACE_H_ + +#include "main.h" + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM batadv + +/* provide dummy function when tracing is disabled */ +#if !defined(CONFIG_BATMAN_ADV_TRACING) + +#undef TRACE_EVENT +#define TRACE_EVENT(name, proto, ...) \ + static inline void trace_ ## name(proto) {} + +#endif /* CONFIG_BATMAN_ADV_TRACING */ + +#define BATADV_MAX_MSG_LEN 256 + +TRACE_EVENT(batadv_dbg, + + TP_PROTO(struct batadv_priv *bat_priv, + struct va_format *vaf), + + TP_ARGS(bat_priv, vaf), + + TP_STRUCT__entry( + __string(device, bat_priv->soft_iface->name) + __string(driver, KBUILD_MODNAME) + __dynamic_array(char, msg, BATADV_MAX_MSG_LEN) + ), + + TP_fast_assign( + __assign_str(device, bat_priv->soft_iface->name); + __assign_str(driver, KBUILD_MODNAME); + WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg), + BATADV_MAX_MSG_LEN, + vaf->fmt, + *vaf->va) >= BATADV_MAX_MSG_LEN); + ), + + TP_printk( + "%s %s %s", + __get_str(driver), + __get_str(device), + __get_str(msg) + ) +); + +#endif /* _NET_BATMAN_ADV_TRACE_H_ || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 343d304851a5..45b5592de816 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -167,9 +167,6 @@ struct batadv_hard_iface { /** @list: list node for batadv_hardif_list */ struct list_head list; - /** @if_num: identificator of the interface */ - unsigned int if_num; - /** @if_status: status of the interface for batman-adv */ char if_status; @@ -232,6 +229,20 @@ struct batadv_hard_iface { spinlock_t neigh_list_lock; }; +/** + * struct batadv_orig_ifinfo - B.A.T.M.A.N. IV private orig_ifinfo members + */ +struct batadv_orig_ifinfo_bat_iv { + /** + * @bcast_own: bitfield which counts the number of our OGMs this + * orig_node rebroadcasted "back" to us (relative to last_real_seqno) + */ + DECLARE_BITMAP(bcast_own, BATADV_TQ_LOCAL_WINDOW_SIZE); + + /** @bcast_own_sum: sum of bcast_own */ + u8 bcast_own_sum; +}; + /** * struct batadv_orig_ifinfo - originator info per outgoing interface */ @@ -257,6 +268,9 @@ struct batadv_orig_ifinfo { /** @batman_seqno_reset: time when the batman seqno window was reset */ unsigned long batman_seqno_reset; + /** @bat_iv: B.A.T.M.A.N. IV private structure */ + struct batadv_orig_ifinfo_bat_iv bat_iv; + /** @refcount: number of contexts the object is used */ struct kref refcount; @@ -339,19 +353,10 @@ struct batadv_orig_node_vlan { */ struct batadv_orig_bat_iv { /** - * @bcast_own: set of bitfields (one per hard-interface) where each one - * counts the number of our OGMs this orig_node rebroadcasted "back" to - * us (relative to last_real_seqno). Every bitfield is - * BATADV_TQ_LOCAL_WINDOW_SIZE bits long. - */ - unsigned long *bcast_own; - - /** @bcast_own_sum: sum of bcast_own */ - u8 *bcast_own_sum; - - /** - * @ogm_cnt_lock: lock protecting bcast_own, bcast_own_sum, - * neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count + * @ogm_cnt_lock: lock protecting &batadv_orig_ifinfo_bat_iv.bcast_own, + * &batadv_orig_ifinfo_bat_iv.bcast_own_sum, + * &batadv_neigh_ifinfo_bat_iv.bat_iv.real_bits and + * &batadv_neigh_ifinfo_bat_iv.real_packet_count */ spinlock_t ogm_cnt_lock; }; @@ -1597,9 +1602,6 @@ struct batadv_priv { /** @batman_queue_left: number of remaining OGM packet slots */ atomic_t batman_queue_left; - /** @num_ifaces: number of interfaces assigned to this mesh interface */ - unsigned int num_ifaces; - /** @mesh_obj: kobject for sysfs mesh subdirectory */ struct kobject *mesh_obj; @@ -2179,28 +2181,6 @@ struct batadv_algo_neigh_ops { * struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific) */ struct batadv_algo_orig_ops { - /** - * @free: free the resources allocated by the routing algorithm for an - * orig_node object (optional) - */ - void (*free)(struct batadv_orig_node *orig_node); - - /** - * @add_if: ask the routing algorithm to apply the needed changes to the - * orig_node due to a new hard-interface being added into the mesh - * (optional) - */ - int (*add_if)(struct batadv_orig_node *orig_node, - unsigned int max_if_num); - - /** - * @del_if: ask the routing algorithm to apply the needed changes to the - * orig_node due to an hard-interface being removed from the mesh - * (optional) - */ - int (*del_if)(struct batadv_orig_node *orig_node, - unsigned int max_if_num, unsigned int del_if_num); - #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** @print: print the originator table (optional) */ void (*print)(struct batadv_priv *priv, struct seq_file *seq, diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 4e2576fc0c59..828e87fe8027 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -467,7 +467,7 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, iv.iov_len = skb->len; memset(&msg, 0, sizeof(msg)); - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iv, 1, skb->len); + iov_iter_kvec(&msg.msg_iter, WRITE, &iv, 1, skb->len); err = l2cap_chan_send(chan, &msg, skb->len); if (err > 0) { diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index 51c2cf2d8923..58fc6333d412 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -63,7 +63,7 @@ static void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *dat memset(&msg, 0, sizeof(msg)); - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iv, 1, total_len); + iov_iter_kvec(&msg.msg_iter, WRITE, &iv, 1, total_len); l2cap_chan_send(chan, &msg, total_len); diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index 7b3965861013..43c284158f63 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -489,9 +489,6 @@ static int bnep_session(void *arg) add_wait_queue(sk_sleep(sk), &wait); while (1) { - /* Ensure session->terminate is updated */ - smp_mb__before_atomic(); - if (atomic_read(&s->terminate)) break; /* RX */ @@ -512,6 +509,10 @@ static int bnep_session(void *arg) break; netif_wake_queue(dev); + /* + * wait_woken() performs the necessary memory barriers + * for us; see the header comment for this primitive. + */ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(sk_sleep(sk), &wait); diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index 00deacdcb51c..cfd83c5521ae 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -49,18 +49,17 @@ static int bnep_sock_release(struct socket *sock) return 0; } -static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +static int do_bnep_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp) { struct bnep_connlist_req cl; struct bnep_connadd_req ca; struct bnep_conndel_req cd; struct bnep_conninfo ci; struct socket *nsock; - void __user *argp = (void __user *)arg; __u32 supp_feat = BIT(BNEP_SETUP_RESPONSE); int err; - BT_DBG("cmd %x arg %lx", cmd, arg); + BT_DBG("cmd %x arg %p", cmd, argp); switch (cmd) { case BNEPCONNADD: @@ -134,16 +133,22 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return 0; } +static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return do_bnep_sock_ioctl(sock, cmd, (void __user *)arg); +} + #ifdef CONFIG_COMPAT static int bnep_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + void __user *argp = compat_ptr(arg); if (cmd == BNEPGETCONNLIST) { struct bnep_connlist_req cl; + unsigned __user *p = argp; u32 uci; int err; - if (get_user(cl.cnum, (u32 __user *) arg) || - get_user(uci, (u32 __user *) (arg + 4))) + if (get_user(cl.cnum, p) || get_user(uci, p + 1)) return -EFAULT; cl.ci = compat_ptr(uci); @@ -153,13 +158,13 @@ static int bnep_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne err = bnep_get_connlist(&cl); - if (!err && put_user(cl.cnum, (u32 __user *) arg)) + if (!err && put_user(cl.cnum, p)) err = -EFAULT; return err; } - return bnep_sock_ioctl(sock, cmd, arg); + return do_bnep_sock_ioctl(sock, cmd, argp); } #endif diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 7f26a5a19ff6..07cfa3249f83 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -288,9 +288,6 @@ static int cmtp_session(void *arg) add_wait_queue(sk_sleep(sk), &wait); while (1) { - /* Ensure session->terminate is updated */ - smp_mb__before_atomic(); - if (atomic_read(&session->terminate)) break; if (sk->sk_state != BT_CONNECTED) @@ -306,6 +303,10 @@ static int cmtp_session(void *arg) cmtp_process_transmit(session); + /* + * wait_woken() performs the necessary memory barriers + * for us; see the header comment for this primitive. + */ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(sk_sleep(sk), &wait); @@ -431,9 +432,10 @@ int cmtp_del_connection(struct cmtp_conndel_req *req) /* Stop session thread */ atomic_inc(&session->terminate); - /* Ensure session->terminate is updated */ - smp_mb__after_atomic(); - + /* + * See the comment preceding the call to wait_woken() + * in cmtp_session(). + */ wake_up_interruptible(sk_sleep(session->sock->sk)); } else err = -ENOENT; diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index e08f28fadd65..defdd4871919 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -63,17 +63,16 @@ static int cmtp_sock_release(struct socket *sock) return 0; } -static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +static int do_cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp) { struct cmtp_connadd_req ca; struct cmtp_conndel_req cd; struct cmtp_connlist_req cl; struct cmtp_conninfo ci; struct socket *nsock; - void __user *argp = (void __user *)arg; int err; - BT_DBG("cmd %x arg %lx", cmd, arg); + BT_DBG("cmd %x arg %p", cmd, argp); switch (cmd) { case CMTPCONNADD: @@ -137,16 +136,22 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return -EINVAL; } +static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return do_cmtp_sock_ioctl(sock, cmd, (void __user *)arg); +} + #ifdef CONFIG_COMPAT static int cmtp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + void __user *argp = compat_ptr(arg); if (cmd == CMTPGETCONNLIST) { struct cmtp_connlist_req cl; + u32 __user *p = argp; u32 uci; int err; - if (get_user(cl.cnum, (u32 __user *) arg) || - get_user(uci, (u32 __user *) (arg + 4))) + if (get_user(cl.cnum, p) || get_user(uci, p + 1)) return -EFAULT; cl.ci = compat_ptr(uci); @@ -156,13 +161,13 @@ static int cmtp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne err = cmtp_get_connlist(&cl); - if (!err && put_user(cl.cnum, (u32 __user *) arg)) + if (!err && put_user(cl.cnum, p)) err = -EFAULT; return err; } - return cmtp_sock_ioctl(sock, cmd, arg); + return do_cmtp_sock_ioctl(sock, cmd, argp); } #endif diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 74b29c7d841c..7352fe85674b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2839,6 +2839,20 @@ struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, return NULL; } +struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk( + struct list_head *bdaddr_list, bdaddr_t *bdaddr, + u8 type) +{ + struct bdaddr_list_with_irk *b; + + list_for_each_entry(b, bdaddr_list, list) { + if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) + return b; + } + + return NULL; +} + void hci_bdaddr_list_clear(struct list_head *bdaddr_list) { struct bdaddr_list *b, *n; @@ -2871,6 +2885,35 @@ int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type) return 0; } +int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr, + u8 type, u8 *peer_irk, u8 *local_irk) +{ + struct bdaddr_list_with_irk *entry; + + if (!bacmp(bdaddr, BDADDR_ANY)) + return -EBADF; + + if (hci_bdaddr_list_lookup(list, bdaddr, type)) + return -EEXIST; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + bacpy(&entry->bdaddr, bdaddr); + entry->bdaddr_type = type; + + if (peer_irk) + memcpy(entry->peer_irk, peer_irk, 16); + + if (local_irk) + memcpy(entry->local_irk, local_irk, 16); + + list_add(&entry->list, list); + + return 0; +} + int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; @@ -2890,6 +2933,26 @@ int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type) return 0; } +int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr, + u8 type) +{ + struct bdaddr_list_with_irk *entry; + + if (!bacmp(bdaddr, BDADDR_ANY)) { + hci_bdaddr_list_clear(list); + return 0; + } + + entry = hci_bdaddr_list_lookup_with_irk(list, bdaddr, type); + if (!entry) + return -ENOENT; + + list_del(&entry->list); + kfree(entry); + + return 0; +} + /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) @@ -3084,6 +3147,8 @@ struct hci_dev *hci_alloc_dev(void) hdev->le_max_tx_time = 0x0148; hdev->le_max_rx_len = 0x001b; hdev->le_max_rx_time = 0x0148; + hdev->le_max_key_size = SMP_MAX_ENC_KEY_SIZE; + hdev->le_min_key_size = SMP_MIN_ENC_KEY_SIZE; hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index f12555f23a49..ef9928d7b4fb 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1454,6 +1454,45 @@ static void hci_cc_le_write_def_data_len(struct hci_dev *hdev, hdev->le_def_tx_time = le16_to_cpu(sent->tx_time); } +static void hci_cc_le_add_to_resolv_list(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_cp_le_add_to_resolv_list *sent; + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_RESOLV_LIST); + if (!sent) + return; + + hci_bdaddr_list_add_with_irk(&hdev->le_resolv_list, &sent->bdaddr, + sent->bdaddr_type, sent->peer_irk, + sent->local_irk); +} + +static void hci_cc_le_del_from_resolv_list(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_cp_le_del_from_resolv_list *sent; + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_RESOLV_LIST); + if (!sent) + return; + + hci_bdaddr_list_del_with_irk(&hdev->le_resolv_list, &sent->bdaddr, + sent->bdaddr_type); +} + static void hci_cc_le_clear_resolv_list(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3279,6 +3318,14 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_write_def_data_len(hdev, skb); break; + case HCI_OP_LE_ADD_TO_RESOLV_LIST: + hci_cc_le_add_to_resolv_list(hdev, skb); + break; + + case HCI_OP_LE_DEL_FROM_RESOLV_LIST: + hci_cc_le_del_from_resolv_list(hdev, skb); + break; + case HCI_OP_LE_CLEAR_RESOLV_LIST: hci_cc_le_clear_resolv_list(hdev, skb); break; @@ -4890,31 +4937,27 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, hci_debugfs_create_conn(conn); hci_conn_add_sysfs(conn); - if (!status) { - /* The remote features procedure is defined for master - * role only. So only in case of an initiated connection - * request the remote features. - * - * If the local controller supports slave-initiated features - * exchange, then requesting the remote features in slave - * role is possible. Otherwise just transition into the - * connected state without requesting the remote features. - */ - if (conn->out || - (hdev->le_features[0] & HCI_LE_SLAVE_FEATURES)) { - struct hci_cp_le_read_remote_features cp; + /* The remote features procedure is defined for master + * role only. So only in case of an initiated connection + * request the remote features. + * + * If the local controller supports slave-initiated features + * exchange, then requesting the remote features in slave + * role is possible. Otherwise just transition into the + * connected state without requesting the remote features. + */ + if (conn->out || + (hdev->le_features[0] & HCI_LE_SLAVE_FEATURES)) { + struct hci_cp_le_read_remote_features cp; - cp.handle = __cpu_to_le16(conn->handle); + cp.handle = __cpu_to_le16(conn->handle); - hci_send_cmd(hdev, HCI_OP_LE_READ_REMOTE_FEATURES, - sizeof(cp), &cp); + hci_send_cmd(hdev, HCI_OP_LE_READ_REMOTE_FEATURES, + sizeof(cp), &cp); - hci_conn_hold(conn); - } else { - conn->state = BT_CONNECTED; - hci_connect_cfm(conn, status); - } + hci_conn_hold(conn); } else { + conn->state = BT_CONNECTED; hci_connect_cfm(conn, status); } diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 253975cce943..a442e21f3894 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -649,7 +649,7 @@ static void hidp_process_transmit(struct hidp_session *session, } static int hidp_setup_input(struct hidp_session *session, - struct hidp_connadd_req *req) + const struct hidp_connadd_req *req) { struct input_dev *input; int i; @@ -748,7 +748,7 @@ EXPORT_SYMBOL_GPL(hidp_hid_driver); /* This function sets up the hid device. It does not add it to the HID system. That is done in hidp_add_connection(). */ static int hidp_setup_hid(struct hidp_session *session, - struct hidp_connadd_req *req) + const struct hidp_connadd_req *req) { struct hid_device *hid; int err; @@ -807,7 +807,7 @@ fault: /* initialize session devices */ static int hidp_session_dev_init(struct hidp_session *session, - struct hidp_connadd_req *req) + const struct hidp_connadd_req *req) { int ret; @@ -906,7 +906,7 @@ static void hidp_session_dev_work(struct work_struct *work) static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr, struct socket *ctrl_sock, struct socket *intr_sock, - struct hidp_connadd_req *req, + const struct hidp_connadd_req *req, struct l2cap_conn *conn) { struct hidp_session *session; @@ -1074,6 +1074,10 @@ static int hidp_session_start_sync(struct hidp_session *session) static void hidp_session_terminate(struct hidp_session *session) { atomic_inc(&session->terminate); + /* + * See the comment preceding the call to wait_woken() + * in hidp_session_run(). + */ wake_up_interruptible(&hidp_session_wq); } @@ -1193,8 +1197,6 @@ static void hidp_session_run(struct hidp_session *session) * thread is woken up by ->sk_state_changed(). */ - /* Ensure session->terminate is updated */ - smp_mb__before_atomic(); if (atomic_read(&session->terminate)) break; @@ -1228,14 +1230,15 @@ static void hidp_session_run(struct hidp_session *session) hidp_process_transmit(session, &session->ctrl_transmit, session->ctrl_sock); + /* + * wait_woken() performs the necessary memory barriers + * for us; see the header comment for this primitive. + */ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&hidp_session_wq, &wait); atomic_inc(&session->terminate); - - /* Ensure session->terminate is updated */ - smp_mb__after_atomic(); } static int hidp_session_wake_function(wait_queue_entry_t *wait, @@ -1335,7 +1338,7 @@ static int hidp_verify_sockets(struct socket *ctrl_sock, return 0; } -int hidp_connection_add(struct hidp_connadd_req *req, +int hidp_connection_add(const struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock) { diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h index 8798492a6e99..6ef88d0a1919 100644 --- a/net/bluetooth/hidp/hidp.h +++ b/net/bluetooth/hidp/hidp.h @@ -122,7 +122,7 @@ struct hidp_connlist_req { struct hidp_conninfo __user *ci; }; -int hidp_connection_add(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock); +int hidp_connection_add(const struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock); int hidp_connection_del(struct hidp_conndel_req *req); int hidp_get_connlist(struct hidp_connlist_req *req); int hidp_get_conninfo(struct hidp_conninfo *ci); diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 1eaac01f85de..9f85a1943be9 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -46,9 +46,8 @@ static int hidp_sock_release(struct socket *sock) return 0; } -static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +static int do_hidp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp) { - void __user *argp = (void __user *) arg; struct hidp_connadd_req ca; struct hidp_conndel_req cd; struct hidp_connlist_req cl; @@ -57,7 +56,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long struct socket *isock; int err; - BT_DBG("cmd %x arg %lx", cmd, arg); + BT_DBG("cmd %x arg %p", cmd, argp); switch (cmd) { case HIDPCONNADD: @@ -122,6 +121,11 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return -EINVAL; } +static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return do_hidp_sock_ioctl(sock, cmd, (void __user *)arg); +} + #ifdef CONFIG_COMPAT struct compat_hidp_connadd_req { int ctrl_sock; /* Connected control socket */ @@ -141,13 +145,15 @@ struct compat_hidp_connadd_req { static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + void __user *argp = compat_ptr(arg); + int err; + if (cmd == HIDPGETCONNLIST) { struct hidp_connlist_req cl; + u32 __user *p = argp; u32 uci; - int err; - if (get_user(cl.cnum, (u32 __user *) arg) || - get_user(uci, (u32 __user *) (arg + 4))) + if (get_user(cl.cnum, p) || get_user(uci, p + 1)) return -EFAULT; cl.ci = compat_ptr(uci); @@ -157,39 +163,54 @@ static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne err = hidp_get_connlist(&cl); - if (!err && put_user(cl.cnum, (u32 __user *) arg)) + if (!err && put_user(cl.cnum, p)) err = -EFAULT; return err; } else if (cmd == HIDPCONNADD) { - struct compat_hidp_connadd_req ca; - struct hidp_connadd_req __user *uca; + struct compat_hidp_connadd_req ca32; + struct hidp_connadd_req ca; + struct socket *csock; + struct socket *isock; - uca = compat_alloc_user_space(sizeof(*uca)); + if (!capable(CAP_NET_ADMIN)) + return -EPERM; - if (copy_from_user(&ca, (void __user *) arg, sizeof(ca))) + if (copy_from_user(&ca32, (void __user *) arg, sizeof(ca32))) return -EFAULT; - if (put_user(ca.ctrl_sock, &uca->ctrl_sock) || - put_user(ca.intr_sock, &uca->intr_sock) || - put_user(ca.parser, &uca->parser) || - put_user(ca.rd_size, &uca->rd_size) || - put_user(compat_ptr(ca.rd_data), &uca->rd_data) || - put_user(ca.country, &uca->country) || - put_user(ca.subclass, &uca->subclass) || - put_user(ca.vendor, &uca->vendor) || - put_user(ca.product, &uca->product) || - put_user(ca.version, &uca->version) || - put_user(ca.flags, &uca->flags) || - put_user(ca.idle_to, &uca->idle_to) || - copy_to_user(&uca->name[0], &ca.name[0], 128)) - return -EFAULT; + ca.ctrl_sock = ca32.ctrl_sock; + ca.intr_sock = ca32.intr_sock; + ca.parser = ca32.parser; + ca.rd_size = ca32.rd_size; + ca.rd_data = compat_ptr(ca32.rd_data); + ca.country = ca32.country; + ca.subclass = ca32.subclass; + ca.vendor = ca32.vendor; + ca.product = ca32.product; + ca.version = ca32.version; + ca.flags = ca32.flags; + ca.idle_to = ca32.idle_to; + memcpy(ca.name, ca32.name, 128); + + csock = sockfd_lookup(ca.ctrl_sock, &err); + if (!csock) + return err; - arg = (unsigned long) uca; + isock = sockfd_lookup(ca.intr_sock, &err); + if (!isock) { + sockfd_put(csock); + return err; + } - /* Fall through. We don't actually write back any _changes_ - to the structure anyway, so there's no need to copy back - into the original compat version */ + err = hidp_connection_add(&ca, csock, isock); + if (!err && copy_to_user(argp, &ca32, sizeof(ca32))) + err = -EFAULT; + + sockfd_put(csock); + sockfd_put(isock); + + return err; } return hidp_sock_ioctl(sock, cmd, arg); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index d17a4736e47c..2146e0f3b6f8 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -51,9 +51,6 @@ static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN | L2CAP_FEAT_UCD; static LIST_HEAD(chan_list); static DEFINE_RWLOCK(chan_list_lock); -static u16 le_max_credits = L2CAP_LE_MAX_CREDITS; -static u16 le_default_mps = L2CAP_LE_DEFAULT_MPS; - static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, u8 code, u8 ident, u16 dlen, void *data); static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, @@ -519,8 +516,10 @@ static void l2cap_le_flowctl_init(struct l2cap_chan *chan) chan->sdu_last_frag = NULL; chan->sdu_len = 0; chan->tx_credits = 0; - chan->rx_credits = le_max_credits; - chan->mps = min_t(u16, chan->imtu, le_default_mps); + /* Derive MPS from connection MTU to stop HCI fragmentation */ + chan->mps = min_t(u16, chan->imtu, chan->conn->mtu - L2CAP_HDR_SIZE); + /* Give enough credits for a full packet */ + chan->rx_credits = (chan->imtu / chan->mps) + 1; skb_queue_head_init(&chan->tx_q); } @@ -681,9 +680,9 @@ static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan) u16 result; if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) - result = L2CAP_CR_AUTHORIZATION; + result = L2CAP_CR_LE_AUTHORIZATION; else - result = L2CAP_CR_BAD_PSM; + result = L2CAP_CR_LE_BAD_PSM; l2cap_state_change(chan, BT_DISCONN); @@ -1282,6 +1281,8 @@ static void l2cap_le_connect(struct l2cap_chan *chan) if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags)) return; + l2cap_le_flowctl_init(chan); + req.psm = chan->psm; req.scid = cpu_to_le16(chan->scid); req.mtu = cpu_to_le16(chan->imtu); @@ -3669,7 +3670,7 @@ void __l2cap_le_connect_rsp_defer(struct l2cap_chan *chan) rsp.mtu = cpu_to_le16(chan->imtu); rsp.mps = cpu_to_le16(chan->mps); rsp.credits = cpu_to_le16(chan->rx_credits); - rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); + rsp.result = cpu_to_le16(L2CAP_CR_LE_SUCCESS); l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_RSP, sizeof(rsp), &rsp); @@ -3815,9 +3816,17 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn, result = L2CAP_CR_NO_MEM; + /* Check for valid dynamic CID range (as per Erratum 3253) */ + if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_DYN_END) { + result = L2CAP_CR_INVALID_SCID; + goto response; + } + /* Check if we already have channel with that dcid */ - if (__l2cap_get_chan_by_dcid(conn, scid)) + if (__l2cap_get_chan_by_dcid(conn, scid)) { + result = L2CAP_CR_SCID_IN_USE; goto response; + } chan = pchan->ops->new_connection(pchan); if (!chan) @@ -5279,7 +5288,7 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, credits = __le16_to_cpu(rsp->credits); result = __le16_to_cpu(rsp->result); - if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23 || + if (result == L2CAP_CR_LE_SUCCESS && (mtu < 23 || mps < 23 || dcid < L2CAP_CID_DYN_START || dcid > L2CAP_CID_LE_DYN_END)) return -EPROTO; @@ -5300,7 +5309,7 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, l2cap_chan_lock(chan); switch (result) { - case L2CAP_CR_SUCCESS: + case L2CAP_CR_LE_SUCCESS: if (__l2cap_get_chan_by_dcid(conn, dcid)) { err = -EBADSLT; break; @@ -5314,8 +5323,8 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, l2cap_chan_ready(chan); break; - case L2CAP_CR_AUTHENTICATION: - case L2CAP_CR_ENCRYPTION: + case L2CAP_CR_LE_AUTHENTICATION: + case L2CAP_CR_LE_ENCRYPTION: /* If we already have MITM protection we can't do * anything. */ @@ -5458,7 +5467,7 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src, &conn->hcon->dst, LE_LINK); if (!pchan) { - result = L2CAP_CR_BAD_PSM; + result = L2CAP_CR_LE_BAD_PSM; chan = NULL; goto response; } @@ -5468,33 +5477,31 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, if (!smp_sufficient_security(conn->hcon, pchan->sec_level, SMP_ALLOW_STK)) { - result = L2CAP_CR_AUTHENTICATION; + result = L2CAP_CR_LE_AUTHENTICATION; chan = NULL; goto response_unlock; } /* Check for valid dynamic CID range */ if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) { - result = L2CAP_CR_INVALID_SCID; + result = L2CAP_CR_LE_INVALID_SCID; chan = NULL; goto response_unlock; } /* Check if we already have channel with that dcid */ if (__l2cap_get_chan_by_dcid(conn, scid)) { - result = L2CAP_CR_SCID_IN_USE; + result = L2CAP_CR_LE_SCID_IN_USE; chan = NULL; goto response_unlock; } chan = pchan->ops->new_connection(pchan); if (!chan) { - result = L2CAP_CR_NO_MEM; + result = L2CAP_CR_LE_NO_MEM; goto response_unlock; } - l2cap_le_flowctl_init(chan); - bacpy(&chan->src, &conn->hcon->src); bacpy(&chan->dst, &conn->hcon->dst); chan->src_type = bdaddr_src_type(conn->hcon); @@ -5506,6 +5513,9 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, chan->tx_credits = __le16_to_cpu(req->credits); __l2cap_chan_add(conn, chan); + + l2cap_le_flowctl_init(chan); + dcid = chan->scid; credits = chan->rx_credits; @@ -5524,7 +5534,7 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, chan->ops->defer(chan); } else { l2cap_chan_ready(chan); - result = L2CAP_CR_SUCCESS; + result = L2CAP_CR_LE_SUCCESS; } response_unlock: @@ -6699,13 +6709,10 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan) struct l2cap_le_credits pkt; u16 return_credits; - /* We return more credits to the sender only after the amount of - * credits falls below half of the initial amount. - */ - if (chan->rx_credits >= (le_max_credits + 1) / 2) - return; + return_credits = ((chan->imtu / chan->mps) + 1) - chan->rx_credits; - return_credits = le_max_credits - chan->rx_credits; + if (!return_credits) + return; BT_DBG("chan %p returning %u credits to sender", chan, return_credits); @@ -6719,6 +6726,21 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan) l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CREDITS, sizeof(pkt), &pkt); } +static int l2cap_le_recv(struct l2cap_chan *chan, struct sk_buff *skb) +{ + int err; + + BT_DBG("SDU reassemble complete: chan %p skb->len %u", chan, skb->len); + + /* Wait recv to confirm reception before updating the credits */ + err = chan->ops->recv(chan, skb); + + /* Update credits whenever an SDU is received */ + l2cap_chan_le_send_credits(chan); + + return err; +} + static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) { int err; @@ -6737,7 +6759,11 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) chan->rx_credits--; BT_DBG("rx_credits %u -> %u", chan->rx_credits + 1, chan->rx_credits); - l2cap_chan_le_send_credits(chan); + /* Update if remote had run out of credits, this should only happens + * if the remote is not using the entire MPS. + */ + if (!chan->rx_credits) + l2cap_chan_le_send_credits(chan); err = 0; @@ -6763,12 +6789,22 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) } if (skb->len == sdu_len) - return chan->ops->recv(chan, skb); + return l2cap_le_recv(chan, skb); chan->sdu = skb; chan->sdu_len = sdu_len; chan->sdu_last_frag = skb; + /* Detect if remote is not able to use the selected MPS */ + if (skb->len + L2CAP_SDULEN_SIZE < chan->mps) { + u16 mps_len = skb->len + L2CAP_SDULEN_SIZE; + + /* Adjust the number of credits */ + BT_DBG("chan->mps %u -> %u", chan->mps, mps_len); + chan->mps = mps_len; + l2cap_chan_le_send_credits(chan); + } + return 0; } @@ -6785,7 +6821,7 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) skb = NULL; if (chan->sdu->len == chan->sdu_len) { - err = chan->ops->recv(chan, chan->sdu); + err = l2cap_le_recv(chan, chan->sdu); if (!err) { chan->sdu = NULL; chan->sdu_last_frag = NULL; @@ -7102,7 +7138,6 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, case L2CAP_MODE_BASIC: break; case L2CAP_MODE_LE_FLOWCTL: - l2cap_le_flowctl_init(chan); break; case L2CAP_MODE_ERTM: case L2CAP_MODE_STREAMING: @@ -7645,11 +7680,6 @@ int __init l2cap_init(void) l2cap_debugfs = debugfs_create_file("l2cap", 0444, bt_debugfs, NULL, &l2cap_debugfs_fops); - debugfs_create_u16("l2cap_le_max_credits", 0644, bt_debugfs, - &le_max_credits); - debugfs_create_u16("l2cap_le_default_mps", 0644, bt_debugfs, - &le_default_mps); - return 0; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 3bdc8f3ca259..ccce954f8146 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2434,9 +2434,8 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, /* LE address type */ addr_type = le_addr_type(cp->addr.type); - hci_remove_irk(hdev, &cp->addr.bdaddr, addr_type); - - err = hci_remove_ltk(hdev, &cp->addr.bdaddr, addr_type); + /* Abort any ongoing SMP pairing. Removes ltk and irk if they exist. */ + err = smp_cancel_and_remove_pairing(hdev, &cp->addr.bdaddr, addr_type); if (err < 0) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, MGMT_STATUS_NOT_PAIRED, &rp, @@ -2450,8 +2449,6 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, goto done; } - /* Abort any ongoing SMP pairing */ - smp_cancel_pairing(conn); /* Defer clearing up the connection parameters until closing to * give a chance of keeping them if a repairing happens. diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 5e44d842cc5d..0c7d31c6c18c 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -839,18 +839,6 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned l BT_DBG("TIOCMIWAIT"); break; - case TIOCGSERIAL: - BT_ERR("TIOCGSERIAL is not supported"); - return -ENOIOCTLCMD; - - case TIOCSSERIAL: - BT_ERR("TIOCSSERIAL is not supported"); - return -ENOIOCTLCMD; - - case TIOCSERGSTRUCT: - BT_ERR("TIOCSERGSTRUCT is not supported"); - return -ENOIOCTLCMD; - case TIOCSERGETLSR: BT_ERR("TIOCSERGETLSR is not supported"); return -ENOIOCTLCMD; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 3a7b0773536b..c822e626761b 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -88,9 +88,6 @@ struct smp_dev { u8 local_rand[16]; bool debug_key; - u8 min_key_size; - u8 max_key_size; - struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; @@ -625,7 +622,7 @@ static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) memset(&msg, 0, sizeof(msg)); - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iv, 2, 1 + len); + iov_iter_kvec(&msg.msg_iter, WRITE, iv, 2, 1 + len); l2cap_chan_send(chan, &msg, 1 + len); @@ -720,7 +717,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn, if (rsp == NULL) { req->io_capability = conn->hcon->io_capability; req->oob_flag = oob_flag; - req->max_key_size = SMP_DEV(hdev)->max_key_size; + req->max_key_size = hdev->le_max_key_size; req->init_key_dist = local_dist; req->resp_key_dist = remote_dist; req->auth_req = (authreq & AUTH_REQ_MASK(hdev)); @@ -731,7 +728,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn, rsp->io_capability = conn->hcon->io_capability; rsp->oob_flag = oob_flag; - rsp->max_key_size = SMP_DEV(hdev)->max_key_size; + rsp->max_key_size = hdev->le_max_key_size; rsp->init_key_dist = req->init_key_dist & remote_dist; rsp->resp_key_dist = req->resp_key_dist & local_dist; rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev)); @@ -745,7 +742,7 @@ static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp = chan->data; - if (max_key_size > SMP_DEV(hdev)->max_key_size || + if (max_key_size > hdev->le_max_key_size || max_key_size < SMP_MIN_ENC_KEY_SIZE) return SMP_ENC_KEY_SIZE; @@ -2422,30 +2419,51 @@ unlock: return ret; } -void smp_cancel_pairing(struct hci_conn *hcon) +int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 addr_type) { - struct l2cap_conn *conn = hcon->l2cap_data; + struct hci_conn *hcon; + struct l2cap_conn *conn; struct l2cap_chan *chan; struct smp_chan *smp; + int err; + + err = hci_remove_ltk(hdev, bdaddr, addr_type); + hci_remove_irk(hdev, bdaddr, addr_type); + hcon = hci_conn_hash_lookup_le(hdev, bdaddr, addr_type); + if (!hcon) + goto done; + + conn = hcon->l2cap_data; if (!conn) - return; + goto done; chan = conn->smp; if (!chan) - return; + goto done; l2cap_chan_lock(chan); smp = chan->data; if (smp) { + /* Set keys to NULL to make sure smp_failure() does not try to + * remove and free already invalidated rcu list entries. */ + smp->ltk = NULL; + smp->slave_ltk = NULL; + smp->remote_irk = NULL; + if (test_bit(SMP_FLAG_COMPLETE, &smp->flags)) smp_failure(conn, 0); else smp_failure(conn, SMP_UNSPECIFIED); + err = 0; } l2cap_chan_unlock(chan); + +done: + return err; } static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) @@ -3243,8 +3261,6 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) smp->tfm_aes = tfm_aes; smp->tfm_cmac = tfm_cmac; smp->tfm_ecdh = tfm_ecdh; - smp->min_key_size = SMP_MIN_ENC_KEY_SIZE; - smp->max_key_size = SMP_MAX_ENC_KEY_SIZE; create_chan: chan = l2cap_chan_create(); @@ -3370,7 +3386,7 @@ static ssize_t le_min_key_size_read(struct file *file, struct hci_dev *hdev = file->private_data; char buf[4]; - snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->min_key_size); + snprintf(buf, sizeof(buf), "%2u\n", hdev->le_min_key_size); return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); } @@ -3391,11 +3407,11 @@ static ssize_t le_min_key_size_write(struct file *file, sscanf(buf, "%hhu", &key_size); - if (key_size > SMP_DEV(hdev)->max_key_size || + if (key_size > hdev->le_max_key_size || key_size < SMP_MIN_ENC_KEY_SIZE) return -EINVAL; - SMP_DEV(hdev)->min_key_size = key_size; + hdev->le_min_key_size = key_size; return count; } @@ -3414,7 +3430,7 @@ static ssize_t le_max_key_size_read(struct file *file, struct hci_dev *hdev = file->private_data; char buf[4]; - snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->max_key_size); + snprintf(buf, sizeof(buf), "%2u\n", hdev->le_max_key_size); return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); } @@ -3436,10 +3452,10 @@ static ssize_t le_max_key_size_write(struct file *file, sscanf(buf, "%hhu", &key_size); if (key_size > SMP_MAX_ENC_KEY_SIZE || - key_size < SMP_DEV(hdev)->min_key_size) + key_size < hdev->le_min_key_size) return -EINVAL; - SMP_DEV(hdev)->max_key_size = key_size; + hdev->le_max_key_size = key_size; return count; } diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index 0ff6247eaa6c..121edadd5f8d 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -181,7 +181,8 @@ enum smp_key_pref { }; /* SMP Commands */ -void smp_cancel_pairing(struct hci_conn *hcon); +int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 addr_type); bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level, enum smp_key_pref key_pref); int smp_conn_security(struct hci_conn *hcon, __u8 sec_level); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index f4078830ea50..c89c22c49015 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -10,9 +10,11 @@ #include #include #include +#include +#include static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, - struct bpf_cgroup_storage *storage) + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { u32 ret; @@ -28,13 +30,20 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) { - struct bpf_cgroup_storage *storage = NULL; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 }; + enum bpf_cgroup_storage_type stype; u64 time_start, time_spent = 0; u32 ret = 0, i; - storage = bpf_cgroup_storage_alloc(prog); - if (IS_ERR(storage)) - return PTR_ERR(storage); + for_each_cgroup_storage_type(stype) { + storage[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(storage[stype])) { + storage[stype] = NULL; + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); + return -ENOMEM; + } + } if (!repeat) repeat = 1; @@ -53,7 +62,8 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) do_div(time_spent, repeat); *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; - bpf_cgroup_storage_free(storage); + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); return ret; } @@ -107,6 +117,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, u32 retval, duration; int hh_len = ETH_HLEN; struct sk_buff *skb; + struct sock *sk; void *data; int ret; @@ -129,11 +140,21 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, break; } + sk = kzalloc(sizeof(struct sock), GFP_USER); + if (!sk) { + kfree(data); + return -ENOMEM; + } + sock_net_set(sk, current->nsproxy->net_ns); + sock_init_data(NULL, sk); + skb = build_skb(data, 0); if (!skb) { kfree(data); + kfree(sk); return -ENOMEM; } + skb->sk = sk; skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); __skb_put(skb, size); @@ -151,6 +172,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (pskb_expand_head(skb, nhead, 0, GFP_USER)) { kfree_skb(skb); + kfree(sk); return -ENOMEM; } } @@ -163,6 +185,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, size = skb_headlen(skb); ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration); kfree_skb(skb); + kfree(sk); return ret; } diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index f0fc182d3db7..7acfc83087d5 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -23,9 +23,11 @@ static void shutdown_umh(struct umh_info *info) if (!info->pid) return; - tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID); - if (tsk) + tsk = get_pid_task(find_vpid(info->pid), PIDTYPE_PID); + if (tsk) { force_sig(SIGKILL, tsk); + put_task_struct(tsk); + } fput(info->pipe_to_umh); fput(info->pipe_from_umh); info->pid = 0; @@ -59,7 +61,7 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, req.is_set = is_set; req.pid = current->pid; req.cmd = optname; - req.addr = (long)optval; + req.addr = (long __force __user)optval; req.len = optlen; mutex_lock(&bpfilter_lock); if (!info.pid) @@ -90,6 +92,7 @@ static int __init load_umh(void) int err; /* fork usermode process */ + info.cmdline = "bpfilter_umh"; err = fork_usermode_blob(&bpfilter_umh_start, &bpfilter_umh_end - &bpfilter_umh_start, &info); @@ -98,7 +101,7 @@ static int __init load_umh(void) pr_info("Loaded bpfilter_umh pid %d\n", info.pid); /* health check that usermode process started correctly */ - if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) { + if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) { stop_umh(); return -EFAULT; } diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig index aa0d3b2f1bb7..3625d6ade45c 100644 --- a/net/bridge/Kconfig +++ b/net/bridge/Kconfig @@ -17,7 +17,7 @@ config BRIDGE other third party bridge products. In order to use the Ethernet bridge, you'll need the bridge - configuration tools; see + configuration tools; see for location. Please read the Bridge mini-HOWTO for more information. diff --git a/net/bridge/br.c b/net/bridge/br.c index b0a0b82e2d91..360ad66c21e9 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -151,7 +151,7 @@ static int br_switchdev_event(struct notifier_block *unused, break; } br_fdb_offloaded_set(br, p, fdb_info->addr, - fdb_info->vid); + fdb_info->vid, true); break; case SWITCHDEV_FDB_DEL_TO_BRIDGE: fdb_info = ptr; @@ -163,7 +163,7 @@ static int br_switchdev_event(struct notifier_block *unused, case SWITCHDEV_FDB_OFFLOADED: fdb_info = ptr; br_fdb_offloaded_set(br, p, fdb_info->addr, - fdb_info->vid); + fdb_info->vid, fdb_info->offloaded); break; } @@ -175,6 +175,22 @@ static struct notifier_block br_switchdev_notifier = { .notifier_call = br_switchdev_event, }; +void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) +{ + bool cur = !!br_opt_get(br, opt); + + br_debug(br, "toggle option: %d state: %d -> %d\n", + opt, cur, on); + + if (cur == on) + return; + + if (on) + set_bit(opt, &br->options); + else + clear_bit(opt, &br->options); +} + static void __net_exit br_net_exit(struct net *net) { struct net_device *dev; diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index 2cf7716254be..6b78e6351719 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -39,7 +39,7 @@ void br_recalculate_neigh_suppress_enabled(struct net_bridge *br) } } - br->neigh_suppress_enabled = neigh_suppress; + br_opt_toggle(br, BROPT_NEIGH_SUPPRESS_ENABLED, neigh_suppress); } #if IS_ENABLED(CONFIG_INET) @@ -155,7 +155,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, ipv4_is_multicast(tip)) return; - if (br->neigh_suppress_enabled) { + if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) { if (p && (p->flags & BR_NEIGH_SUPPRESS)) return; if (ipv4_is_zeronet(sip) || sip == tip) { @@ -175,7 +175,8 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, return; } - if (br->neigh_suppress_enabled && br_is_local_ip(vlandev, tip)) { + if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && + br_is_local_ip(vlandev, tip)) { /* its our local ip, so don't proxy reply * and don't forward to neigh suppress ports */ @@ -213,7 +214,8 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, /* If we have replied or as long as we know the * mac, indicate to arp replied */ - if (replied || br->neigh_suppress_enabled) + if (replied || + br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; } @@ -311,7 +313,7 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, /* Neighbor Advertisement */ memset(na, 0, sizeof(*na) + na_olen); na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; - na->icmph.icmp6_router = 0; /* XXX: should be 1 ? */ + na->icmph.icmp6_router = (n->flags & NTF_ROUTER) ? 1 : 0; na->icmph.icmp6_override = 1; na->icmph.icmp6_solicited = 1; na->target = ns->target; @@ -460,7 +462,8 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, * mac, indicate to NEIGH_SUPPRESS ports that we * have replied */ - if (replied || br->neigh_suppress_enabled) + if (replied || + br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; } neigh_release(n); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index e682a668ce57..c6abf927f0c9 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -67,11 +67,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) if (IS_ENABLED(CONFIG_INET) && (eth->h_proto == htons(ETH_P_ARP) || eth->h_proto == htons(ETH_P_RARP)) && - br->neigh_suppress_enabled) { + br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) { br_do_proxy_suppress_arp(skb, br, vid, NULL); } else if (IS_ENABLED(CONFIG_IPV6) && skb->protocol == htons(ETH_P_IPV6) && - br->neigh_suppress_enabled && + br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)) && ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { @@ -228,7 +228,7 @@ static int br_change_mtu(struct net_device *dev, int new_mtu) dev->mtu = new_mtu; /* this flag will be cleared if the MTU was automatically adjusted */ - br->mtu_set_by_user = true; + br_opt_toggle(br, BROPT_MTU_SET_BY_USER, true); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* remember the MTU in the rtable for PMTU */ dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu); @@ -344,7 +344,7 @@ void br_netpoll_disable(struct net_bridge_port *p) p->np = NULL; - __netpoll_free_async(np); + __netpoll_free(np); } #endif diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 502f66349530..e56ba3912a90 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -504,6 +504,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, fdb->added_by_user = 0; fdb->added_by_external_learn = 0; fdb->offloaded = 0; + fdb->is_sticky = 0; fdb->updated = fdb->used = jiffies; if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, &fdb->rhnode, @@ -584,7 +585,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, unsigned long now = jiffies; /* fastpath: update of existing entry */ - if (unlikely(source != fdb->dst)) { + if (unlikely(source != fdb->dst && !fdb->is_sticky)) { fdb->dst = source; fdb_modified = true; /* Take over HW learned entry */ @@ -656,6 +657,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_flags |= NTF_OFFLOADED; if (fdb->added_by_external_learn) ndm->ndm_flags |= NTF_EXT_LEARNED; + if (fdb->is_sticky) + ndm->ndm_flags |= NTF_STICKY; if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) goto nla_put_failure; @@ -772,8 +775,10 @@ skip: /* Update (create or replace) forwarding database entry */ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, - const __u8 *addr, __u16 state, __u16 flags, __u16 vid) + const u8 *addr, u16 state, u16 flags, u16 vid, + u8 ndm_flags) { + u8 is_sticky = !!(ndm_flags & NTF_STICKY); struct net_bridge_fdb_entry *fdb; bool modified = false; @@ -789,6 +794,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, return -EINVAL; } + if (is_sticky && (state & NUD_PERMANENT)) + return -EINVAL; + fdb = br_fdb_find(br, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) @@ -832,6 +840,12 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, modified = true; } + + if (is_sticky != fdb->is_sticky) { + fdb->is_sticky = is_sticky; + modified = true; + } + fdb->added_by_user = 1; fdb->used = jiffies; @@ -865,7 +879,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, } else { spin_lock_bh(&br->hash_lock); err = fdb_add_entry(br, p, addr, ndm->ndm_state, - nlh_flags, vid); + nlh_flags, vid, ndm->ndm_flags); spin_unlock_bh(&br->hash_lock); } @@ -1138,7 +1152,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, } void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, - const unsigned char *addr, u16 vid) + const unsigned char *addr, u16 vid, bool offloaded) { struct net_bridge_fdb_entry *fdb; @@ -1146,7 +1160,7 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, fdb = br_fdb_find(br, addr, vid); if (fdb) - fdb->offloaded = 1; + fdb->offloaded = offloaded; spin_unlock_bh(&br->hash_lock); } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 0363f1bdc401..9b46d2dc4c22 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -394,8 +394,7 @@ static int find_portno(struct net_bridge *br) struct net_bridge_port *p; unsigned long *inuse; - inuse = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long), - GFP_KERNEL); + inuse = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL); if (!inuse) return -ENOMEM; @@ -404,7 +403,7 @@ static int find_portno(struct net_bridge *br) set_bit(p->port_no, inuse); } index = find_first_zero_bit(inuse, BR_MAX_PORTS); - kfree(inuse); + bitmap_free(inuse); return (index >= BR_MAX_PORTS) ? -EXFULL : index; } @@ -509,14 +508,14 @@ void br_mtu_auto_adjust(struct net_bridge *br) ASSERT_RTNL(); /* if the bridge MTU was manually configured don't mess with it */ - if (br->mtu_set_by_user) + if (br_opt_get(br, BROPT_MTU_SET_BY_USER)) return; /* change to the minimum MTU and clear the flag which was set by * the bridge ndo_change_mtu callback */ dev_set_mtu(br->dev, br_mtu_min(br)); - br->mtu_set_by_user = false; + br_opt_toggle(br, BROPT_MTU_SET_BY_USER, false); } static void br_set_gso_limits(struct net_bridge *br) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 72074276c088..3ddca11f44c2 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -122,7 +122,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb br_do_proxy_suppress_arp(skb, br, vid, p); } else if (IS_ENABLED(CONFIG_IPV6) && skb->protocol == htons(ETH_P_IPV6) && - br->neigh_suppress_enabled && + br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)) && ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 6d9f48bd374a..a7ea2d431714 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -84,7 +84,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, int i, err = 0; int idx = 0, s_idx = cb->args[1]; - if (br->multicast_disabled) + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return 0; mdb = rcu_dereference(br->mdb); @@ -162,6 +162,29 @@ out: return err; } +static int br_mdb_valid_dump_req(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct br_port_msg *bpm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for mdb dump request"); + return -EINVAL; + } + + bpm = nlmsg_data(nlh); + if (bpm->ifindex) { + NL_SET_ERR_MSG_MOD(extack, "Filtering by device index is not supported for mdb dump request"); + return -EINVAL; + } + if (nlmsg_attrlen(nlh, sizeof(*bpm))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in mdb dump request"); + return -EINVAL; + } + + return 0; +} + static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net_device *dev; @@ -169,6 +192,13 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) struct nlmsghdr *nlh = NULL; int idx = 0, s_idx; + if (cb->strict_check) { + int err = br_mdb_valid_dump_req(cb->nlh, cb->extack); + + if (err < 0) + return err; + } + s_idx = cb->args[0]; rcu_read_lock(); @@ -598,7 +628,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, struct net_bridge_port *p; int ret; - if (!netif_running(br->dev) || br->multicast_disabled) + if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) return -EINVAL; dev = __dev_get_by_index(net, entry->ifindex); @@ -673,7 +703,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) struct br_ip ip; int err = -EINVAL; - if (!netif_running(br->dev) || br->multicast_disabled) + if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) return -EINVAL; __mdb_entry_to_br_ip(entry, &ip); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 20ed7adcf1cc..6bac0d6b7b94 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -158,7 +158,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb); struct br_ip ip; - if (br->multicast_disabled) + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return NULL; if (BR_INPUT_SKB_CB(skb)->igmp) @@ -411,7 +411,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->protocol = IPPROTO_IGMP; - iph->saddr = br->multicast_query_use_ifaddr ? + iph->saddr = br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR) ? inet_select_addr(br->dev, 0, RT_SCOPE_LINK) : 0; iph->daddr = htonl(INADDR_ALLHOSTS_GROUP); ((u8 *)&iph[1])[0] = IPOPT_RA; @@ -503,11 +503,11 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, &ip6h->saddr)) { kfree_skb(skb); - br->has_ipv6_addr = 0; + br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, false); return NULL; } - br->has_ipv6_addr = 1; + br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); hopopt = (u8 *)(ip6h + 1); @@ -628,7 +628,7 @@ static struct net_bridge_mdb_entry *br_multicast_get_group( port ? port->dev->name : br->dev->name); err = -E2BIG; disable: - br->multicast_disabled = 1; + br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); goto err; } } @@ -894,7 +894,7 @@ static void br_multicast_querier_expired(struct net_bridge *br, struct bridge_mcast_own_query *query) { spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || br->multicast_disabled) + if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) goto out; br_multicast_start_querier(br, query); @@ -965,8 +965,9 @@ static void br_multicast_send_query(struct net_bridge *br, struct br_ip br_group; unsigned long time; - if (!netif_running(br->dev) || br->multicast_disabled || - !br->multicast_querier) + if (!netif_running(br->dev) || + !br_opt_get(br, BROPT_MULTICAST_ENABLED) || + !br_opt_get(br, BROPT_MULTICAST_QUERIER)) return; memset(&br_group.u, 0, sizeof(br_group.u)); @@ -1036,7 +1037,7 @@ static void br_mc_disabled_update(struct net_device *dev, bool value) .orig_dev = dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, .flags = SWITCHDEV_F_DEFER, - .u.mc_disabled = value, + .u.mc_disabled = !value, }; switchdev_port_attr_set(dev, &attr); @@ -1054,7 +1055,8 @@ int br_multicast_add_port(struct net_bridge_port *port) timer_setup(&port->ip6_own_query.timer, br_ip6_multicast_port_query_expired, 0); #endif - br_mc_disabled_update(port->dev, port->br->multicast_disabled); + br_mc_disabled_update(port->dev, + br_opt_get(port->br, BROPT_MULTICAST_ENABLED)); port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); if (!port->mcast_stats) @@ -1091,7 +1093,7 @@ static void __br_multicast_enable_port(struct net_bridge_port *port) { struct net_bridge *br = port->br; - if (br->multicast_disabled || !netif_running(br->dev)) + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || !netif_running(br->dev)) return; br_multicast_enable(&port->ip4_own_query); @@ -1420,7 +1422,14 @@ static void br_multicast_query_received(struct net_bridge *br, return; br_multicast_update_query_timer(br, query, max_delay); - br_multicast_mark_router(br, port); + + /* Based on RFC4541, section 2.1.1 IGMP Forwarding Rules, + * the arrival port for IGMP Queries where the source address + * is 0.0.0.0 should not be added to router port list. + */ + if ((saddr->proto == htons(ETH_P_IP) && saddr->u.ip4) || + saddr->proto == htons(ETH_P_IPV6)) + br_multicast_mark_router(br, port); } static void br_ip4_multicast_query(struct net_bridge *br, @@ -1634,7 +1643,7 @@ br_multicast_leave_group(struct net_bridge *br, if (timer_pending(&other_query->timer)) goto out; - if (br->multicast_querier) { + if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) { __br_multicast_send_query(br, port, &mp->addr); time = jiffies + br->multicast_last_member_count * @@ -1746,7 +1755,7 @@ static void br_multicast_err_count(const struct net_bridge *br, struct bridge_mcast_stats __percpu *stats; struct bridge_mcast_stats *pstats; - if (!br->multicast_stats_enabled) + if (!br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) return; if (p) @@ -1904,7 +1913,7 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, BR_INPUT_SKB_CB(skb)->igmp = 0; BR_INPUT_SKB_CB(skb)->mrouters_only = 0; - if (br->multicast_disabled) + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return 0; switch (skb->protocol) { @@ -1956,8 +1965,6 @@ void br_multicast_init(struct net_bridge *br) br->hash_max = 512; br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; - br->multicast_querier = 0; - br->multicast_query_use_ifaddr = 0; br->multicast_last_member_count = 2; br->multicast_startup_query_count = 2; @@ -1976,7 +1983,8 @@ void br_multicast_init(struct net_bridge *br) br->ip6_other_query.delay_time = 0; br->ip6_querier.port = NULL; #endif - br->has_ipv6_addr = 1; + br_opt_toggle(br, BROPT_MULTICAST_ENABLED, true); + br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); spin_lock_init(&br->multicast_lock); timer_setup(&br->multicast_router_timer, @@ -1998,7 +2006,7 @@ static void __br_multicast_open(struct net_bridge *br, { query->startup_sent = 0; - if (br->multicast_disabled) + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return; mod_timer(&query->timer, jiffies); @@ -2173,12 +2181,12 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val) int err = 0; spin_lock_bh(&br->multicast_lock); - if (br->multicast_disabled == !val) + if (!!br_opt_get(br, BROPT_MULTICAST_ENABLED) == !!val) goto unlock; - br_mc_disabled_update(br->dev, !val); - br->multicast_disabled = !val; - if (br->multicast_disabled) + br_mc_disabled_update(br->dev, val); + br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val); + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) goto unlock; if (!netif_running(br->dev)) @@ -2189,7 +2197,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val) if (mdb->old) { err = -EEXIST; rollback: - br->multicast_disabled = !!val; + br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); goto unlock; } @@ -2213,7 +2221,7 @@ bool br_multicast_enabled(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); - return !br->multicast_disabled; + return !!br_opt_get(br, BROPT_MULTICAST_ENABLED); } EXPORT_SYMBOL_GPL(br_multicast_enabled); @@ -2236,10 +2244,10 @@ int br_multicast_set_querier(struct net_bridge *br, unsigned long val) val = !!val; spin_lock_bh(&br->multicast_lock); - if (br->multicast_querier == val) + if (br_opt_get(br, BROPT_MULTICAST_QUERIER) == val) goto unlock; - br->multicast_querier = val; + br_opt_toggle(br, BROPT_MULTICAST_QUERIER, !!val); if (!val) goto unlock; @@ -2560,7 +2568,7 @@ void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, struct bridge_mcast_stats __percpu *stats; /* if multicast_disabled is true then igmp type can't be set */ - if (!type || !br->multicast_stats_enabled) + if (!type || !br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) return; if (p) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 6e0dc6bcd32a..b1b5e8516724 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -487,14 +487,15 @@ static unsigned int br_nf_pre_routing(void *priv, br = p->br; if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) { - if (!brnf_call_ip6tables && !br->nf_call_ip6tables) + if (!brnf_call_ip6tables && + !br_opt_get(br, BROPT_NF_CALL_IP6TABLES)) return NF_ACCEPT; nf_bridge_pull_encap_header_rcsum(skb); return br_nf_pre_routing_ipv6(priv, skb, state); } - if (!brnf_call_iptables && !br->nf_call_iptables) + if (!brnf_call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES)) return NF_ACCEPT; if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb)) @@ -636,7 +637,7 @@ static unsigned int br_nf_forward_arp(void *priv, return NF_ACCEPT; br = p->br; - if (!brnf_call_arptables && !br->nf_call_arptables) + if (!brnf_call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES)) return NF_ACCEPT; if (!IS_ARP(skb)) { @@ -835,7 +836,8 @@ static unsigned int ip_sabotage_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) { + if (skb->nf_bridge && !skb->nf_bridge->in_prerouting && + !netif_is_l3_master(skb->dev)) { state->okfn(state->net, state->sk, skb); return NF_STOLEN; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index ec2b58a09f76..3345f1984542 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1034,6 +1034,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 }, [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 }, + [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -1114,6 +1115,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (err) return err; } + + if (data[IFLA_BR_VLAN_STATS_PER_PORT]) { + __u8 per_port = nla_get_u8(data[IFLA_BR_VLAN_STATS_PER_PORT]); + + err = br_vlan_set_stats_per_port(br, per_port); + if (err) + return err; + } #endif if (data[IFLA_BR_GROUP_FWD_MASK]) { @@ -1139,7 +1148,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], spin_lock_bh(&br->lock); memcpy(br->group_addr, new_addr, sizeof(br->group_addr)); spin_unlock_bh(&br->lock); - br->group_addr_set = true; + br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true); br_recalculate_fwd_mask(br); } @@ -1167,7 +1176,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], u8 val; val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]); - br->multicast_query_use_ifaddr = !!val; + br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val); } if (data[IFLA_BR_MCAST_QUERIER]) { @@ -1244,7 +1253,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], __u8 mcast_stats; mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]); - br->multicast_stats_enabled = !!mcast_stats; + br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!mcast_stats); } if (data[IFLA_BR_MCAST_IGMP_VERSION]) { @@ -1271,19 +1280,19 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_NF_CALL_IPTABLES]) { u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IPTABLES]); - br->nf_call_iptables = val ? true : false; + br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val); } if (data[IFLA_BR_NF_CALL_IP6TABLES]) { u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IP6TABLES]); - br->nf_call_ip6tables = val ? true : false; + br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val); } if (data[IFLA_BR_NF_CALL_ARPTABLES]) { u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_ARPTABLES]); - br->nf_call_arptables = val ? true : false; + br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val); } #endif @@ -1327,6 +1336,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_ENABLED */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_PER_PORT */ #endif nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ @@ -1416,17 +1426,22 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) #ifdef CONFIG_BRIDGE_VLAN_FILTERING if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) || nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) || - nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, br->vlan_stats_enabled)) + nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, + br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) || + nla_put_u8(skb, IFLA_BR_VLAN_STATS_PER_PORT, + br_opt_get(br, IFLA_BR_VLAN_STATS_PER_PORT))) return -EMSGSIZE; #endif #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) || - nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, !br->multicast_disabled) || + nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, + br_opt_get(br, BROPT_MULTICAST_ENABLED)) || nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, - br->multicast_query_use_ifaddr) || - nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier) || + br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)) || + nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, + br_opt_get(br, BROPT_MULTICAST_QUERIER)) || nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED, - br->multicast_stats_enabled) || + br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, br->hash_elasticity) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || @@ -1469,11 +1484,11 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_u8(skb, IFLA_BR_NF_CALL_IPTABLES, - br->nf_call_iptables ? 1 : 0) || + br_opt_get(br, BROPT_NF_CALL_IPTABLES) ? 1 : 0) || nla_put_u8(skb, IFLA_BR_NF_CALL_IP6TABLES, - br->nf_call_ip6tables ? 1 : 0) || + br_opt_get(br, BROPT_NF_CALL_IP6TABLES) ? 1 : 0) || nla_put_u8(skb, IFLA_BR_NF_CALL_ARPTABLES, - br->nf_call_arptables ? 1 : 0)) + br_opt_get(br, BROPT_NF_CALL_ARPTABLES) ? 1 : 0)) return -EMSGSIZE; #endif diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 11ed2029985f..2920e06a5403 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -54,14 +54,12 @@ typedef struct bridge_id bridge_id; typedef struct mac_addr mac_addr; typedef __u16 port_id; -struct bridge_id -{ +struct bridge_id { unsigned char prio[2]; unsigned char addr[ETH_ALEN]; }; -struct mac_addr -{ +struct mac_addr { unsigned char addr[ETH_ALEN]; }; @@ -181,6 +179,7 @@ struct net_bridge_fdb_entry { struct hlist_node fdb_node; unsigned char is_local:1, is_static:1, + is_sticky:1, added_by_user:1, added_by_external_learn:1, offloaded:1; @@ -206,8 +205,7 @@ struct net_bridge_port_group { unsigned char eth_addr[ETH_ALEN]; }; -struct net_bridge_mdb_entry -{ +struct net_bridge_mdb_entry { struct hlist_node hlist[2]; struct net_bridge *br; struct net_bridge_port_group __rcu *ports; @@ -217,8 +215,7 @@ struct net_bridge_mdb_entry bool host_joined; }; -struct net_bridge_mdb_htable -{ +struct net_bridge_mdb_htable { struct hlist_head *mhash; struct rcu_head rcu; struct net_bridge_mdb_htable *old; @@ -309,16 +306,32 @@ static inline struct net_bridge_port *br_port_get_rtnl_rcu(const struct net_devi rcu_dereference_rtnl(dev->rx_handler_data) : NULL; } +enum net_bridge_opts { + BROPT_VLAN_ENABLED, + BROPT_VLAN_STATS_ENABLED, + BROPT_NF_CALL_IPTABLES, + BROPT_NF_CALL_IP6TABLES, + BROPT_NF_CALL_ARPTABLES, + BROPT_GROUP_ADDR_SET, + BROPT_MULTICAST_ENABLED, + BROPT_MULTICAST_QUERIER, + BROPT_MULTICAST_QUERY_USE_IFADDR, + BROPT_MULTICAST_STATS_ENABLED, + BROPT_HAS_IPV6_ADDR, + BROPT_NEIGH_SUPPRESS_ENABLED, + BROPT_MTU_SET_BY_USER, + BROPT_VLAN_STATS_PER_PORT, +}; + struct net_bridge { spinlock_t lock; spinlock_t hash_lock; struct list_head port_list; struct net_device *dev; struct pcpu_sw_netstats __percpu *stats; + unsigned long options; /* These fields are accessed on each packet */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING - u8 vlan_enabled; - u8 vlan_stats_enabled; __be16 vlan_proto; u16 default_pvid; struct net_bridge_vlan_group __rcu *vlgrp; @@ -330,9 +343,6 @@ struct net_bridge { struct rtable fake_rtable; struct rt6_info fake_rt6_info; }; - bool nf_call_iptables; - bool nf_call_ip6tables; - bool nf_call_arptables; #endif u16 group_fwd_mask; u16 group_fwd_mask_required; @@ -340,7 +350,6 @@ struct net_bridge { /* STP */ bridge_id designated_root; bridge_id bridge_id; - u32 root_path_cost; unsigned char topology_change; unsigned char topology_change_detected; u16 root_port; @@ -352,9 +361,9 @@ struct net_bridge { unsigned long bridge_hello_time; unsigned long bridge_forward_delay; unsigned long bridge_ageing_time; + u32 root_path_cost; u8 group_addr[ETH_ALEN]; - bool group_addr_set; enum { BR_NO_STP, /* no spanning tree */ @@ -363,13 +372,6 @@ struct net_bridge { } stp_enabled; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - unsigned char multicast_router; - - u8 multicast_disabled:1; - u8 multicast_querier:1; - u8 multicast_query_use_ifaddr:1; - u8 has_ipv6_addr:1; - u8 multicast_stats_enabled:1; u32 hash_elasticity; u32 hash_max; @@ -378,7 +380,11 @@ struct net_bridge { u32 multicast_startup_query_count; u8 multicast_igmp_version; - + u8 multicast_router; +#if IS_ENABLED(CONFIG_IPV6) + u8 multicast_mld_version; +#endif + spinlock_t multicast_lock; unsigned long multicast_last_member_interval; unsigned long multicast_membership_interval; unsigned long multicast_querier_interval; @@ -386,7 +392,6 @@ struct net_bridge { unsigned long multicast_query_response_interval; unsigned long multicast_startup_query_interval; - spinlock_t multicast_lock; struct net_bridge_mdb_htable __rcu *mdb; struct hlist_head router_list; @@ -399,7 +404,6 @@ struct net_bridge { struct bridge_mcast_other_query ip6_other_query; struct bridge_mcast_own_query ip6_own_query; struct bridge_mcast_querier ip6_querier; - u8 multicast_mld_version; #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif @@ -413,8 +417,6 @@ struct net_bridge { #ifdef CONFIG_NET_SWITCHDEV int offload_fwd_mark; #endif - bool neigh_suppress_enabled; - bool mtu_set_by_user; struct hlist_head fdb_list; }; @@ -492,6 +494,14 @@ static inline bool br_vlan_should_use(const struct net_bridge_vlan *v) return true; } +static inline int br_opt_get(const struct net_bridge *br, + enum net_bridge_opts opt) +{ + return test_bit(opt, &br->options); +} + +void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on); + /* br_device.c */ void br_dev_setup(struct net_device *dev); void br_dev_delete(struct net_device *dev, struct list_head *list); @@ -564,7 +574,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool swdev_notify); void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, - const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, bool offloaded); /* br_forward.c */ enum br_pkt_type { @@ -698,8 +708,8 @@ __br_multicast_querier_exists(struct net_bridge *br, { bool own_querier_enabled; - if (br->multicast_querier) { - if (is_ipv6 && !br->has_ipv6_addr) + if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) { + if (is_ipv6 && !br_opt_get(br, BROPT_HAS_IPV6_ADDR)) own_querier_enabled = false; else own_querier_enabled = true; @@ -850,6 +860,7 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int __br_vlan_set_proto(struct net_bridge *br, __be16 proto); int br_vlan_set_proto(struct net_bridge *br, unsigned long val); int br_vlan_set_stats(struct net_bridge *br, unsigned long val); +int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid); diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index d77f807420c4..b993df770675 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -103,7 +103,7 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, static void br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac, u16 vid, struct net_device *dev, - bool added_by_user) + bool added_by_user, bool offloaded) { struct switchdev_notifier_fdb_info info; unsigned long notifier_type; @@ -111,6 +111,7 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac, info.addr = mac; info.vid = vid; info.added_by_user = added_by_user; + info.offloaded = offloaded; notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE; call_switchdev_notifiers(notifier_type, dev, &info.info); } @@ -126,13 +127,15 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr, fdb->key.vlan_id, fdb->dst->dev, - fdb->added_by_user); + fdb->added_by_user, + fdb->offloaded); break; case RTM_NEWNEIGH: br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr, fdb->key.vlan_id, fdb->dst->dev, - fdb->added_by_user); + fdb->added_by_user, + fdb->offloaded); break; } } diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 0318a69888d4..60182bef6341 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -303,7 +303,7 @@ static ssize_t group_addr_store(struct device *d, ether_addr_copy(br->group_addr, new_addr); spin_unlock_bh(&br->lock); - br->group_addr_set = true; + br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true); br_recalculate_fwd_mask(br); netdev_state_change(br->dev); @@ -349,7 +349,7 @@ static ssize_t multicast_snooping_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", !br->multicast_disabled); + return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED)); } static ssize_t multicast_snooping_store(struct device *d, @@ -365,12 +365,13 @@ static ssize_t multicast_query_use_ifaddr_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->multicast_query_use_ifaddr); + return sprintf(buf, "%d\n", + br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)); } static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val) { - br->multicast_query_use_ifaddr = !!val; + br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val); return 0; } @@ -388,7 +389,7 @@ static ssize_t multicast_querier_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->multicast_querier); + return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_QUERIER)); } static ssize_t multicast_querier_store(struct device *d, @@ -636,12 +637,13 @@ static ssize_t multicast_stats_enabled_show(struct device *d, { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_stats_enabled); + return sprintf(buf, "%d\n", + br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)); } static int set_stats_enabled(struct net_bridge *br, unsigned long val) { - br->multicast_stats_enabled = !!val; + br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!val); return 0; } @@ -678,12 +680,12 @@ static ssize_t nf_call_iptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->nf_call_iptables); + return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES)); } static int set_nf_call_iptables(struct net_bridge *br, unsigned long val) { - br->nf_call_iptables = val ? true : false; + br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val); return 0; } @@ -699,12 +701,12 @@ static ssize_t nf_call_ip6tables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->nf_call_ip6tables); + return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES)); } static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val) { - br->nf_call_ip6tables = val ? true : false; + br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val); return 0; } @@ -720,12 +722,12 @@ static ssize_t nf_call_arptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->nf_call_arptables); + return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES)); } static int set_nf_call_arptables(struct net_bridge *br, unsigned long val) { - br->nf_call_arptables = val ? true : false; + br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val); return 0; } @@ -743,7 +745,7 @@ static ssize_t vlan_filtering_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->vlan_enabled); + return sprintf(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED)); } static ssize_t vlan_filtering_store(struct device *d, @@ -791,7 +793,7 @@ static ssize_t vlan_stats_enabled_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->vlan_stats_enabled); + return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED)); } static ssize_t vlan_stats_enabled_store(struct device *d, @@ -801,6 +803,22 @@ static ssize_t vlan_stats_enabled_store(struct device *d, return store_bridge_parm(d, buf, len, br_vlan_set_stats); } static DEVICE_ATTR_RW(vlan_stats_enabled); + +static ssize_t vlan_stats_per_port_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)); +} + +static ssize_t vlan_stats_per_port_store(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_vlan_set_stats_per_port); +} +static DEVICE_ATTR_RW(vlan_stats_per_port); #endif static struct attribute *bridge_attrs[] = { @@ -854,6 +872,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_vlan_protocol.attr, &dev_attr_default_pvid.attr, &dev_attr_vlan_stats_enabled.attr, + &dev_attr_vlan_stats_per_port.attr, #endif NULL }; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 7df269092103..8c9297a01947 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -190,6 +190,19 @@ static void br_vlan_put_master(struct net_bridge_vlan *masterv) } } +static void nbp_vlan_rcu_free(struct rcu_head *rcu) +{ + struct net_bridge_vlan *v; + + v = container_of(rcu, struct net_bridge_vlan, rcu); + WARN_ON(br_vlan_is_master(v)); + /* if we had per-port stats configured then free them here */ + if (v->brvlan->stats != v->stats) + free_percpu(v->stats); + v->stats = NULL; + kfree(v); +} + /* This is the shared VLAN add function which works for both ports and bridge * devices. There are four possible calls to this function in terms of the * vlan entry type: @@ -245,7 +258,15 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) if (!masterv) goto out_filt; v->brvlan = masterv; - v->stats = masterv->stats; + if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) { + v->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats); + if (!v->stats) { + err = -ENOMEM; + goto out_filt; + } + } else { + v->stats = masterv->stats; + } } else { err = br_switchdev_port_vlan_add(dev, v->vid, flags); if (err && err != -EOPNOTSUPP) @@ -282,6 +303,10 @@ out_filt: if (p) { __vlan_vid_del(dev, br, v->vid); if (masterv) { + if (v->stats && masterv->stats != v->stats) + free_percpu(v->stats); + v->stats = NULL; + br_vlan_put_master(masterv); v->brvlan = NULL; } @@ -329,7 +354,7 @@ static int __vlan_del(struct net_bridge_vlan *v) rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); __vlan_del_list(v); - kfree_rcu(v, rcu); + call_rcu(&v->rcu, nbp_vlan_rcu_free); } br_vlan_put_master(masterv); @@ -386,7 +411,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, return NULL; } } - if (br->vlan_stats_enabled) { + if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { stats = this_cpu_ptr(v->stats); u64_stats_update_begin(&stats->syncp); stats->tx_bytes += skb->len; @@ -475,14 +500,14 @@ static bool __allowed_ingress(const struct net_bridge *br, skb->vlan_tci |= pvid; /* if stats are disabled we can avoid the lookup */ - if (!br->vlan_stats_enabled) + if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) return true; } v = br_vlan_find(vg, *vid); if (!v || !br_vlan_should_use(v)) goto drop; - if (br->vlan_stats_enabled) { + if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { stats = this_cpu_ptr(v->stats); u64_stats_update_begin(&stats->syncp); stats->rx_bytes += skb->len; @@ -504,7 +529,7 @@ bool br_allowed_ingress(const struct net_bridge *br, /* If VLAN filtering is disabled on the bridge, all packets are * permitted. */ - if (!br->vlan_enabled) { + if (!br_opt_get(br, BROPT_VLAN_ENABLED)) { BR_INPUT_SKB_CB(skb)->vlan_filtered = false; return true; } @@ -538,7 +563,7 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) struct net_bridge *br = p->br; /* If filtering was disabled at input, let it pass. */ - if (!br->vlan_enabled) + if (!br_opt_get(br, BROPT_VLAN_ENABLED)) return true; vg = nbp_vlan_group_rcu(p); @@ -695,11 +720,12 @@ struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid) /* Must be protected by RTNL. */ static void recalculate_group_addr(struct net_bridge *br) { - if (br->group_addr_set) + if (br_opt_get(br, BROPT_GROUP_ADDR_SET)) return; spin_lock_bh(&br->lock); - if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q)) { + if (!br_opt_get(br, BROPT_VLAN_ENABLED) || + br->vlan_proto == htons(ETH_P_8021Q)) { /* Bridge Group Address */ br->group_addr[5] = 0x00; } else { /* vlan_enabled && ETH_P_8021AD */ @@ -712,7 +738,8 @@ static void recalculate_group_addr(struct net_bridge *br) /* Must be protected by RTNL. */ void br_recalculate_fwd_mask(struct net_bridge *br) { - if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q)) + if (!br_opt_get(br, BROPT_VLAN_ENABLED) || + br->vlan_proto == htons(ETH_P_8021Q)) br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT; else /* vlan_enabled && ETH_P_8021AD */ br->group_fwd_mask_required = BR_GROUPFWD_8021AD & @@ -729,14 +756,14 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) }; int err; - if (br->vlan_enabled == val) + if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val) return 0; err = switchdev_port_attr_set(br->dev, &attr); if (err && err != -EOPNOTSUPP) return err; - br->vlan_enabled = val; + br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val); br_manage_promisc(br); recalculate_group_addr(br); br_recalculate_fwd_mask(br); @@ -753,7 +780,7 @@ bool br_vlan_enabled(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); - return !!br->vlan_enabled; + return br_opt_get(br, BROPT_VLAN_ENABLED); } EXPORT_SYMBOL_GPL(br_vlan_enabled); @@ -819,7 +846,31 @@ int br_vlan_set_stats(struct net_bridge *br, unsigned long val) switch (val) { case 0: case 1: - br->vlan_stats_enabled = val; + br_opt_toggle(br, BROPT_VLAN_STATS_ENABLED, !!val); + break; + default: + return -EINVAL; + } + + return 0; +} + +int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val) +{ + struct net_bridge_port *p; + + /* allow to change the option if there are no port vlans configured */ + list_for_each_entry(p, &br->port_list, list) { + struct net_bridge_vlan_group *vg = nbp_vlan_group(p); + + if (vg->num_vlans) + return -EBUSY; + } + + switch (val) { + case 0: + case 1: + br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val); break; default: return -EINVAL; @@ -877,8 +928,7 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) return 0; } - changed = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long), - GFP_KERNEL); + changed = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL); if (!changed) return -ENOMEM; @@ -925,7 +975,7 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) br->default_pvid = pvid; out: - kfree(changed); + bitmap_free(changed); return err; err_port: @@ -965,7 +1015,7 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val) goto out; /* Only allow default pvid change when filtering is disabled */ - if (br->vlan_enabled) { + if (br_opt_get(br, BROPT_VLAN_ENABLED)) { pr_info_once("Please disable vlan filtering to change default_pvid\n"); err = -EPERM; goto out; @@ -1019,7 +1069,7 @@ int nbp_vlan_init(struct net_bridge_port *p) .orig_dev = p->br->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, - .u.vlan_filtering = p->br->vlan_enabled, + .u.vlan_filtering = br_opt_get(p->br, BROPT_VLAN_ENABLED), }; struct net_bridge_vlan_group *vg; int ret = -ENOMEM; diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index d18965f3291f..416717c57cd1 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -941,7 +941,7 @@ static __poll_t caif_poll(struct file *file, __poll_t mask; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); - sock_poll_wait(file, wait); + sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c index b82440e1fcb4..a931a71ef6df 100644 --- a/net/caif/cfrfml.c +++ b/net/caif/cfrfml.c @@ -264,9 +264,6 @@ static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt) frontpkt = rearpkt; rearpkt = NULL; - err = -ENOMEM; - if (frontpkt == NULL) - goto out; err = -EPROTO; if (cfpkt_add_head(frontpkt, head, 6) < 0) goto out; diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 02172c408ff2..5d6724cee38f 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -46,9 +46,9 @@ static int set_secret(struct ceph_crypto_key *key, void *buf) goto fail; } - /* crypto_alloc_skcipher() allocates with GFP_KERNEL */ + /* crypto_alloc_sync_skcipher() allocates with GFP_KERNEL */ noio_flag = memalloc_noio_save(); - key->tfm = crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); + key->tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0); memalloc_noio_restore(noio_flag); if (IS_ERR(key->tfm)) { ret = PTR_ERR(key->tfm); @@ -56,7 +56,7 @@ static int set_secret(struct ceph_crypto_key *key, void *buf) goto fail; } - ret = crypto_skcipher_setkey(key->tfm, key->key, key->len); + ret = crypto_sync_skcipher_setkey(key->tfm, key->key, key->len); if (ret) goto fail; @@ -136,7 +136,7 @@ void ceph_crypto_key_destroy(struct ceph_crypto_key *key) if (key) { kfree(key->key); key->key = NULL; - crypto_free_skcipher(key->tfm); + crypto_free_sync_skcipher(key->tfm); key->tfm = NULL; } } @@ -216,7 +216,7 @@ static void teardown_sgtable(struct sg_table *sgt) static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt, void *buf, int buf_len, int in_len, int *pout_len) { - SKCIPHER_REQUEST_ON_STACK(req, key->tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm); struct sg_table sgt; struct scatterlist prealloc_sg; char iv[AES_BLOCK_SIZE] __aligned(8); @@ -232,7 +232,7 @@ static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt, return ret; memcpy(iv, aes_iv, AES_BLOCK_SIZE); - skcipher_request_set_tfm(req, key->tfm); + skcipher_request_set_sync_tfm(req, key->tfm); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sgt.sgl, sgt.sgl, crypt_len, iv); diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index bb45c7d43739..96ef4d860bc9 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -13,7 +13,7 @@ struct ceph_crypto_key { struct ceph_timespec created; int len; void *key; - struct crypto_skcipher *tfm; + struct crypto_sync_skcipher *tfm; }; int ceph_crypto_key_clone(struct ceph_crypto_key *dst, diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 0a187196aeed..57fcc6b4bf6e 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -156,7 +156,6 @@ static bool con_flag_test_and_set(struct ceph_connection *con, /* Slab caches for frequently-allocated structures */ static struct kmem_cache *ceph_msg_cache; -static struct kmem_cache *ceph_msg_data_cache; /* static tag bytes (protocol control messages) */ static char tag_msg = CEPH_MSGR_TAG_MSG; @@ -235,23 +234,11 @@ static int ceph_msgr_slab_init(void) if (!ceph_msg_cache) return -ENOMEM; - BUG_ON(ceph_msg_data_cache); - ceph_msg_data_cache = KMEM_CACHE(ceph_msg_data, 0); - if (ceph_msg_data_cache) - return 0; - - kmem_cache_destroy(ceph_msg_cache); - ceph_msg_cache = NULL; - - return -ENOMEM; + return 0; } static void ceph_msgr_slab_exit(void) { - BUG_ON(!ceph_msg_data_cache); - kmem_cache_destroy(ceph_msg_data_cache); - ceph_msg_data_cache = NULL; - BUG_ON(!ceph_msg_cache); kmem_cache_destroy(ceph_msg_cache); ceph_msg_cache = NULL; @@ -526,7 +513,7 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) if (!buf) msg.msg_flags |= MSG_TRUNC; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, len); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); r = sock_recvmsg(sock, &msg, msg.msg_flags); if (r == -EAGAIN) r = 0; @@ -545,7 +532,7 @@ static int ceph_tcp_recvpage(struct socket *sock, struct page *page, int r; BUG_ON(page_offset + length > PAGE_SIZE); - iov_iter_bvec(&msg.msg_iter, READ | ITER_BVEC, &bvec, 1, length); + iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); r = sock_recvmsg(sock, &msg, msg.msg_flags); if (r == -EAGAIN) r = 0; @@ -607,7 +594,7 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, else msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ - iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC, &bvec, 1, size); + iov_iter_bvec(&msg.msg_iter, WRITE, &bvec, 1, size); ret = sock_sendmsg(sock, &msg); if (ret == -EAGAIN) ret = 0; @@ -1141,16 +1128,13 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) { struct ceph_msg_data_cursor *cursor = &msg->cursor; - struct ceph_msg_data *data; BUG_ON(!length); BUG_ON(length > msg->data_length); - BUG_ON(list_empty(&msg->data)); + BUG_ON(!msg->num_data_items); - cursor->data_head = &msg->data; cursor->total_resid = length; - data = list_first_entry(&msg->data, struct ceph_msg_data, links); - cursor->data = data; + cursor->data = msg->data; __ceph_msg_data_cursor_init(cursor); } @@ -1231,8 +1215,7 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, if (!cursor->resid && cursor->total_resid) { WARN_ON(!cursor->last_piece); - BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); - cursor->data = list_next_entry(cursor->data, links); + cursor->data++; __ceph_msg_data_cursor_init(cursor); new_piece = true; } @@ -1248,9 +1231,6 @@ static size_t sizeof_footer(struct ceph_connection *con) static void prepare_message_data(struct ceph_msg *msg, u32 data_len) { - BUG_ON(!msg); - BUG_ON(!data_len); - /* Initialize data cursor */ ceph_msg_data_cursor_init(msg, (size_t)data_len); @@ -1590,7 +1570,7 @@ static int write_partial_message_data(struct ceph_connection *con) dout("%s %p msg %p\n", __func__, con, msg); - if (list_empty(&msg->data)) + if (!msg->num_data_items) return -EINVAL; /* @@ -2347,8 +2327,7 @@ static int read_partial_msg_data(struct ceph_connection *con) u32 crc = 0; int ret; - BUG_ON(!msg); - if (list_empty(&msg->data)) + if (!msg->num_data_items) return -EIO; if (do_datacrc) @@ -3256,32 +3235,16 @@ bool ceph_con_keepalive_expired(struct ceph_connection *con, return false; } -static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) +static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg) { - struct ceph_msg_data *data; - - if (WARN_ON(!ceph_msg_data_type_valid(type))) - return NULL; - - data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS); - if (!data) - return NULL; - - data->type = type; - INIT_LIST_HEAD(&data->links); - - return data; + BUG_ON(msg->num_data_items >= msg->max_data_items); + return &msg->data[msg->num_data_items++]; } static void ceph_msg_data_destroy(struct ceph_msg_data *data) { - if (!data) - return; - - WARN_ON(!list_empty(&data->links)); if (data->type == CEPH_MSG_DATA_PAGELIST) ceph_pagelist_release(data->pagelist); - kmem_cache_free(ceph_msg_data_cache, data); } void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, @@ -3292,13 +3255,12 @@ void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, BUG_ON(!pages); BUG_ON(!length); - data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES); - BUG_ON(!data); + data = ceph_msg_data_add(msg); + data->type = CEPH_MSG_DATA_PAGES; data->pages = pages; data->length = length; data->alignment = alignment & ~PAGE_MASK; - list_add_tail(&data->links, &msg->data); msg->data_length += length; } EXPORT_SYMBOL(ceph_msg_data_add_pages); @@ -3311,11 +3273,11 @@ void ceph_msg_data_add_pagelist(struct ceph_msg *msg, BUG_ON(!pagelist); BUG_ON(!pagelist->length); - data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST); - BUG_ON(!data); + data = ceph_msg_data_add(msg); + data->type = CEPH_MSG_DATA_PAGELIST; + refcount_inc(&pagelist->refcnt); data->pagelist = pagelist; - list_add_tail(&data->links, &msg->data); msg->data_length += pagelist->length; } EXPORT_SYMBOL(ceph_msg_data_add_pagelist); @@ -3326,12 +3288,11 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, { struct ceph_msg_data *data; - data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); - BUG_ON(!data); + data = ceph_msg_data_add(msg); + data->type = CEPH_MSG_DATA_BIO; data->bio_pos = *bio_pos; data->bio_length = length; - list_add_tail(&data->links, &msg->data); msg->data_length += length; } EXPORT_SYMBOL(ceph_msg_data_add_bio); @@ -3342,11 +3303,10 @@ void ceph_msg_data_add_bvecs(struct ceph_msg *msg, { struct ceph_msg_data *data; - data = ceph_msg_data_create(CEPH_MSG_DATA_BVECS); - BUG_ON(!data); + data = ceph_msg_data_add(msg); + data->type = CEPH_MSG_DATA_BVECS; data->bvec_pos = *bvec_pos; - list_add_tail(&data->links, &msg->data); msg->data_length += bvec_pos->iter.bi_size; } EXPORT_SYMBOL(ceph_msg_data_add_bvecs); @@ -3355,8 +3315,8 @@ EXPORT_SYMBOL(ceph_msg_data_add_bvecs); * construct a new message with given type, size * the new msg has a ref count of 1. */ -struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, - bool can_fail) +struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, + gfp_t flags, bool can_fail) { struct ceph_msg *m; @@ -3370,7 +3330,6 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, INIT_LIST_HEAD(&m->list_head); kref_init(&m->kref); - INIT_LIST_HEAD(&m->data); /* front */ if (front_len) { @@ -3385,6 +3344,15 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, } m->front_alloc_len = m->front.iov_len = front_len; + if (max_data_items) { + m->data = kmalloc_array(max_data_items, sizeof(*m->data), + flags); + if (!m->data) + goto out2; + + m->max_data_items = max_data_items; + } + dout("ceph_msg_new %p front %d\n", m, front_len); return m; @@ -3401,6 +3369,13 @@ out: } return NULL; } +EXPORT_SYMBOL(ceph_msg_new2); + +struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, + bool can_fail) +{ + return ceph_msg_new2(type, front_len, 0, flags, can_fail); +} EXPORT_SYMBOL(ceph_msg_new); /* @@ -3496,13 +3471,14 @@ static void ceph_msg_free(struct ceph_msg *m) { dout("%s %p\n", __func__, m); kvfree(m->front.iov_base); + kfree(m->data); kmem_cache_free(ceph_msg_cache, m); } static void ceph_msg_release(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); - struct ceph_msg_data *data, *next; + int i; dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); @@ -3515,11 +3491,8 @@ static void ceph_msg_release(struct kref *kref) m->middle = NULL; } - list_for_each_entry_safe(data, next, &m->data, links) { - list_del_init(&data->links); - ceph_msg_data_destroy(data); - } - m->data_length = 0; + for (i = 0; i < m->num_data_items; i++) + ceph_msg_data_destroy(&m->data[i]); if (m->pool) ceph_msgpool_put(m->pool, m); diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c index 72571535883f..e3ecb80cd182 100644 --- a/net/ceph/msgpool.c +++ b/net/ceph/msgpool.c @@ -14,7 +14,8 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg) struct ceph_msgpool *pool = arg; struct ceph_msg *msg; - msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true); + msg = ceph_msg_new2(pool->type, pool->front_len, pool->max_data_items, + gfp_mask, true); if (!msg) { dout("msgpool_alloc %s failed\n", pool->name); } else { @@ -35,11 +36,13 @@ static void msgpool_free(void *element, void *arg) } int ceph_msgpool_init(struct ceph_msgpool *pool, int type, - int front_len, int size, bool blocking, const char *name) + int front_len, int max_data_items, int size, + const char *name) { dout("msgpool %s init\n", name); pool->type = type; pool->front_len = front_len; + pool->max_data_items = max_data_items; pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool); if (!pool->pool) return -ENOMEM; @@ -53,18 +56,21 @@ void ceph_msgpool_destroy(struct ceph_msgpool *pool) mempool_destroy(pool->pool); } -struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, - int front_len) +struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len, + int max_data_items) { struct ceph_msg *msg; - if (front_len > pool->front_len) { - dout("msgpool_get %s need front %d, pool size is %d\n", - pool->name, front_len, pool->front_len); - WARN_ON(1); + if (front_len > pool->front_len || + max_data_items > pool->max_data_items) { + pr_warn_ratelimited("%s need %d/%d, pool %s has %d/%d\n", + __func__, front_len, max_data_items, pool->name, + pool->front_len, pool->max_data_items); + WARN_ON_ONCE(1); /* try to alloc a fresh message */ - return ceph_msg_new(pool->type, front_len, GFP_NOFS, false); + return ceph_msg_new2(pool->type, front_len, max_data_items, + GFP_NOFS, false); } msg = mempool_alloc(pool->pool, GFP_NOFS); @@ -80,6 +86,9 @@ void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) msg->front.iov_len = pool->front_len; msg->hdr.front_len = cpu_to_le32(pool->front_len); + msg->data_length = 0; + msg->num_data_items = 0; + kref_init(&msg->kref); /* retake single ref */ mempool_free(msg, pool->pool); } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 60934bd8796c..d23a9f81f3d7 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -126,6 +126,9 @@ static void ceph_osd_data_init(struct ceph_osd_data *osd_data) osd_data->type = CEPH_OSD_DATA_TYPE_NONE; } +/* + * Consumes @pages if @own_pages is true. + */ static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) @@ -138,6 +141,9 @@ static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, osd_data->own_pages = own_pages; } +/* + * Consumes a ref on @pagelist. + */ static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, struct ceph_pagelist *pagelist) { @@ -362,6 +368,8 @@ static void ceph_osd_data_release(struct ceph_osd_data *osd_data) num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); ceph_release_page_vector(osd_data->pages, num_pages); + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { + ceph_pagelist_release(osd_data->pagelist); } ceph_osd_data_init(osd_data); } @@ -402,6 +410,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_LIST_WATCHERS: ceph_osd_data_release(&op->list_watchers.response_data); break; + case CEPH_OSD_OP_COPY_FROM: + ceph_osd_data_release(&op->copy_from.osd_data); + break; default: break; } @@ -606,12 +617,15 @@ static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc) return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0); } -int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) +static int __ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp, + int num_request_data_items, + int num_reply_data_items) { struct ceph_osd_client *osdc = req->r_osdc; struct ceph_msg *msg; int msg_size; + WARN_ON(req->r_request || req->r_reply); WARN_ON(ceph_oid_empty(&req->r_base_oid)); WARN_ON(ceph_oloc_empty(&req->r_base_oloc)); @@ -633,9 +647,11 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) msg_size += 4 + 8; /* retry_attempt, features */ if (req->r_mempool) - msg = ceph_msgpool_get(&osdc->msgpool_op, 0); + msg = ceph_msgpool_get(&osdc->msgpool_op, msg_size, + num_request_data_items); else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true); + msg = ceph_msg_new2(CEPH_MSG_OSD_OP, msg_size, + num_request_data_items, gfp, true); if (!msg) return -ENOMEM; @@ -648,9 +664,11 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) msg_size += req->r_num_ops * sizeof(struct ceph_osd_op); if (req->r_mempool) - msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); + msg = ceph_msgpool_get(&osdc->msgpool_op_reply, msg_size, + num_reply_data_items); else - msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true); + msg = ceph_msg_new2(CEPH_MSG_OSD_OPREPLY, msg_size, + num_reply_data_items, gfp, true); if (!msg) return -ENOMEM; @@ -658,7 +676,6 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) return 0; } -EXPORT_SYMBOL(ceph_osdc_alloc_messages); static bool osd_req_opcode_valid(u16 opcode) { @@ -671,6 +688,65 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE) } } +static void get_num_data_items(struct ceph_osd_request *req, + int *num_request_data_items, + int *num_reply_data_items) +{ + struct ceph_osd_req_op *op; + + *num_request_data_items = 0; + *num_reply_data_items = 0; + + for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) { + switch (op->op) { + /* request */ + case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_COPY_FROM: + *num_request_data_items += 1; + break; + + /* reply */ + case CEPH_OSD_OP_STAT: + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_LIST_WATCHERS: + *num_reply_data_items += 1; + break; + + /* both */ + case CEPH_OSD_OP_NOTIFY: + *num_request_data_items += 1; + *num_reply_data_items += 1; + break; + case CEPH_OSD_OP_CALL: + *num_request_data_items += 2; + *num_reply_data_items += 1; + break; + + default: + WARN_ON(!osd_req_opcode_valid(op->op)); + break; + } + } +} + +/* + * oid, oloc and OSD op opcode(s) must be filled in before this function + * is called. + */ +int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) +{ + int num_request_data_items, num_reply_data_items; + + get_num_data_items(req, &num_request_data_items, &num_reply_data_items); + return __ceph_osdc_alloc_messages(req, gfp, num_request_data_items, + num_reply_data_items); +} +EXPORT_SYMBOL(ceph_osdc_alloc_messages); + /* * This is an osd op init function for opcodes that have no data or * other information associated with them. It also serves as a @@ -767,22 +843,19 @@ void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, EXPORT_SYMBOL(osd_req_op_extent_dup_last); int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, - u16 opcode, const char *class, const char *method) + const char *class, const char *method) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - opcode, 0); + struct ceph_osd_req_op *op; struct ceph_pagelist *pagelist; size_t payload_len = 0; size_t size; - BUG_ON(opcode != CEPH_OSD_OP_CALL); + op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); - pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); + pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!pagelist) return -ENOMEM; - ceph_pagelist_init(pagelist); - op->cls.class_name = class; size = strlen(class); BUG_ON(size > (size_t) U8_MAX); @@ -815,12 +888,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); - pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); + pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!pagelist) return -ENOMEM; - ceph_pagelist_init(pagelist); - payload_len = strlen(name); op->xattr.name_len = payload_len; ceph_pagelist_append(pagelist, name, payload_len); @@ -900,12 +971,6 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, static u32 osd_req_encode_op(struct ceph_osd_op *dst, const struct ceph_osd_req_op *src) { - if (WARN_ON(!osd_req_opcode_valid(src->op))) { - pr_err("unrecognized osd opcode %d\n", src->op); - - return 0; - } - switch (src->op) { case CEPH_OSD_OP_STAT: break; @@ -955,6 +1020,14 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, case CEPH_OSD_OP_CREATE: case CEPH_OSD_OP_DELETE: break; + case CEPH_OSD_OP_COPY_FROM: + dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid); + dst->copy_from.src_version = + cpu_to_le64(src->copy_from.src_version); + dst->copy_from.flags = src->copy_from.flags; + dst->copy_from.src_fadvise_flags = + cpu_to_le32(src->copy_from.src_fadvise_flags); + break; default: pr_err("unsupported osd opcode %s\n", ceph_osd_op_name(src->op)); @@ -1038,7 +1111,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (flags & CEPH_OSD_FLAG_WRITE) req->r_data_offset = off; - r = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (num_ops > 1) + /* + * This is a special case for ceph_writepages_start(), but it + * also covers ceph_uninline_data(). If more multi-op request + * use cases emerge, we will need a separate helper. + */ + r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_ops, 0); + else + r = ceph_osdc_alloc_messages(req, GFP_NOFS); if (r) goto fail; @@ -1845,48 +1926,55 @@ static bool should_plug_request(struct ceph_osd_request *req) return true; } -static void setup_request_data(struct ceph_osd_request *req, - struct ceph_msg *msg) +/* + * Keep get_num_data_items() in sync with this function. + */ +static void setup_request_data(struct ceph_osd_request *req) { - u32 data_len = 0; - int i; + struct ceph_msg *request_msg = req->r_request; + struct ceph_msg *reply_msg = req->r_reply; + struct ceph_osd_req_op *op; - if (!list_empty(&msg->data)) + if (req->r_request->num_data_items || req->r_reply->num_data_items) return; - WARN_ON(msg->data_length); - for (i = 0; i < req->r_num_ops; i++) { - struct ceph_osd_req_op *op = &req->r_ops[i]; - + WARN_ON(request_msg->data_length || reply_msg->data_length); + for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) { switch (op->op) { /* request */ case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: WARN_ON(op->indata_len != op->extent.length); - ceph_osdc_msg_data_add(msg, &op->extent.osd_data); + ceph_osdc_msg_data_add(request_msg, + &op->extent.osd_data); break; case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_CMPXATTR: WARN_ON(op->indata_len != op->xattr.name_len + op->xattr.value_len); - ceph_osdc_msg_data_add(msg, &op->xattr.osd_data); + ceph_osdc_msg_data_add(request_msg, + &op->xattr.osd_data); break; case CEPH_OSD_OP_NOTIFY_ACK: - ceph_osdc_msg_data_add(msg, + ceph_osdc_msg_data_add(request_msg, &op->notify_ack.request_data); break; + case CEPH_OSD_OP_COPY_FROM: + ceph_osdc_msg_data_add(request_msg, + &op->copy_from.osd_data); + break; /* reply */ case CEPH_OSD_OP_STAT: - ceph_osdc_msg_data_add(req->r_reply, + ceph_osdc_msg_data_add(reply_msg, &op->raw_data_in); break; case CEPH_OSD_OP_READ: - ceph_osdc_msg_data_add(req->r_reply, + ceph_osdc_msg_data_add(reply_msg, &op->extent.osd_data); break; case CEPH_OSD_OP_LIST_WATCHERS: - ceph_osdc_msg_data_add(req->r_reply, + ceph_osdc_msg_data_add(reply_msg, &op->list_watchers.response_data); break; @@ -1895,25 +1983,23 @@ static void setup_request_data(struct ceph_osd_request *req, WARN_ON(op->indata_len != op->cls.class_len + op->cls.method_len + op->cls.indata_len); - ceph_osdc_msg_data_add(msg, &op->cls.request_info); + ceph_osdc_msg_data_add(request_msg, + &op->cls.request_info); /* optional, can be NONE */ - ceph_osdc_msg_data_add(msg, &op->cls.request_data); + ceph_osdc_msg_data_add(request_msg, + &op->cls.request_data); /* optional, can be NONE */ - ceph_osdc_msg_data_add(req->r_reply, + ceph_osdc_msg_data_add(reply_msg, &op->cls.response_data); break; case CEPH_OSD_OP_NOTIFY: - ceph_osdc_msg_data_add(msg, + ceph_osdc_msg_data_add(request_msg, &op->notify.request_data); - ceph_osdc_msg_data_add(req->r_reply, + ceph_osdc_msg_data_add(reply_msg, &op->notify.response_data); break; } - - data_len += op->indata_len; } - - WARN_ON(data_len != msg->data_length); } static void encode_pgid(void **p, const struct ceph_pg *pgid) @@ -1961,7 +2047,7 @@ static void encode_request_partial(struct ceph_osd_request *req, req->r_data_offset || req->r_snapc); } - setup_request_data(req, msg); + setup_request_data(req); encode_spgid(&p, &req->r_t.spgid); /* actual spg */ ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */ @@ -3001,11 +3087,21 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osd *osd; + down_write(&osdc->lock); + linger_register(lreq); + if (lreq->is_watch) { + lreq->reg_req->r_ops[0].watch.cookie = lreq->linger_id; + lreq->ping_req->r_ops[0].watch.cookie = lreq->linger_id; + } else { + lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; + } + calc_target(osdc, &lreq->t, NULL, false); osd = lookup_create_osd(osdc, lreq->t.osd, true); link_linger(osd, lreq); send_linger(lreq); + up_write(&osdc->lock); } static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq) @@ -4318,9 +4414,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc, lreq->notify_id, notify_id); } else if (!completion_done(&lreq->notify_finish_wait)) { struct ceph_msg_data *data = - list_first_entry_or_null(&msg->data, - struct ceph_msg_data, - links); + msg->num_data_items ? &msg->data[0] : NULL; if (data) { if (lreq->preply_pages) { @@ -4476,6 +4570,23 @@ alloc_linger_request(struct ceph_osd_linger_request *lreq) ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); + return req; +} + +static struct ceph_osd_request * +alloc_watch_request(struct ceph_osd_linger_request *lreq, u8 watch_opcode) +{ + struct ceph_osd_request *req; + + req = alloc_linger_request(lreq); + if (!req) + return NULL; + + /* + * Pass 0 for cookie because we don't know it yet, it will be + * filled in by linger_submit(). + */ + osd_req_op_watch_init(req, 0, 0, watch_opcode); if (ceph_osdc_alloc_messages(req, GFP_NOIO)) { ceph_osdc_put_request(req); @@ -4514,27 +4625,19 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, lreq->t.flags = CEPH_OSD_FLAG_WRITE; ktime_get_real_ts64(&lreq->mtime); - lreq->reg_req = alloc_linger_request(lreq); + lreq->reg_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_WATCH); if (!lreq->reg_req) { ret = -ENOMEM; goto err_put_lreq; } - lreq->ping_req = alloc_linger_request(lreq); + lreq->ping_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_PING); if (!lreq->ping_req) { ret = -ENOMEM; goto err_put_lreq; } - down_write(&osdc->lock); - linger_register(lreq); /* before osd_req_op_* */ - osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id, - CEPH_OSD_WATCH_OP_WATCH); - osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id, - CEPH_OSD_WATCH_OP_PING); linger_submit(lreq); - up_write(&osdc->lock); - ret = linger_reg_commit_wait(lreq); if (ret) { linger_cancel(lreq); @@ -4599,11 +4702,10 @@ static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which, op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0); - pl = kmalloc(sizeof(*pl), GFP_NOIO); + pl = ceph_pagelist_alloc(GFP_NOIO); if (!pl) return -ENOMEM; - ceph_pagelist_init(pl); ret = ceph_pagelist_encode_64(pl, notify_id); ret |= ceph_pagelist_encode_64(pl, cookie); if (payload) { @@ -4641,12 +4743,12 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, ceph_oloc_copy(&req->r_base_oloc, oloc); req->r_flags = CEPH_OSD_FLAG_READ; - ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload, + payload_len); if (ret) goto out_put_req; - ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload, - payload_len); + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); if (ret) goto out_put_req; @@ -4670,11 +4772,10 @@ static int osd_req_op_notify_init(struct ceph_osd_request *req, int which, op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); op->notify.cookie = cookie; - pl = kmalloc(sizeof(*pl), GFP_NOIO); + pl = ceph_pagelist_alloc(GFP_NOIO); if (!pl) return -ENOMEM; - ceph_pagelist_init(pl); ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */ ret |= ceph_pagelist_encode_32(pl, timeout); ret |= ceph_pagelist_encode_32(pl, payload_len); @@ -4733,29 +4834,30 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, goto out_put_lreq; } + /* + * Pass 0 for cookie because we don't know it yet, it will be + * filled in by linger_submit(). + */ + ret = osd_req_op_notify_init(lreq->reg_req, 0, 0, 1, timeout, + payload, payload_len); + if (ret) + goto out_put_lreq; + /* for notify_id */ pages = ceph_alloc_page_vector(1, GFP_NOIO); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out_put_lreq; } - - down_write(&osdc->lock); - linger_register(lreq); /* before osd_req_op_* */ - ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1, - timeout, payload, payload_len); - if (ret) { - linger_unregister(lreq); - up_write(&osdc->lock); - ceph_release_page_vector(pages, 1); - goto out_put_lreq; - } ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify, response_data), pages, PAGE_SIZE, 0, false, true); - linger_submit(lreq); - up_write(&osdc->lock); + ret = ceph_osdc_alloc_messages(lreq->reg_req, GFP_NOIO); + if (ret) + goto out_put_lreq; + + linger_submit(lreq); ret = linger_reg_commit_wait(lreq); if (!ret) ret = linger_notify_finish_wait(lreq); @@ -4881,10 +4983,6 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, ceph_oloc_copy(&req->r_base_oloc, oloc); req->r_flags = CEPH_OSD_FLAG_READ; - ret = ceph_osdc_alloc_messages(req, GFP_NOIO); - if (ret) - goto out_put_req; - pages = ceph_alloc_page_vector(1, GFP_NOIO); if (IS_ERR(pages)) { ret = PTR_ERR(pages); @@ -4896,6 +4994,10 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, response_data), pages, PAGE_SIZE, 0, false, true); + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + if (ret) + goto out_put_req; + ceph_osdc_start_request(osdc, req, false); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) { @@ -4958,11 +5060,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, ceph_oloc_copy(&req->r_base_oloc, oloc); req->r_flags = flags; - ret = ceph_osdc_alloc_messages(req, GFP_NOIO); - if (ret) - goto out_put_req; - - ret = osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method); + ret = osd_req_op_cls_init(req, 0, class, method); if (ret) goto out_put_req; @@ -4973,6 +5071,10 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, osd_req_op_cls_response_data_pages(req, 0, &resp_page, *resp_len, 0, false, false); + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + if (ret) + goto out_put_req; + ceph_osdc_start_request(osdc, req, false); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) { @@ -5021,11 +5123,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) goto out_map; err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, - PAGE_SIZE, 10, true, "osd_op"); + PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10, "osd_op"); if (err < 0) goto out_mempool; err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, - PAGE_SIZE, 10, true, "osd_op_reply"); + PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10, + "osd_op_reply"); if (err < 0) goto out_msgpool; @@ -5168,6 +5271,80 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, } EXPORT_SYMBOL(ceph_osdc_writepages); +static int osd_req_op_copy_from_init(struct ceph_osd_request *req, + u64 src_snapid, u64 src_version, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + u32 src_fadvise_flags, + u32 dst_fadvise_flags, + u8 copy_from_flags) +{ + struct ceph_osd_req_op *op; + struct page **pages; + void *p, *end; + + pages = ceph_alloc_page_vector(1, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, dst_fadvise_flags); + op->copy_from.snapid = src_snapid; + op->copy_from.src_version = src_version; + op->copy_from.flags = copy_from_flags; + op->copy_from.src_fadvise_flags = src_fadvise_flags; + + p = page_address(pages[0]); + end = p + PAGE_SIZE; + ceph_encode_string(&p, end, src_oid->name, src_oid->name_len); + encode_oloc(&p, end, src_oloc); + op->indata_len = PAGE_SIZE - (end - p); + + ceph_osd_data_pages_init(&op->copy_from.osd_data, pages, + op->indata_len, 0, false, true); + return 0; +} + +int ceph_osdc_copy_from(struct ceph_osd_client *osdc, + u64 src_snapid, u64 src_version, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + u32 src_fadvise_flags, + struct ceph_object_id *dst_oid, + struct ceph_object_locator *dst_oloc, + u32 dst_fadvise_flags, + u8 copy_from_flags) +{ + struct ceph_osd_request *req; + int ret; + + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->r_flags = CEPH_OSD_FLAG_WRITE; + + ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc); + ceph_oid_copy(&req->r_t.base_oid, dst_oid); + + ret = osd_req_op_copy_from_init(req, src_snapid, src_version, src_oid, + src_oloc, src_fadvise_flags, + dst_fadvise_flags, copy_from_flags); + if (ret) + goto out; + + ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); + if (ret) + goto out; + + ceph_osdc_start_request(osdc, req, false); + ret = ceph_osdc_wait_request(osdc, req); + +out: + ceph_osdc_put_request(req); + return ret; +} +EXPORT_SYMBOL(ceph_osdc_copy_from); + int __init ceph_osdc_setup(void) { size_t size = sizeof(struct ceph_osd_request) + @@ -5295,7 +5472,7 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) u32 front_len = le32_to_cpu(hdr->front_len); u32 data_len = le32_to_cpu(hdr->data_len); - m = ceph_msg_new(type, front_len, GFP_NOIO, false); + m = ceph_msg_new2(type, front_len, 1, GFP_NOIO, false); if (!m) return NULL; diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index 2ea0564771d2..65e34f78b05d 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c @@ -6,6 +6,26 @@ #include #include +struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags) +{ + struct ceph_pagelist *pl; + + pl = kmalloc(sizeof(*pl), gfp_flags); + if (!pl) + return NULL; + + INIT_LIST_HEAD(&pl->head); + pl->mapped_tail = NULL; + pl->length = 0; + pl->room = 0; + INIT_LIST_HEAD(&pl->free_list); + pl->num_pages_free = 0; + refcount_set(&pl->refcnt, 1); + + return pl; +} +EXPORT_SYMBOL(ceph_pagelist_alloc); + static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) { if (pl->mapped_tail) { diff --git a/net/compat.c b/net/compat.c index 3b2105f6549d..47a614b370cd 100644 --- a/net/compat.c +++ b/net/compat.c @@ -812,21 +812,21 @@ COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, - struct compat_timespec __user *timeout) + struct old_timespec32 __user *timeout) { int datagrams; - struct timespec ktspec; + struct timespec64 ktspec; if (timeout == NULL) return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, NULL); - if (compat_get_timespec(&ktspec, timeout)) + if (compat_get_timespec64(&ktspec, timeout)) return -EFAULT; datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, &ktspec); - if (datagrams > 0 && compat_put_timespec(&ktspec, timeout)) + if (datagrams > 0 && compat_put_timespec64(&ktspec, timeout)) datagrams = -EFAULT; return datagrams; @@ -834,7 +834,7 @@ static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, - struct compat_timespec __user *, timeout) + struct old_timespec32 __user *, timeout) { return __compat_sys_recvmmsg(fd, mmsg, vlen, flags, timeout); } diff --git a/net/core/Makefile b/net/core/Makefile index 80175e6a2eb8..fccd31e0e7f7 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,6 +16,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ obj-y += net-sysfs.o obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o +obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o @@ -27,6 +28,7 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o +obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o diff --git a/net/core/datagram.c b/net/core/datagram.c index 9aac0d63d53e..57f3a6fcfc1e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -808,8 +808,9 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, return -EINVAL; } - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) - netdev_rx_csum_fault(skb->dev); + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(NULL); } return 0; fault: @@ -837,7 +838,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; __poll_t mask; - sock_poll_wait(file, wait); + sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ diff --git a/net/core/dev.c b/net/core/dev.c index 82114e1111e6..0ffcbdd55fa9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1752,6 +1752,28 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); +/** + * call_netdevice_notifiers_mtu - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * @arg: additional u32 argument passed to the notifier function + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). + */ +static int call_netdevice_notifiers_mtu(unsigned long val, + struct net_device *dev, u32 arg) +{ + struct netdev_notifier_info_ext info = { + .info.dev = dev, + .ext.mtu = arg, + }; + + BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); + + return call_netdevice_notifiers_info(val, &info.info); +} + #ifdef CONFIG_NET_INGRESS static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); @@ -1954,6 +1976,17 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) return false; } +/** + * dev_nit_active - return true if any network interface taps are in use + * + * @dev: network device to check for the presence of taps + */ +bool dev_nit_active(struct net_device *dev) +{ + return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all); +} +EXPORT_SYMBOL_GPL(dev_nit_active); + /* * Support routine. Sends outgoing frames to any network * taps currently in use. @@ -1969,6 +2002,9 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) rcu_read_lock(); again: list_for_each_entry_rcu(ptype, ptype_list, list) { + if (ptype->ignore_outgoing) + continue; + /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ @@ -3208,7 +3244,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, unsigned int len; int rc; - if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) + if (dev_nit_active(dev)) dev_queue_xmit_nit(skb, dev); len = skb->len; @@ -3228,7 +3264,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de while (skb) { struct sk_buff *next = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); rc = xmit_one(skb, dev, txq, next != NULL); if (unlikely(!dev_xmit_complete(rc))) { skb->next = next; @@ -3236,7 +3272,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de } skb = next; - if (netif_xmit_stopped(txq) && skb) { + if (netif_tx_queue_stopped(txq) && skb) { rc = NETDEV_TX_BUSY; break; } @@ -3328,7 +3364,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d for (; skb != NULL; skb = next) { next = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); /* in case skb wont be segmented, point to itself */ skb->prev = skb; @@ -4255,6 +4291,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct netdev_rx_queue *rxqueue; void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; + __be16 orig_eth_type; + struct ethhdr *eth; + bool orig_bcast; int hlen, off; u32 mac_len; @@ -4295,6 +4334,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp->data_hard_start = skb->data - skb_headroom(skb); orig_data_end = xdp->data_end; orig_data = xdp->data; + eth = (struct ethhdr *)xdp->data; + orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); + orig_eth_type = eth->h_proto; rxqueue = netif_get_rxqueue(skb); xdp->rxq = &rxqueue->xdp_rxq; @@ -4318,6 +4360,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, } + /* check if XDP changed eth hdr such SKB needs update */ + eth = (struct ethhdr *)xdp->data; + if ((orig_eth_type != eth->h_proto) || + (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { + __skb_push(skb, ETH_HLEN); + skb->protocol = eth_type_trans(skb, skb->dev); + } + switch (act) { case XDP_REDIRECT: case XDP_TX: @@ -5292,8 +5342,7 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; - list_del(&skb->list); - skb->next = NULL; + skb_list_del_init(skb); napi_gro_complete(skb); napi->gro_hash[index].count--; } @@ -5408,7 +5457,7 @@ static void gro_flush_oldest(struct list_head *head) /* Do not adjust napi->gro_hash[].count, caller is adding a new * SKB to the chain. */ - list_del(&oldest->list); + skb_list_del_init(oldest); napi_gro_complete(oldest); } @@ -5478,8 +5527,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; if (pp) { - list_del(&pp->list); - pp->next = NULL; + skb_list_del_init(pp); napi_gro_complete(pp); napi->gro_hash[hash].count--; } @@ -7574,14 +7622,16 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, err = __dev_set_mtu(dev, new_mtu); if (!err) { - err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, + orig_mtu); err = notifier_to_errno(err); if (err) { /* setting mtu back and notifying everyone again, * so that they have a chance to revert changes. */ __dev_set_mtu(dev, orig_mtu); - call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, + new_mtu); } } return err; diff --git a/net/core/devlink.c b/net/core/devlink.c index 8c0ed225e280..3a4b29a13d31 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1626,7 +1626,7 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, if (!ops->eswitch_mode_set) return -EOPNOTSUPP; mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); - err = ops->eswitch_mode_set(devlink, mode); + err = ops->eswitch_mode_set(devlink, mode, info->extack); if (err) return err; } @@ -1636,7 +1636,8 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, return -EOPNOTSUPP; inline_mode = nla_get_u8( info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]); - err = ops->eswitch_inline_mode_set(devlink, inline_mode); + err = ops->eswitch_inline_mode_set(devlink, inline_mode, + info->extack); if (err) return err; } @@ -1645,7 +1646,8 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, if (!ops->eswitch_encap_mode_set) return -EOPNOTSUPP; encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]); - err = ops->eswitch_encap_mode_set(devlink, encap_mode); + err = ops->eswitch_encap_mode_set(devlink, encap_mode, + info->extack); if (err) return err; } @@ -2675,6 +2677,21 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME, .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, + .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME, + .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, + .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME, + .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, + .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, + .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) @@ -2995,6 +3012,8 @@ devlink_param_value_get_from_info(const struct devlink_param *param, struct genl_info *info, union devlink_param_value *value) { + int len; + if (param->type != DEVLINK_PARAM_TYPE_BOOL && !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) return -EINVAL; @@ -3010,10 +3029,13 @@ devlink_param_value_get_from_info(const struct devlink_param *param, value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); break; case DEVLINK_PARAM_TYPE_STRING: - if (nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) > - DEVLINK_PARAM_MAX_STRING_VALUE) + len = strnlen(nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]), + nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])); + if (len == nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) || + len >= __DEVLINK_PARAM_MAX_STRING_VALUE) return -EINVAL; - value->vstr = nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + strcpy(value->vstr, + nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])); break; case DEVLINK_PARAM_TYPE_BOOL: value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ? @@ -3100,7 +3122,10 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, return -EOPNOTSUPP; if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) { - param_item->driverinit_value = value; + if (param->type == DEVLINK_PARAM_TYPE_STRING) + strcpy(param_item->driverinit_value.vstr, value.vstr); + else + param_item->driverinit_value = value; param_item->driverinit_value_valid = true; } else { if (!param->set) @@ -3487,7 +3512,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, start_offset = *((u64 *)&cb->args[0]); err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize, - attrs, DEVLINK_ATTR_MAX, ops->policy, NULL); + attrs, DEVLINK_ATTR_MAX, ops->policy, cb->extack); if (err) goto out; @@ -4540,7 +4565,10 @@ int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, DEVLINK_PARAM_CMODE_DRIVERINIT)) return -EOPNOTSUPP; - *init_val = param_item->driverinit_value; + if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) + strcpy(init_val->vstr, param_item->driverinit_value.vstr); + else + *init_val = param_item->driverinit_value; return 0; } @@ -4571,7 +4599,10 @@ int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, DEVLINK_PARAM_CMODE_DRIVERINIT)) return -EOPNOTSUPP; - param_item->driverinit_value = init_val; + if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) + strcpy(param_item->driverinit_value.vstr, init_val.vstr); + else + param_item->driverinit_value = init_val; param_item->driverinit_value_valid = true; devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); @@ -4603,6 +4634,23 @@ void devlink_param_value_changed(struct devlink *devlink, u32 param_id) } EXPORT_SYMBOL_GPL(devlink_param_value_changed); +/** + * devlink_param_value_str_fill - Safely fill-up the string preventing + * from overflow of the preallocated buffer + * + * @dst_val: destination devlink_param_value + * @src: source buffer + */ +void devlink_param_value_str_fill(union devlink_param_value *dst_val, + const char *src) +{ + size_t len; + + len = strlcpy(dst_val->vstr, src, __DEVLINK_PARAM_MAX_STRING_VALUE); + WARN_ON(len >= __DEVLINK_PARAM_MAX_STRING_VALUE); +} +EXPORT_SYMBOL_GPL(devlink_param_value_str_fill); + /** * devlink_region_create - create a new address region * diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 234a0ec2e932..d05402868575 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * Some useful ethtool_ops methods that're device independent. @@ -539,47 +540,17 @@ struct ethtool_link_usettings { } link_modes; }; -/* Internal kernel helper to query a device ethtool_link_settings. - * - * Backward compatibility note: for compatibility with legacy drivers - * that implement only the ethtool_cmd API, this has to work with both - * drivers implementing get_link_ksettings API and drivers - * implementing get_settings API. When drivers implement get_settings - * and report ethtool_cmd deprecated fields - * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored - * because the resulting struct ethtool_link_settings does not report them. - */ +/* Internal kernel helper to query a device ethtool_link_settings. */ int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings) { - int err; - struct ethtool_cmd cmd; - ASSERT_RTNL(); - if (dev->ethtool_ops->get_link_ksettings) { - memset(link_ksettings, 0, sizeof(*link_ksettings)); - return dev->ethtool_ops->get_link_ksettings(dev, - link_ksettings); - } - - /* driver doesn't support %ethtool_link_ksettings API. revert to - * legacy %ethtool_cmd API, unless it's not supported either. - * TODO: remove when ethtool_ops::get_settings disappears internally - */ - if (!dev->ethtool_ops->get_settings) + if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; - memset(&cmd, 0, sizeof(cmd)); - cmd.cmd = ETHTOOL_GSET; - err = dev->ethtool_ops->get_settings(dev, &cmd); - if (err < 0) - return err; - - /* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt - */ - convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd); - return err; + memset(link_ksettings, 0, sizeof(*link_ksettings)); + return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings); } EXPORT_SYMBOL(__ethtool_get_link_ksettings); @@ -635,16 +606,7 @@ store_link_ksettings_for_user(void __user *to, return 0; } -/* Query device for its ethtool_link_settings. - * - * Backward compatibility note: this function must fail when driver - * does not implement ethtool::get_link_ksettings, even if legacy - * ethtool_ops::get_settings is implemented. This tells new versions - * of ethtool that they should use the legacy API %ETHTOOL_GSET for - * this driver, so that they can correctly access the ethtool_cmd - * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver - * implements ethtool_ops::get_settings anymore. - */ +/* Query device for its ethtool_link_settings. */ static int ethtool_get_link_ksettings(struct net_device *dev, void __user *useraddr) { @@ -652,7 +614,6 @@ static int ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings link_ksettings; ASSERT_RTNL(); - if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; @@ -699,16 +660,7 @@ static int ethtool_get_link_ksettings(struct net_device *dev, return store_link_ksettings_for_user(useraddr, &link_ksettings); } -/* Update device ethtool_link_settings. - * - * Backward compatibility note: this function must fail when driver - * does not implement ethtool::set_link_ksettings, even if legacy - * ethtool_ops::set_settings is implemented. This tells new versions - * of ethtool that they should use the legacy API %ETHTOOL_SSET for - * this driver, so that they can correctly update the ethtool_cmd - * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver - * implements ethtool_ops::get_settings anymore. - */ +/* Update device ethtool_link_settings. */ static int ethtool_set_link_ksettings(struct net_device *dev, void __user *useraddr) { @@ -746,51 +698,31 @@ static int ethtool_set_link_ksettings(struct net_device *dev, /* Query device for its ethtool_cmd settings. * - * Backward compatibility note: for compatibility with legacy ethtool, - * this has to work with both drivers implementing get_link_ksettings - * API and drivers implementing get_settings API. When drivers - * implement get_link_ksettings and report higher link mode bits, a - * kernel warning is logged once (with name of 1st driver/device) to - * recommend user to upgrade ethtool, but the command is successful - * (only the lower link mode bits reported back to user). + * Backward compatibility note: for compatibility with legacy ethtool, this is + * now implemented via get_link_ksettings. When driver reports higher link mode + * bits, a kernel warning is logged once (with name of 1st driver/device) to + * recommend user to upgrade ethtool, but the command is successful (only the + * lower link mode bits reported back to user). Deprecated fields from + * ethtool_cmd (transceiver/maxrxpkt/maxtxpkt) are always set to zero. */ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) { + struct ethtool_link_ksettings link_ksettings; struct ethtool_cmd cmd; + int err; ASSERT_RTNL(); + if (!dev->ethtool_ops->get_link_ksettings) + return -EOPNOTSUPP; - if (dev->ethtool_ops->get_link_ksettings) { - /* First, use link_ksettings API if it is supported */ - int err; - struct ethtool_link_ksettings link_ksettings; - - memset(&link_ksettings, 0, sizeof(link_ksettings)); - err = dev->ethtool_ops->get_link_ksettings(dev, - &link_ksettings); - if (err < 0) - return err; - convert_link_ksettings_to_legacy_settings(&cmd, - &link_ksettings); - - /* send a sensible cmd tag back to user */ - cmd.cmd = ETHTOOL_GSET; - } else { - /* driver doesn't support %ethtool_link_ksettings - * API. revert to legacy %ethtool_cmd API, unless it's - * not supported either. - */ - int err; - - if (!dev->ethtool_ops->get_settings) - return -EOPNOTSUPP; + memset(&link_ksettings, 0, sizeof(link_ksettings)); + err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); + if (err < 0) + return err; + convert_link_ksettings_to_legacy_settings(&cmd, &link_ksettings); - memset(&cmd, 0, sizeof(cmd)); - cmd.cmd = ETHTOOL_GSET; - err = dev->ethtool_ops->get_settings(dev, &cmd); - if (err < 0) - return err; - } + /* send a sensible cmd tag back to user */ + cmd.cmd = ETHTOOL_GSET; if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; @@ -800,48 +732,29 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) /* Update device link settings with given ethtool_cmd. * - * Backward compatibility note: for compatibility with legacy ethtool, - * this has to work with both drivers implementing set_link_ksettings - * API and drivers implementing set_settings API. When drivers - * implement set_link_ksettings and user's request updates deprecated - * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel - * warning is logged once (with name of 1st driver/device) to - * recommend user to upgrade ethtool, and the request is rejected. + * Backward compatibility note: for compatibility with legacy ethtool, this is + * now always implemented via set_link_settings. When user's request updates + * deprecated ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel + * warning is logged once (with name of 1st driver/device) to recommend user to + * upgrade ethtool, and the request is rejected. */ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) { + struct ethtool_link_ksettings link_ksettings; struct ethtool_cmd cmd; ASSERT_RTNL(); if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; - - /* first, try new %ethtool_link_ksettings API. */ - if (dev->ethtool_ops->set_link_ksettings) { - struct ethtool_link_ksettings link_ksettings; - - if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, - &cmd)) - return -EINVAL; - - link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS; - link_ksettings.base.link_mode_masks_nwords - = __ETHTOOL_LINK_MODE_MASK_NU32; - return dev->ethtool_ops->set_link_ksettings(dev, - &link_ksettings); - } - - /* legacy %ethtool_cmd API */ - - /* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings - * disappears internally - */ - - if (!dev->ethtool_ops->set_settings) + if (!dev->ethtool_ops->set_link_ksettings) return -EOPNOTSUPP; - return dev->ethtool_ops->set_settings(dev, &cmd); + if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, &cmd)) + return -EINVAL; + link_ksettings.base.link_mode_masks_nwords = + __ETHTOOL_LINK_MODE_MASK_NU32; + return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); } static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, @@ -1015,6 +928,9 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, return -EINVAL; } + if (info.cmd != cmd) + return -EINVAL; + if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) @@ -1483,6 +1399,7 @@ static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) { struct ethtool_wolinfo wol; + int ret; if (!dev->ethtool_ops->set_wol) return -EOPNOTSUPP; @@ -1490,7 +1407,13 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) if (copy_from_user(&wol, useraddr, sizeof(wol))) return -EFAULT; - return dev->ethtool_ops->set_wol(dev, &wol); + ret = dev->ethtool_ops->set_wol(dev, &wol); + if (ret) + return ret; + + dev->wol_enabled = !!wol.wolopts; + + return 0; } static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) @@ -1743,8 +1666,10 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev, static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { - struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS }; + struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; + u16 from_channel, to_channel; u32 max_rx_in_use = 0; + unsigned int i; if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) return -EOPNOTSUPP; @@ -1752,13 +1677,13 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, if (copy_from_user(&channels, useraddr, sizeof(channels))) return -EFAULT; - dev->ethtool_ops->get_channels(dev, &max); + dev->ethtool_ops->get_channels(dev, &curr); /* ensure new counts are within the maximums */ - if ((channels.rx_count > max.max_rx) || - (channels.tx_count > max.max_tx) || - (channels.combined_count > max.max_combined) || - (channels.other_count > max.max_other)) + if (channels.rx_count > curr.max_rx || + channels.tx_count > curr.max_tx || + channels.combined_count > curr.max_combined || + channels.other_count > curr.max_other) return -EINVAL; /* ensure the new Rx count fits within the configured Rx flow @@ -1768,6 +1693,14 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, (channels.combined_count + channels.rx_count) <= max_rx_in_use) return -EINVAL; + /* Disabling channels, query zero-copy AF_XDP sockets */ + from_channel = channels.combined_count + + min(channels.rx_count, channels.tx_count); + to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count); + for (i = from_channel; i < to_channel; i++) + if (xdp_get_umem_from_qid(dev, i)) + return -EINVAL; + return dev->ethtool_ops->set_channels(dev, &channels); } @@ -2462,13 +2395,17 @@ roll_back: return ret; } -static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr) +static int ethtool_set_per_queue(struct net_device *dev, + void __user *useraddr, u32 sub_cmd) { struct ethtool_per_queue_op per_queue_opt; if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt))) return -EFAULT; + if (per_queue_opt.sub_command != sub_cmd) + return -EINVAL; + switch (per_queue_opt.sub_command) { case ETHTOOL_GCOALESCE: return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt); @@ -2839,7 +2776,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) rc = ethtool_get_phy_stats(dev, useraddr); break; case ETHTOOL_PERQUEUE: - rc = ethtool_set_per_queue(dev, useraddr); + rc = ethtool_set_per_queue(dev, useraddr, sub_cmd); break; case ETHTOOL_GLINKSETTINGS: rc = ethtool_get_link_ksettings(dev, useraddr); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 0ff3953f64aa..ffbb827723a2 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1063,13 +1063,47 @@ skip: return err; } +static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct fib_rule_hdr *frh; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); + return -EINVAL; + } + + frh = nlmsg_data(nlh); + if (frh->dst_len || frh->src_len || frh->tos || frh->table || + frh->res1 || frh->res2 || frh->action || frh->flags) { + NL_SET_ERR_MSG(extack, + "Invalid values in header for fib rule dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request"); + return -EINVAL; + } + + return 0; +} + static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; int idx = 0, family; - family = rtnl_msg_family(cb->nlh); + if (cb->strict_check) { + int err = fib_valid_dumprule_req(nlh, cb->extack); + + if (err < 0) + return err; + } + + family = rtnl_msg_family(nlh); if (family != AF_UNSPEC) { /* Protocol specific dump request */ ops = lookup_rules_ops(net, family); diff --git a/net/core/filter.c b/net/core/filter.c index 5e00f2b85a56..e521c5ebc7d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -58,13 +59,17 @@ #include #include #include +#include #include #include #include +#include +#include #include #include #include #include +#include #include #include #include @@ -2138,123 +2143,7 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, - struct bpf_map *, map, void *, key, u64, flags) -{ - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - - /* If user passes invalid input drop the packet. */ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); - if (!tcb->bpf.sk_redir) - return SK_DROP; - - return SK_PASS; -} - -static const struct bpf_func_proto bpf_sk_redirect_hash_proto = { - .func = bpf_sk_redirect_hash, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, - struct bpf_map *, map, u32, key, u64, flags) -{ - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - - /* If user passes invalid input drop the packet. */ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); - if (!tcb->bpf.sk_redir) - return SK_DROP; - - return SK_PASS; -} - -struct sock *do_sk_redirect_map(struct sk_buff *skb) -{ - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - - return tcb->bpf.sk_redir; -} - -static const struct bpf_func_proto bpf_sk_redirect_map_proto = { - .func = bpf_sk_redirect_map, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg, - struct bpf_map *, map, void *, key, u64, flags) -{ - /* If user passes invalid input drop the packet. */ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - msg->flags = flags; - msg->sk_redir = __sock_hash_lookup_elem(map, key); - if (!msg->sk_redir) - return SK_DROP; - - return SK_PASS; -} - -static const struct bpf_func_proto bpf_msg_redirect_hash_proto = { - .func = bpf_msg_redirect_hash, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, - struct bpf_map *, map, u32, key, u64, flags) -{ - /* If user passes invalid input drop the packet. */ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - msg->flags = flags; - msg->sk_redir = __sock_map_lookup_elem(map, key); - if (!msg->sk_redir) - return SK_DROP; - - return SK_PASS; -} - -struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) -{ - return msg->sk_redir; -} - -static const struct bpf_func_proto bpf_msg_redirect_map_proto = { - .func = bpf_msg_redirect_map, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes) +BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) { msg->apply_bytes = bytes; return 0; @@ -2268,7 +2157,7 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes) +BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) { msg->cork_bytes = bytes; return 0; @@ -2282,45 +2171,37 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .arg2_type = ARG_ANYTHING, }; -#define sk_msg_iter_var(var) \ - do { \ - var++; \ - if (var == MAX_SKB_FRAGS) \ - var = 0; \ - } while (0) - -BPF_CALL_4(bpf_msg_pull_data, - struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) +BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, + u32, end, u64, flags) { - unsigned int len = 0, offset = 0, copy = 0, poffset = 0; - int bytes = end - start, bytes_sg_total; - struct scatterlist *sg = msg->sg_data; - int first_sg, last_sg, i, shift; - unsigned char *p, *to, *from; + u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start; + u32 first_sge, last_sge, i, shift, bytes_sg_total; + struct scatterlist *sge; + u8 *raw, *to, *from; struct page *page; if (unlikely(flags || end <= start)) return -EINVAL; /* First find the starting scatterlist element */ - i = msg->sg_start; + i = msg->sg.start; do { - len = sg[i].length; + len = sk_msg_elem(msg, i)->length; if (start < offset + len) break; offset += len; - sk_msg_iter_var(i); - } while (i != msg->sg_end); + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); if (unlikely(start >= offset + len)) return -EINVAL; - first_sg = i; + first_sge = i; /* The start may point into the sg element so we need to also * account for the headroom. */ bytes_sg_total = start - offset + bytes; - if (!msg->sg_copy[i] && bytes_sg_total <= len) + if (!msg->sg.copy[i] && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist @@ -2334,12 +2215,12 @@ BPF_CALL_4(bpf_msg_pull_data, * will copy the entire sg entry. */ do { - copy += sg[i].length; - sk_msg_iter_var(i); + copy += sk_msg_elem(msg, i)->length; + sk_msg_iter_var_next(i); if (bytes_sg_total <= copy) break; - } while (i != msg->sg_end); - last_sg = i; + } while (i != msg->sg.end); + last_sge = i; if (unlikely(bytes_sg_total > copy)) return -EINVAL; @@ -2348,63 +2229,61 @@ BPF_CALL_4(bpf_msg_pull_data, get_order(copy)); if (unlikely(!page)) return -ENOMEM; - p = page_address(page); - i = first_sg; + raw = page_address(page); + i = first_sge; do { - from = sg_virt(&sg[i]); - len = sg[i].length; - to = p + poffset; + sge = sk_msg_elem(msg, i); + from = sg_virt(sge); + len = sge->length; + to = raw + poffset; memcpy(to, from, len); poffset += len; - sg[i].length = 0; - put_page(sg_page(&sg[i])); + sge->length = 0; + put_page(sg_page(sge)); - sk_msg_iter_var(i); - } while (i != last_sg); + sk_msg_iter_var_next(i); + } while (i != last_sge); - sg[first_sg].length = copy; - sg_set_page(&sg[first_sg], page, copy, 0); + sg_set_page(&msg->sg.data[first_sge], page, copy, 0); /* To repair sg ring we need to shift entries. If we only * had a single entry though we can just replace it and * be done. Otherwise walk the ring and shift the entries. */ - WARN_ON_ONCE(last_sg == first_sg); - shift = last_sg > first_sg ? - last_sg - first_sg - 1 : - MAX_SKB_FRAGS - first_sg + last_sg - 1; + WARN_ON_ONCE(last_sge == first_sge); + shift = last_sge > first_sge ? + last_sge - first_sge - 1 : + MAX_SKB_FRAGS - first_sge + last_sge - 1; if (!shift) goto out; - i = first_sg; - sk_msg_iter_var(i); + i = first_sge; + sk_msg_iter_var_next(i); do { - int move_from; + u32 move_from; - if (i + shift >= MAX_SKB_FRAGS) - move_from = i + shift - MAX_SKB_FRAGS; + if (i + shift >= MAX_MSG_FRAGS) + move_from = i + shift - MAX_MSG_FRAGS; else move_from = i + shift; - - if (move_from == msg->sg_end) + if (move_from == msg->sg.end) break; - sg[i] = sg[move_from]; - sg[move_from].length = 0; - sg[move_from].page_link = 0; - sg[move_from].offset = 0; - - sk_msg_iter_var(i); + msg->sg.data[i] = msg->sg.data[move_from]; + msg->sg.data[move_from].length = 0; + msg->sg.data[move_from].page_link = 0; + msg->sg.data[move_from].offset = 0; + sk_msg_iter_var_next(i); } while (1); - msg->sg_end -= shift; - if (msg->sg_end < 0) - msg->sg_end += MAX_SKB_FRAGS; + + msg->sg.end = msg->sg.end - shift > msg->sg.end ? + msg->sg.end - shift + MAX_MSG_FRAGS : + msg->sg.end - shift; out: - msg->data = sg_virt(&sg[first_sg]) + start - offset; + msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; msg->data_end = msg->data + bytes; - return 0; } @@ -2418,6 +2297,137 @@ static const struct bpf_func_proto bpf_msg_pull_data_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; + u32 new, i = 0, l, space, copy = 0, offset = 0; + u8 *raw, *to, *from; + struct page *page; + + if (unlikely(flags)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + offset += l; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + if (start >= offset + l) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + /* If no space available will fallback to copy, we need at + * least one scatterlist elem available to push data into + * when start aligns to the beginning of an element or two + * when it falls inside an element. We handle the start equals + * offset case because its the common case for inserting a + * header. + */ + if (!space || (space == 1 && start != offset)) + copy = msg->sg.data[i].length; + + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, + get_order(copy + len)); + if (unlikely(!page)) + return -ENOMEM; + + if (copy) { + int front, back; + + raw = page_address(page); + + psge = sk_msg_elem(msg, i); + front = start - offset; + back = psge->length - front; + from = sg_virt(psge); + + if (front) + memcpy(raw, from, front); + + if (back) { + from += front; + to = raw + front + len; + + memcpy(to, from, back); + } + + put_page(sg_page(psge)); + } else if (start - offset) { + psge = sk_msg_elem(msg, i); + rsge = sk_msg_elem_cpy(msg, i); + + psge->length = start - offset; + rsge.length -= psge->length; + rsge.offset += start; + + sk_msg_iter_var_next(i); + sg_unmark_end(psge); + sk_msg_iter_next(msg, end); + } + + /* Slot(s) to place newly allocated data */ + new = i; + + /* Shift one or two slots as needed */ + if (!copy) { + sge = sk_msg_elem_cpy(msg, i); + + sk_msg_iter_var_next(i); + sg_unmark_end(&sge); + sk_msg_iter_next(msg, end); + + nsge = sk_msg_elem_cpy(msg, i); + if (rsge.length) { + sk_msg_iter_var_next(i); + nnsge = sk_msg_elem_cpy(msg, i); + } + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sge = nsge; + sk_msg_iter_var_next(i); + if (rsge.length) { + nsge = nnsge; + nnsge = sk_msg_elem_cpy(msg, i); + } else { + nsge = sk_msg_elem_cpy(msg, i); + } + } + } + + /* Place newly allocated data buffer */ + sk_mem_charge(msg->sk, len); + msg->sg.size += len; + msg->sg.copy[new] = false; + sg_set_page(&msg->sg.data[new], page, len + copy, 0); + if (rsge.length) { + get_page(sg_page(&rsge)); + sk_msg_iter_var_next(new); + msg->sg.data[new] = rsge; + } + + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_push_data_proto = { + .func = bpf_msg_push_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3176,6 +3186,32 @@ static int __bpf_tx_xdp(struct net_device *dev, return 0; } +static noinline int +xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri) +{ + struct net_device *fwd; + u32 index = ri->ifindex; + int err; + + fwd = dev_get_by_index_rcu(dev_net(dev), index); + ri->ifindex = 0; + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + err = __bpf_tx_xdp(fwd, NULL, xdp, 0); + if (unlikely(err)) + goto err; + + _trace_xdp_redirect(dev, xdp_prog, index); + return 0; +err: + _trace_xdp_redirect_err(dev, xdp_prog, index, err); + return err; +} + static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_map *map, struct xdp_buff *xdp, @@ -3188,7 +3224,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_dtab_netdev *dst = fwd; err = dev_map_enqueue(dst, xdp, dev_rx); - if (err) + if (unlikely(err)) return err; __dev_map_insert_ctx(map, index); break; @@ -3197,7 +3233,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_cpu_map_entry *rcpu = fwd; err = cpu_map_enqueue(rcpu, xdp, dev_rx); - if (err) + if (unlikely(err)) return err; __cpu_map_insert_ctx(map, index); break; @@ -3238,7 +3274,7 @@ void xdp_do_flush_map(void) } EXPORT_SYMBOL_GPL(xdp_do_flush_map); -static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) +static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) { switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: @@ -3270,9 +3306,9 @@ void bpf_clear_redirect_map(struct bpf_map *map) } static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, struct bpf_map *map) + struct bpf_prog *xdp_prog, struct bpf_map *map, + struct bpf_redirect_info *ri) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); u32 index = ri->ifindex; void *fwd = NULL; int err; @@ -3281,11 +3317,11 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, WRITE_ONCE(ri->map, NULL); fwd = __xdp_map_lookup_elem(map, index); - if (!fwd) { + if (unlikely(!fwd)) { err = -EINVAL; goto err; } - if (ri->map_to_flush && ri->map_to_flush != map) + if (ri->map_to_flush && unlikely(ri->map_to_flush != map)) xdp_do_flush_map(); err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); @@ -3305,29 +3341,11 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = READ_ONCE(ri->map); - struct net_device *fwd; - u32 index = ri->ifindex; - int err; - - if (map) - return xdp_do_redirect_map(dev, xdp, xdp_prog, map); - - fwd = dev_get_by_index_rcu(dev_net(dev), index); - ri->ifindex = 0; - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } - err = __bpf_tx_xdp(fwd, NULL, xdp, 0); - if (unlikely(err)) - goto err; + if (likely(map)) + return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri); - _trace_xdp_redirect(dev, xdp_prog, index); - return 0; -err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); - return err; + return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri); } EXPORT_SYMBOL_GPL(xdp_do_redirect); @@ -3915,8 +3933,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_userlocks |= SOCK_SNDBUF_LOCK; sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); break; - case SO_MAX_PACING_RATE: - sk->sk_max_pacing_rate = val; + case SO_MAX_PACING_RATE: /* 32bit version */ + sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val; sk->sk_pacing_rate = min(sk->sk_pacing_rate, sk->sk_max_pacing_rate); break; @@ -4013,6 +4031,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, tp->snd_ssthresh = val; } break; + case TCP_SAVE_SYN: + if (val < 0 || val > 1) + ret = -EINVAL; + else + tp->save_syn = val; + break; default: ret = -EINVAL; } @@ -4042,17 +4066,29 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, if (!sk_fullsock(sk)) goto err_clear; - #ifdef CONFIG_INET if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { - if (optname == TCP_CONGESTION) { - struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk; + struct tcp_sock *tp; + + switch (optname) { + case TCP_CONGESTION: + icsk = inet_csk(sk); if (!icsk->icsk_ca_ops || optlen <= 1) goto err_clear; strncpy(optval, icsk->icsk_ca_ops->name, optlen); optval[optlen - 1] = 0; - } else { + break; + case TCP_SAVED_SYN: + tp = tcp_sk(sk); + + if (optlen <= 0 || !tp->saved_syn || + optlen > tp->saved_syn[0]) + goto err_clear; + memcpy(optval, tp->saved_syn + 1, optlen); + break; + default: goto err_clear; } } else if (level == SOL_IP) { @@ -4787,6 +4823,149 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { }; #endif /* CONFIG_IPV6_SEG6_BPF */ +#ifdef CONFIG_INET +static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, + struct sk_buff *skb, u8 family, u8 proto) +{ + bool refcounted = false; + struct sock *sk = NULL; + int dif = 0; + + if (skb->dev) + dif = skb->dev->ifindex; + + if (family == AF_INET) { + __be32 src4 = tuple->ipv4.saddr; + __be32 dst4 = tuple->ipv4.daddr; + int sdif = inet_sdif(skb); + + if (proto == IPPROTO_TCP) + sk = __inet_lookup(net, &tcp_hashinfo, skb, 0, + src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, &refcounted); + else + sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, &udp_table, skb); +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; + struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; + u16 hnum = ntohs(tuple->ipv6.dport); + int sdif = inet6_sdif(skb); + + if (proto == IPPROTO_TCP) + sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, + src6, tuple->ipv6.sport, + dst6, hnum, + dif, sdif, &refcounted); + else if (likely(ipv6_bpf_stub)) + sk = ipv6_bpf_stub->udp6_lib_lookup(net, + src6, tuple->ipv6.sport, + dst6, hnum, + dif, sdif, + &udp_table, skb); +#endif + } + + if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + sk = NULL; + } + return sk; +} + +/* bpf_sk_lookup performs the core lookup for different types of sockets, + * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. + * Returns the socket as an 'unsigned long' to simplify the casting in the + * callers to satisfy BPF_CALL declarations. + */ +static unsigned long +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + struct sock *sk = NULL; + u8 family = AF_UNSPEC; + struct net *net; + + family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; + if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) + goto out; + + if (skb->dev) + caller_net = dev_net(skb->dev); + else + caller_net = sock_net(skb->sk); + if (netns_id) { + net = get_net_ns_by_id(caller_net, netns_id); + if (unlikely(!net)) + goto out; + sk = sk_lookup(net, tuple, skb, family, proto); + put_net(net); + } else { + net = caller_net; + sk = sk_lookup(net, tuple, skb, family, proto); + } + + if (sk) + sk = sk_to_full_sk(sk); +out: + return (unsigned long) sk; +} + +BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { + .func = bpf_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { + .func = bpf_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_sk_release, struct sock *, sk) +{ + if (!sock_flag(sk, SOCK_RCU_FREE)) + sock_gen_put(sk); + return 0; +} + +static const struct bpf_func_proto bpf_sk_release_proto = { + .func = bpf_sk_release, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, +}; +#endif /* CONFIG_INET */ + bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || @@ -4806,6 +4985,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_xdp_adjust_head || func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || + func == bpf_msg_push_data || func == bpf_xdp_adjust_tail || #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || @@ -4828,6 +5008,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: @@ -4992,6 +5178,14 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; +#endif +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; #endif default: return bpf_base_func_proto(func_id); @@ -5025,6 +5219,9 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +const struct bpf_func_proto bpf_sock_map_update_proto __weak; +const struct bpf_func_proto bpf_sock_hash_update_proto __weak; + static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5048,6 +5245,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +const struct bpf_func_proto bpf_msg_redirect_map_proto __weak; +const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak; + static const struct bpf_func_proto * sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5062,13 +5262,16 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; + case BPF_FUNC_msg_push_data: + return &bpf_msg_push_data_proto; default: return bpf_base_func_proto(func_id); } } +const struct bpf_func_proto bpf_sk_redirect_map_proto __weak; +const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak; + static const struct bpf_func_proto * sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5091,8 +5294,25 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; default: return bpf_base_func_proto(func_id); } @@ -5216,6 +5436,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != size_default) return false; break; + case bpf_ctx_range(struct __sk_buff, flow_keys): + if (size != sizeof(struct bpf_flow_keys *)) + return false; + break; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { @@ -5241,6 +5465,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): + case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5257,6 +5482,46 @@ static bool sk_filter_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } +static bool cg_skb_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, flow_keys): + return false; + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_end): + if (!capable(CAP_SYS_ADMIN)) + return false; + break; + } + + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -5266,6 +5531,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, flow_keys): return false; } @@ -5351,23 +5617,38 @@ static bool __sock_filter_check_size(int off, int size, return size == size_default; } -static bool sock_filter_is_valid_access(int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) +bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct bpf_sock)) return false; if (off % size != 0) return false; - if (!__sock_filter_check_attach_type(off, type, - prog->expected_attach_type)) - return false; if (!__sock_filter_check_size(off, size, info)) return false; return true; } +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (!bpf_sock_is_valid_access(off, size, type, info)) + return false; + return __sock_filter_check_attach_type(off, type, + prog->expected_attach_type); +} + +static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + /* Neither direct read nor direct write requires any preliminary + * action. + */ + return 0; +} + static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { @@ -5476,6 +5757,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; + case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5677,6 +5959,7 @@ static bool sk_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, flow_keys): return false; } @@ -5736,6 +6019,39 @@ static bool sk_msg_is_valid_access(int off, int size, return true; } +static bool flow_dissector_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + case bpf_ctx_range(struct __sk_buff, flow_keys): + info->reg_type = PTR_TO_FLOW_KEYS; + break; + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range_till(struct __sk_buff, family, local_port): + return false; + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -6030,15 +6346,24 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct sock_common, skc_num, 2, target_size)); break; + + case offsetof(struct __sk_buff, flow_keys): + off = si->off; + off -= offsetof(struct __sk_buff, flow_keys); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, flow_keys); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; } return insn - insn_buf; } -static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, - const struct bpf_insn *si, - struct bpf_insn *insn_buf, - struct bpf_prog *prog, u32 *target_size) +u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; @@ -6748,22 +7073,22 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, switch (si->off) { case offsetof(struct sk_msg_md, data): - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, data)); + offsetof(struct sk_msg, data)); break; case offsetof(struct sk_msg_md, data_end): - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, data_end)); + offsetof(struct sk_msg, data_end)); break; case offsetof(struct sk_msg_md, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; @@ -6772,9 +7097,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; @@ -6784,9 +7109,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); @@ -6801,9 +7126,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, off = si->off; off -= offsetof(struct sk_msg_md, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + @@ -6822,9 +7147,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, off = si->off; off -= offsetof(struct sk_msg_md, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + @@ -6838,9 +7163,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD @@ -6852,9 +7177,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; @@ -6890,6 +7215,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, + .gen_prologue = bpf_noop_prologue, }; const struct bpf_prog_ops xdp_prog_ops = { @@ -6898,7 +7224,7 @@ const struct bpf_prog_ops xdp_prog_ops = { const struct bpf_verifier_ops cg_skb_verifier_ops = { .get_func_proto = cg_skb_func_proto, - .is_valid_access = sk_filter_is_valid_access, + .is_valid_access = cg_skb_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; @@ -6950,7 +7276,7 @@ const struct bpf_prog_ops lwt_seg6local_prog_ops = { const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, - .convert_ctx_access = sock_filter_convert_ctx_access, + .convert_ctx_access = bpf_sock_convert_ctx_access, }; const struct bpf_prog_ops cg_sock_prog_ops = { @@ -6988,11 +7314,21 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = { .get_func_proto = sk_msg_func_proto, .is_valid_access = sk_msg_is_valid_access, .convert_ctx_access = sk_msg_convert_ctx_access, + .gen_prologue = bpf_noop_prologue, }; const struct bpf_prog_ops sk_msg_prog_ops = { }; +const struct bpf_verifier_ops flow_dissector_verifier_ops = { + .get_func_proto = flow_dissector_func_proto, + .is_valid_access = flow_dissector_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops flow_dissector_prog_ops = { +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index ce9eeeb7c024..676f3ad629f9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -25,6 +25,9 @@ #include #include #include +#include + +static DEFINE_MUTEX(flow_dissector_mutex); static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -62,6 +65,44 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + struct bpf_prog *attached; + struct net *net; + + net = current->nsproxy->net_ns; + mutex_lock(&flow_dissector_mutex); + attached = rcu_dereference_protected(net->flow_dissector_prog, + lockdep_is_held(&flow_dissector_mutex)); + if (attached) { + /* Only one BPF program can be attached at a time */ + mutex_unlock(&flow_dissector_mutex); + return -EEXIST; + } + rcu_assign_pointer(net->flow_dissector_prog, prog); + mutex_unlock(&flow_dissector_mutex); + return 0; +} + +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) +{ + struct bpf_prog *attached; + struct net *net; + + net = current->nsproxy->net_ns; + mutex_lock(&flow_dissector_mutex); + attached = rcu_dereference_protected(net->flow_dissector_prog, + lockdep_is_held(&flow_dissector_mutex)); + if (!attached) { + mutex_unlock(&flow_dissector_mutex); + return -ENOENT; + } + bpf_prog_put(attached); + RCU_INIT_POINTER(net->flow_dissector_prog, NULL); + mutex_unlock(&flow_dissector_mutex); + return 0; +} /** * skb_flow_get_be16 - extract be16 entity * @skb: sk_buff to extract from @@ -382,8 +423,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, offset += sizeof(struct gre_base_hdr); if (hdr->flags & GRE_CSUM) - offset += sizeof(((struct gre_full_hdr *) 0)->csum) + - sizeof(((struct gre_full_hdr *) 0)->reserved1); + offset += FIELD_SIZEOF(struct gre_full_hdr, csum) + + FIELD_SIZEOF(struct gre_full_hdr, reserved1); if (hdr->flags & GRE_KEY) { const __be32 *keyid; @@ -405,11 +446,11 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, else key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; } - offset += sizeof(((struct gre_full_hdr *) 0)->key); + offset += FIELD_SIZEOF(struct gre_full_hdr, key); } if (hdr->flags & GRE_SEQ) - offset += sizeof(((struct pptp_gre_header *) 0)->seq); + offset += FIELD_SIZEOF(struct pptp_gre_header, seq); if (gre_ver == 0) { if (*p_proto == htons(ETH_P_TEB)) { @@ -436,7 +477,7 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, u8 *ppp_hdr; if (hdr->flags & GRE_ACK) - offset += sizeof(((struct pptp_gre_header *) 0)->ack); + offset += FIELD_SIZEOF(struct pptp_gre_header, ack); ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_ppp_hdr), @@ -588,6 +629,60 @@ static bool skb_flow_dissect_allowed(int *num_hdrs) return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS); } +static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_control *key_control; + struct flow_dissector_key_basic *key_basic; + struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_ports *key_ports; + + key_control = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL, + target_container); + key_control->thoff = flow_keys->thoff; + if (flow_keys->is_frag) + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + if (flow_keys->is_first_frag) + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (flow_keys->is_encap) + key_control->flags |= FLOW_DIS_ENCAPSULATION; + + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); + key_basic->n_proto = flow_keys->n_proto; + key_basic->ip_proto = flow_keys->ip_proto; + + if (flow_keys->addr_proto == ETH_P_IP && + dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); + key_addrs->v4addrs.src = flow_keys->ipv4_src; + key_addrs->v4addrs.dst = flow_keys->ipv4_dst; + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } else if (flow_keys->addr_proto == ETH_P_IPV6 && + dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); + memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src, + sizeof(key_addrs->v6addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->src = flow_keys->sport; + key_ports->dst = flow_keys->dport; + } +} + /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ -619,6 +714,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; + struct bpf_prog *attached = NULL; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -658,6 +754,50 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); + rcu_read_lock(); + if (skb) { + if (skb->dev) + attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog); + else if (skb->sk) + attached = rcu_dereference(sock_net(skb->sk)->flow_dissector_prog); + else + WARN_ON_ONCE(1); + } + if (attached) { + /* Note that even though the const qualifier is discarded + * throughout the execution of the BPF program, all changes(the + * control block) are reverted after the BPF program returns. + * Therefore, __skb_flow_dissect does not alter the skb. + */ + struct bpf_flow_keys flow_keys = {}; + struct bpf_skb_data_end cb_saved; + struct bpf_skb_data_end *cb; + u32 result; + + cb = (struct bpf_skb_data_end *)skb->cb; + + /* Save Control Block */ + memcpy(&cb_saved, cb, sizeof(cb_saved)); + memset(cb, 0, sizeof(cb_saved)); + + /* Pass parameters to the BPF program */ + cb->qdisc_cb.flow_keys = &flow_keys; + flow_keys.nhoff = nhoff; + + bpf_compute_data_pointers((struct sk_buff *)skb); + result = BPF_PROG_RUN(attached, skb); + + /* Restore state */ + memcpy(cb, &cb_saved, sizeof(cb_saved)); + + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + key_control->thoff = min_t(u16, key_control->thoff, skb->len); + rcu_read_unlock(); + return result == BPF_OK; + } + rcu_read_unlock(); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 188d693cb251..9bf1b9ad1780 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -162,6 +162,34 @@ __gnet_stats_copy_basic(const seqcount_t *running, } EXPORT_SYMBOL(__gnet_stats_copy_basic); +static int +___gnet_stats_copy_basic(const seqcount_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b, + int type) +{ + struct gnet_stats_basic_packed bstats = {0}; + + __gnet_stats_copy_basic(running, &bstats, cpu, b); + + if (d->compat_tc_stats && type == TCA_STATS_BASIC) { + d->tc_stats.bytes = bstats.bytes; + d->tc_stats.packets = bstats.packets; + } + + if (d->tail) { + struct gnet_stats_basic sb; + + memset(&sb, 0, sizeof(sb)); + sb.bytes = bstats.bytes; + sb.packets = bstats.packets; + return gnet_stats_copy(d, type, &sb, sizeof(sb), + TCA_STATS_PAD); + } + return 0; +} + /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV * @running: seqcount_t pointer @@ -181,28 +209,35 @@ gnet_stats_copy_basic(const seqcount_t *running, struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b) { - struct gnet_stats_basic_packed bstats = {0}; - - __gnet_stats_copy_basic(running, &bstats, cpu, b); - - if (d->compat_tc_stats) { - d->tc_stats.bytes = bstats.bytes; - d->tc_stats.packets = bstats.packets; - } - - if (d->tail) { - struct gnet_stats_basic sb; - - memset(&sb, 0, sizeof(sb)); - sb.bytes = bstats.bytes; - sb.packets = bstats.packets; - return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb), - TCA_STATS_PAD); - } - return 0; + return ___gnet_stats_copy_basic(running, d, cpu, b, + TCA_STATS_BASIC); } EXPORT_SYMBOL(gnet_stats_copy_basic); +/** + * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV + * @running: seqcount_t pointer + * @d: dumping handle + * @cpu: copy statistic per cpu + * @b: basic statistics + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_basic_hw(const seqcount_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) +{ + return ___gnet_stats_copy_basic(running, d, cpu, b, + TCA_STATS_BASIC_HW); +} +EXPORT_SYMBOL(gnet_stats_copy_basic_hw); + /** * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV * @d: dumping handle diff --git a/net/core/link_watch.c b/net/core/link_watch.c index e38e641e98d5..7f51efb2b3ab 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -155,7 +155,7 @@ static void linkwatch_do_dev(struct net_device *dev) clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); rfc2863_policy(dev); - if (dev->flags & IFF_UP) { + if (dev->flags & IFF_UP && netif_device_present(dev)) { if (netif_carrier_ok(dev)) dev_activate(dev); else diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 91592fceeaad..41954e42a2de 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -232,7 +232,8 @@ static void pneigh_queue_purge(struct sk_buff_head *list) } } -static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) +static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) { int i; struct neigh_hash_table *nht; @@ -250,6 +251,10 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) np = &n->next; continue; } + if (skip_perm && n->nud_state & NUD_PERMANENT) { + np = &n->next; + continue; + } rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); @@ -285,21 +290,35 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) { write_lock_bh(&tbl->lock); - neigh_flush_dev(tbl, dev); + neigh_flush_dev(tbl, dev, false); write_unlock_bh(&tbl->lock); } EXPORT_SYMBOL(neigh_changeaddr); -int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) { write_lock_bh(&tbl->lock); - neigh_flush_dev(tbl, dev); + neigh_flush_dev(tbl, dev, skip_perm); pneigh_ifdown_and_unlock(tbl, dev); del_timer_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue); return 0; } + +int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev) +{ + __neigh_ifdown(tbl, dev, true); + return 0; +} +EXPORT_SYMBOL(neigh_carrier_down); + +int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +{ + __neigh_ifdown(tbl, dev, false); + return 0; +} EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) @@ -1148,8 +1167,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh->nud_state = new; err = 0; notify = old & NUD_VALID; - if (((old & (NUD_INCOMPLETE | NUD_PROBE)) || - (flags & NEIGH_UPDATE_F_ADMIN)) && + if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && (new & NUD_FAILED)) { neigh_invalidate(neigh); notify = 1; @@ -1280,11 +1298,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh->arp_queue_len_bytes = 0; } out: - if (update_isrouter) { - neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ? - (neigh->flags | NTF_ROUTER) : - (neigh->flags & ~NTF_ROUTER); - } + if (update_isrouter) + neigh_update_is_router(neigh, flags, ¬ify); write_unlock_bh(&neigh->lock); if (notify) @@ -1712,7 +1727,8 @@ out: static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { - int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; + int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_OVERRIDE_ISROUTER; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; @@ -1787,12 +1803,16 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) - flags &= ~NEIGH_UPDATE_F_OVERRIDE; + flags &= ~(NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_OVERRIDE_ISROUTER); } if (ndm->ndm_flags & NTF_EXT_LEARNED) flags |= NEIGH_UPDATE_F_EXT_LEARNED; + if (ndm->ndm_flags & NTF_ROUTER) + flags |= NEIGH_UPDATE_F_ISROUTER; + if (ndm->ndm_flags & NTF_USE) { neigh_event_send(neigh, NULL); err = 0; @@ -2162,15 +2182,47 @@ errout: return err; } +static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct ndtmsg *ndtm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); + return -EINVAL; + } + + ndtm = nlmsg_data(nlh); + if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ndtm))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in neighbor table dump request"); + return -EINVAL; + } + + return 0; +} + static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); int family, tidx, nidx = 0; int tbl_skip = cb->args[0]; int neigh_skip = cb->args[1]; struct neigh_table *tbl; - family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; + if (cb->strict_check) { + int err = neightbl_valid_dump_info(nlh, cb->extack); + + if (err < 0) + return err; + } + + family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; @@ -2183,7 +2235,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) continue; if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL, + nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) break; @@ -2198,7 +2250,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (neightbl_fill_param_info(skb, tbl, p, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) goto out; @@ -2312,7 +2364,7 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) if (!master_idx) return false; - master = netdev_master_upper_dev_get(dev); + master = dev ? netdev_master_upper_dev_get(dev) : NULL; if (!master || master->ifindex != master_idx) return true; @@ -2321,41 +2373,30 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) { - if (filter_idx && dev->ifindex != filter_idx) + if (filter_idx && (!dev || dev->ifindex != filter_idx)) return true; return false; } +struct neigh_dump_filter { + int master_idx; + int dev_idx; +}; + static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, - struct netlink_callback *cb) + struct netlink_callback *cb, + struct neigh_dump_filter *filter) { struct net *net = sock_net(skb->sk); - const struct nlmsghdr *nlh = cb->nlh; - struct nlattr *tb[NDA_MAX + 1]; struct neighbour *n; int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; - int filter_master_idx = 0, filter_idx = 0; unsigned int flags = NLM_F_MULTI; - int err; - err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL); - if (!err) { - if (tb[NDA_IFINDEX]) { - if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) - return -EINVAL; - filter_idx = nla_get_u32(tb[NDA_IFINDEX]); - } - if (tb[NDA_MASTER]) { - if (nla_len(tb[NDA_MASTER]) != sizeof(u32)) - return -EINVAL; - filter_master_idx = nla_get_u32(tb[NDA_MASTER]); - } - if (filter_idx || filter_master_idx) - flags |= NLM_F_DUMP_FILTERED; - } + if (filter->dev_idx || filter->master_idx) + flags |= NLM_F_DUMP_FILTERED; rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); @@ -2368,8 +2409,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, n = rcu_dereference_bh(n->next)) { if (idx < s_idx || !net_eq(dev_net(n->dev), net)) goto next; - if (neigh_ifindex_filtered(n->dev, filter_idx) || - neigh_master_filtered(n->dev, filter_master_idx)) + if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || + neigh_master_filtered(n->dev, filter->master_idx)) goto next; if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -2391,12 +2432,17 @@ out: } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, - struct netlink_callback *cb) + struct netlink_callback *cb, + struct neigh_dump_filter *filter) { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); int rc, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; + unsigned int flags = NLM_F_MULTI; + + if (filter->dev_idx || filter->master_idx) + flags |= NLM_F_DUMP_FILTERED; read_lock_bh(&tbl->lock); @@ -2406,10 +2452,12 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { if (idx < s_idx || pneigh_net(n) != net) goto next; + if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || + neigh_master_filtered(n->dev, filter->master_idx)) + goto next; if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI, tbl) < 0) { + RTM_NEWNEIGH, flags, tbl) < 0) { read_unlock_bh(&tbl->lock); rc = -1; goto out; @@ -2428,22 +2476,91 @@ out: } +static int neigh_valid_dump_req(const struct nlmsghdr *nlh, + bool strict_check, + struct neigh_dump_filter *filter, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[NDA_MAX + 1]; + int err, i; + + if (strict_check) { + struct ndmsg *ndm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || + ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + NULL, extack); + } else { + err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + NULL, extack); + } + if (err < 0) + return err; + + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + /* all new attributes should require strict_check */ + switch (i) { + case NDA_IFINDEX: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in neighbor dump request"); + return -EINVAL; + } + filter->dev_idx = nla_get_u32(tb[i]); + break; + case NDA_MASTER: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in neighbor dump request"); + return -EINVAL; + } + filter->master_idx = nla_get_u32(tb[i]); + break; + default: + if (strict_check) { + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request"); + return -EINVAL; + } + } + } + + return 0; +} + static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; + struct neigh_dump_filter filter = {}; struct neigh_table *tbl; int t, family, s_t; int proxy = 0; int err; - family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; + family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; /* check for full ndmsg structure presence, family member is * the same for both structures */ - if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) && - ((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY) + if (nlmsg_len(nlh) >= sizeof(struct ndmsg) && + ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY) proxy = 1; + err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack); + if (err < 0 && cb->strict_check) + return err; + s_t = cb->args[0]; for (t = 0; t < NEIGH_NR_TABLES; t++) { @@ -2457,9 +2574,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (proxy) - err = pneigh_dump_table(tbl, skb, cb); + err = pneigh_dump_table(tbl, skb, cb, &filter); else - err = neigh_dump_table(tbl, skb, cb); + err = neigh_dump_table(tbl, skb, cb, &filter); if (err < 0) break; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 670c84b1bfc2..fefe72774aeb 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -853,6 +853,12 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) .s_idx = cb->args[0], }; + if (cb->strict_check && + nlmsg_attrlen(cb->nlh, sizeof(struct rtgenmsg))) { + NL_SET_ERR_MSG(cb->extack, "Unknown data in network namespace id dump request"); + return -EINVAL; + } + spin_lock_bh(&net->nsid_lock); idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); spin_unlock_bh(&net->nsid_lock); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 5e4f04004a49..7bf833598615 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -106,6 +106,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, iterate_fd(p->files, 0, update_classid_sock, (void *)(unsigned long)cs->classid); task_unlock(p); + cond_resched(); } css_task_iter_end(&it); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 3219a2932463..2b9fdbc43205 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -57,7 +57,6 @@ DEFINE_STATIC_SRCU(netpoll_srcu); MAX_UDP_CHUNK) static void zap_completion_queue(void); -static void netpoll_async_cleanup(struct work_struct *work); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); @@ -135,27 +134,9 @@ static void queue_process(struct work_struct *work) } } -/* - * Check whether delayed processing was scheduled for our NIC. If so, - * we attempt to grab the poll lock and use ->poll() to pump the card. - * If this fails, either we've recursed in ->poll() or it's already - * running on another CPU. - * - * Note: we don't mask interrupts with this lock because we're using - * trylock here and interrupts are already disabled in the softirq - * case. Further, we test the poll_owner to avoid recursion on UP - * systems where the lock doesn't exist. - */ static void poll_one_napi(struct napi_struct *napi) { - int work = 0; - - /* net_rx_action's ->poll() invocations and our's are - * synchronized by this test which is only made while - * holding the napi->poll_lock. - */ - if (!test_bit(NAPI_STATE_SCHED, &napi->state)) - return; + int work; /* If we set this bit but see that it has already been set, * that indicates that napi has been disabled and we need @@ -607,7 +588,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) np->dev = ndev; strlcpy(np->dev_name, ndev->name, IFNAMSIZ); - INIT_WORK(&np->cleanup_work, netpoll_async_cleanup); if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", @@ -737,7 +717,8 @@ int netpoll_setup(struct netpoll *np) read_lock_bh(&idev->lock); list_for_each_entry(ifp, &idev->addr_list, if_list) { - if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) + if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != + !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) continue; np->local_ip.in6 = ifp->addr; err = 0; @@ -806,10 +787,6 @@ void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; - /* rtnl_dereference would be preferable here but - * rcu_cleanup_netpoll path can put us in here safely without - * holding the rtnl, so plain rcu_dereference it is - */ npinfo = rtnl_dereference(np->dev->npinfo); if (!npinfo) return; @@ -830,21 +807,16 @@ void __netpoll_cleanup(struct netpoll *np) } EXPORT_SYMBOL_GPL(__netpoll_cleanup); -static void netpoll_async_cleanup(struct work_struct *work) +void __netpoll_free(struct netpoll *np) { - struct netpoll *np = container_of(work, struct netpoll, cleanup_work); + ASSERT_RTNL(); - rtnl_lock(); + /* Wait for transmitting packets to finish before freeing. */ + synchronize_rcu_bh(); __netpoll_cleanup(np); - rtnl_unlock(); kfree(np); } - -void __netpoll_free_async(struct netpoll *np) -{ - schedule_work(&np->cleanup_work); -} -EXPORT_SYMBOL_GPL(__netpoll_free_async); +EXPORT_SYMBOL_GPL(__netpoll_free); void netpoll_cleanup(struct netpoll *np) { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 7f6938405fa1..6ac919847ce6 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3426,7 +3426,7 @@ xmit_more: net_info_ratelimited("%s xmit error: %d\n", pkt_dev->odevname, ret); pkt_dev->errors++; - /* fallthru */ + /* fall through */ case NETDEV_TX_BUSY: /* Retry it next time */ refcount_dec(&(pkt_dev->skb->users)); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 63ce2283a456..33d9227a8b80 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -59,7 +59,7 @@ #include #include -#define RTNL_MAX_TYPE 48 +#define RTNL_MAX_TYPE 49 #define RTNL_SLAVE_MAX_TYPE 36 struct rtnl_link { @@ -130,6 +130,12 @@ int rtnl_is_locked(void) } EXPORT_SYMBOL(rtnl_is_locked); +bool refcount_dec_and_rtnl_lock(refcount_t *r) +{ + return refcount_dec_and_mutex_lock(r, &rtnl_mutex); +} +EXPORT_SYMBOL(refcount_dec_and_rtnl_lock); + #ifdef CONFIG_PROVE_LOCKING bool lockdep_rtnl_is_held(void) { @@ -1016,7 +1022,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(4) /* IFLA_NEW_IFINDEX */ + nla_total_size(1) /* IFLA_PROTO_DOWN */ - + nla_total_size(4) /* IFLA_IF_NETNSID */ + + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + nla_total_size(4) /* IFLA_MIN_MTU */ @@ -1598,7 +1604,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = change; - if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_IF_NETNSID, tgt_netnsid)) + if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; if (nla_put_string(skb, IFLA_IFNAME, dev->name) || @@ -1737,7 +1743,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_XDP] = { .type = NLA_NESTED }, [IFLA_EVENT] = { .type = NLA_U32 }, [IFLA_GROUP] = { .type = NLA_U32 }, - [IFLA_IF_NETNSID] = { .type = NLA_S32 }, + [IFLA_TARGET_NETNSID] = { .type = NLA_S32 }, [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, [IFLA_MIN_MTU] = { .type = NLA_U32 }, @@ -1845,7 +1851,15 @@ static bool link_dump_filtered(struct net_device *dev, return false; } -static struct net *get_target_net(struct sock *sk, int netnsid) +/** + * rtnl_get_net_ns_capable - Get netns if sufficiently privileged. + * @sk: netlink socket + * @netnsid: network namespace identifier + * + * Returns the network namespace identified by netnsid on success or an error + * pointer on failure. + */ +struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid) { struct net *net; @@ -1862,9 +1876,54 @@ static struct net *get_target_net(struct sock *sk, int netnsid) } return net; } +EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable); + +static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, + bool strict_check, struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + int hdrlen; + + if (strict_check) { + struct ifinfomsg *ifm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "Invalid header for link dump"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change) { + NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); + return -EINVAL; + } + if (ifm->ifi_index) { + NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps"); + return -EINVAL; + } + + return nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); + } + + /* A hack to preserve kernel<->userspace interface. + * The correct header is ifinfomsg. It is consistent with rtnl_getlink. + * However, before Linux v3.9 the code here assumed rtgenmsg and that's + * what iproute2 < v3.9.0 used. + * We can detect the old iproute2. Even including the IFLA_EXT_MASK + * attribute, its netlink message is shorter than struct ifinfomsg. + */ + hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? + sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); + + return nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, extack); +} static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { + struct netlink_ext_ack *extack = cb->extack; + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net *tgt_net = net; int h, s_h; @@ -1877,46 +1936,54 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) unsigned int flags = NLM_F_MULTI; int master_idx = 0; int netnsid = -1; - int err; - int hdrlen; + int err, i; s_h = cb->args[0]; s_idx = cb->args[1]; - /* A hack to preserve kernel<->userspace interface. - * The correct header is ifinfomsg. It is consistent with rtnl_getlink. - * However, before Linux v3.9 the code here assumed rtgenmsg and that's - * what iproute2 < v3.9.0 used. - * We can detect the old iproute2. Even including the IFLA_EXT_MASK - * attribute, its netlink message is shorter than struct ifinfomsg. - */ - hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ? - sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); + err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); + if (err < 0) { + if (cb->strict_check) + return err; + + goto walk_entries; + } + + for (i = 0; i <= IFLA_MAX; ++i) { + if (!tb[i]) + continue; - if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, - ifla_policy, NULL) >= 0) { - if (tb[IFLA_IF_NETNSID]) { - netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); - tgt_net = get_target_net(skb->sk, netnsid); + /* new attributes should only be added with strict checking */ + switch (i) { + case IFLA_TARGET_NETNSID: + netnsid = nla_get_s32(tb[i]); + tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); if (IS_ERR(tgt_net)) { - tgt_net = net; - netnsid = -1; + NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); + return PTR_ERR(tgt_net); + } + break; + case IFLA_EXT_MASK: + ext_filter_mask = nla_get_u32(tb[i]); + break; + case IFLA_MASTER: + master_idx = nla_get_u32(tb[i]); + break; + case IFLA_LINKINFO: + kind_ops = linkinfo_to_kind_ops(tb[i]); + break; + default: + if (cb->strict_check) { + NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request"); + return -EINVAL; } } - - if (tb[IFLA_EXT_MASK]) - ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); - - if (tb[IFLA_MASTER]) - master_idx = nla_get_u32(tb[IFLA_MASTER]); - - if (tb[IFLA_LINKINFO]) - kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]); - - if (master_idx || kind_ops) - flags |= NLM_F_DUMP_FILTERED; } + if (master_idx || kind_ops) + flags |= NLM_F_DUMP_FILTERED; + +walk_entries: for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &tgt_net->dev_index_head[h]; @@ -1928,8 +1995,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - flags, + nlh->nlmsg_seq, 0, flags, ext_filter_mask, 0, NULL, 0, netnsid); @@ -1984,7 +2050,7 @@ EXPORT_SYMBOL(rtnl_link_get_net); * * 1. IFLA_NET_NS_PID * 2. IFLA_NET_NS_FD - * 3. IFLA_IF_NETNSID + * 3. IFLA_TARGET_NETNSID */ static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, struct nlattr *tb[]) @@ -1994,10 +2060,10 @@ static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) return rtnl_link_get_net(src_net, tb); - if (!tb[IFLA_IF_NETNSID]) + if (!tb[IFLA_TARGET_NETNSID]) return get_net(src_net); - net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_IF_NETNSID])); + net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID])); if (!net) return ERR_PTR(-EINVAL); @@ -2038,13 +2104,13 @@ static int rtnl_ensure_unique_netns(struct nlattr *tb[], return -EOPNOTSUPP; } - if (tb[IFLA_IF_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])) + if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; - if (tb[IFLA_NET_NS_PID] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_FD])) + if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; - if (tb[IFLA_NET_NS_FD] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_PID])) + if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID])) goto invalid_attr; return 0; @@ -2320,7 +2386,7 @@ static int do_setlink(const struct sk_buff *skb, if (err < 0) return err; - if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) { + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev), tb, CAP_NET_ADMIN); if (IS_ERR(net)) { @@ -2763,9 +2829,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - if (tb[IFLA_IF_NETNSID]) { - netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); - tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid); + if (tb[IFLA_TARGET_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); + tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } @@ -2837,6 +2903,12 @@ struct net_device *rtnl_create_link(struct net *net, else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); + if (num_tx_queues < 1 || num_tx_queues > 4096) + return ERR_PTR(-EINVAL); + + if (num_rx_queues < 1 || num_rx_queues > 4096) + return ERR_PTR(-EINVAL); + dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); if (!dev) @@ -3173,9 +3245,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - if (tb[IFLA_IF_NETNSID]) { - netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); - tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid); + if (tb[IFLA_TARGET_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); + tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } @@ -3260,13 +3332,14 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) { int idx; int s_idx = cb->family; + int type = cb->nlh->nlmsg_type - RTM_BASE; + int ret = 0; if (s_idx == 0) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { struct rtnl_link **tab; - int type = cb->nlh->nlmsg_type-RTM_BASE; struct rtnl_link *link; rtnl_dumpit_func dumpit; @@ -3293,12 +3366,13 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) cb->prev_seq = 0; cb->seq = 0; } - if (dumpit(skb, cb)) + ret = dumpit(skb, cb); + if (ret) break; } cb->family = idx; - return skb->len; + return skb->len ? : ret; } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, @@ -3526,6 +3600,11 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } + if (dev->type != ARPHRD_ETHER) { + NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices"); + return -EINVAL; + } + addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); @@ -3630,6 +3709,11 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } + if (dev->type != ARPHRD_ETHER) { + NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices"); + return -EINVAL; + } + addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); @@ -3727,14 +3811,100 @@ out: } EXPORT_SYMBOL(ndo_dflt_fdb_dump); +static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, + int *br_idx, int *brport_idx, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[NDA_MAX + 1]; + struct ndmsg *ndm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_flags || ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for fbd dump request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + NULL, extack); + if (err < 0) + return err; + + *brport_idx = ndm->ndm_ifindex; + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_IFINDEX: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request"); + return -EINVAL; + } + *brport_idx = nla_get_u32(tb[NDA_IFINDEX]); + break; + case NDA_MASTER: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request"); + return -EINVAL; + } + *br_idx = nla_get_u32(tb[NDA_MASTER]); + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request"); + return -EINVAL; + } + } + + return 0; +} + +static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, + int *br_idx, int *brport_idx, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MAX+1]; + int err; + + /* A hack to preserve kernel<->userspace interface. + * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0. + * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails. + * So, check for ndmsg with an optional u32 attribute (not used here). + * Fortunately these sizes don't conflict with the size of ifinfomsg + * with an optional attribute. + */ + if (nlmsg_len(nlh) != sizeof(struct ndmsg) && + (nlmsg_len(nlh) != sizeof(struct ndmsg) + + nla_attr_size(sizeof(u32)))) { + struct ifinfomsg *ifm; + + err = nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, + ifla_policy, extack); + if (err < 0) { + return -EINVAL; + } else if (err == 0) { + if (tb[IFLA_MASTER]) + *br_idx = nla_get_u32(tb[IFLA_MASTER]); + } + + ifm = nlmsg_data(nlh); + *brport_idx = ifm->ifi_index; + } + return 0; +} + static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net_device *dev; - struct nlattr *tb[IFLA_MAX+1]; struct net_device *br_dev = NULL; const struct net_device_ops *ops = NULL; const struct net_device_ops *cops = NULL; - struct ifinfomsg *ifm = nlmsg_data(cb->nlh); struct net *net = sock_net(skb->sk); struct hlist_head *head; int brport_idx = 0; @@ -3744,16 +3914,14 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) int err = 0; int fidx = 0; - err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, - IFLA_MAX, ifla_policy, NULL); - if (err < 0) { - return -EINVAL; - } else if (err == 0) { - if (tb[IFLA_MASTER]) - br_idx = nla_get_u32(tb[IFLA_MASTER]); - } - - brport_idx = ifm->ifi_index; + if (cb->strict_check) + err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, + cb->extack); + else + err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx, + cb->extack); + if (err < 0) + return err; if (br_idx) { br_dev = __dev_get_by_index(net, br_idx); @@ -3938,28 +4106,72 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); +static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, + bool strict_check, u32 *filter_mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MAX+1]; + int err, i; + + if (strict_check) { + struct ifinfomsg *ifm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change || ifm->ifi_index) { + NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, extack); + } else { + err = nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, extack); + } + if (err < 0) + return err; + + /* new attributes should only be added with strict checking */ + for (i = 0; i <= IFLA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case IFLA_EXT_MASK: + *filter_mask = nla_get_u32(tb[i]); + break; + default: + if (strict_check) { + NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request"); + return -EINVAL; + } + } + } + + return 0; +} + static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net_device *dev; int idx = 0; u32 portid = NETLINK_CB(cb->skb).portid; - u32 seq = cb->nlh->nlmsg_seq; + u32 seq = nlh->nlmsg_seq; u32 filter_mask = 0; int err; - if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) { - struct nlattr *extfilt; - - extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg), - IFLA_EXT_MASK); - if (extfilt) { - if (nla_len(extfilt) < sizeof(filter_mask)) - return -EINVAL; - - filter_mask = nla_get_u32(extfilt); - } - } + err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask, + cb->extack); + if (err < 0 && cb->strict_check) + return err; rcu_read_lock(); for_each_netdev_rcu(net, dev) { @@ -4553,6 +4765,7 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct netlink_ext_ack *extack = cb->extack; int h, s_h, err, s_idx, s_idxattr, s_prividx; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; @@ -4569,13 +4782,32 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->seq = net->dev_base_seq; - if (nlmsg_len(cb->nlh) < sizeof(*ifsm)) + if (nlmsg_len(cb->nlh) < sizeof(*ifsm)) { + NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; + } ifsm = nlmsg_data(cb->nlh); + + /* only requests using strict checks can pass data to influence + * the dump. The legacy exception is filter_mask. + */ + if (cb->strict_check) { + if (ifsm->pad1 || ifsm->pad2 || ifsm->ifindex) { + NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); + return -EINVAL; + } + if (nlmsg_attrlen(cb->nlh, sizeof(*ifsm))) { + NL_SET_ERR_MSG(extack, "Invalid attributes after stats header"); + return -EINVAL; + } + } + filter_mask = ifsm->filter_mask; - if (!filter_mask) + if (!filter_mask) { + NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); return -EINVAL; + } for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b2c807f67aba..b4ee5c8b928f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1846,8 +1846,9 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) if (skb->ip_summed == CHECKSUM_COMPLETE) { int delta = skb->len - len; - skb->csum = csum_sub(skb->csum, - skb_checksum(skb, len, delta, 0)); + skb->csum = csum_block_sub(skb->csum, + skb_checksum(skb, len, delta, 0), + len); } return __pskb_trim(skb, len); } @@ -3381,64 +3382,6 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, } EXPORT_SYMBOL(skb_find_text); -/** - * skb_append_datato_frags - append the user data to a skb - * @sk: sock structure - * @skb: skb structure to be appended with user data. - * @getfrag: call back function to be used for getting the user data - * @from: pointer to user message iov - * @length: length of the iov message - * - * Description: This procedure append the user data in the fragment part - * of the skb if any page alloc fails user this procedure returns -ENOMEM - */ -int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, - int (*getfrag)(void *from, char *to, int offset, - int len, int odd, struct sk_buff *skb), - void *from, int length) -{ - int frg_cnt = skb_shinfo(skb)->nr_frags; - int copy; - int offset = 0; - int ret; - struct page_frag *pfrag = ¤t->task_frag; - - do { - /* Return error if we don't have space for new frag */ - if (frg_cnt >= MAX_SKB_FRAGS) - return -EMSGSIZE; - - if (!sk_page_frag_refill(sk, pfrag)) - return -ENOMEM; - - /* copy the user data to page */ - copy = min_t(int, length, pfrag->size - pfrag->offset); - - ret = getfrag(from, page_address(pfrag->page) + pfrag->offset, - offset, copy, 0, skb); - if (ret < 0) - return -EFAULT; - - /* copy was successful so update the size parameters */ - skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset, - copy); - frg_cnt++; - pfrag->offset += copy; - get_page(pfrag->page); - - skb->truesize += copy; - refcount_add(copy, &sk->sk_wmem_alloc); - skb->len += copy; - skb->data_len += copy; - offset += copy; - length -= copy; - - } while (length > 0); - - return 0; -} -EXPORT_SYMBOL(skb_append_datato_frags); - int skb_append_pagefrags(struct sk_buff *skb, struct page *page, int offset, size_t size) { @@ -4452,14 +4395,16 @@ EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); */ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) { - if (unlikely(start > skb_headlen(skb)) || - unlikely((int)start + off > skb_headlen(skb) - 2)) { - net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n", - start, off, skb_headlen(skb)); + u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); + u32 csum_start = skb_headroom(skb) + (u32)start; + + if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { + net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", + start, off, skb_headroom(skb), skb_headlen(skb)); return false; } skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + start; + skb->csum_start = csum_start; skb->csum_offset = off; skb_set_transport_header(skb, start); return true; @@ -4999,6 +4944,8 @@ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) * * This is a helper to do that correctly considering GSO_BY_FRAGS. * + * @skb: GSO skb + * * @seg_len: The segmented length (from skb_gso_*_seglen). In the * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. * diff --git a/net/core/skmsg.c b/net/core/skmsg.c new file mode 100644 index 000000000000..56a99d0c9aa0 --- /dev/null +++ b/net/core/skmsg.c @@ -0,0 +1,802 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#include +#include +#include + +#include +#include + +static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) +{ + if (msg->sg.end > msg->sg.start && + elem_first_coalesce < msg->sg.end) + return true; + + if (msg->sg.end < msg->sg.start && + (elem_first_coalesce > msg->sg.start || + elem_first_coalesce < msg->sg.end)) + return true; + + return false; +} + +int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, + int elem_first_coalesce) +{ + struct page_frag *pfrag = sk_page_frag(sk); + int ret = 0; + + len -= msg->sg.size; + while (len > 0) { + struct scatterlist *sge; + u32 orig_offset; + int use, i; + + if (!sk_page_frag_refill(sk, pfrag)) + return -ENOMEM; + + orig_offset = pfrag->offset; + use = min_t(int, len, pfrag->size - orig_offset); + if (!sk_wmem_schedule(sk, use)) + return -ENOMEM; + + i = msg->sg.end; + sk_msg_iter_var_prev(i); + sge = &msg->sg.data[i]; + + if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) && + sg_page(sge) == pfrag->page && + sge->offset + sge->length == orig_offset) { + sge->length += use; + } else { + if (sk_msg_full(msg)) { + ret = -ENOSPC; + break; + } + + sge = &msg->sg.data[msg->sg.end]; + sg_unmark_end(sge); + sg_set_page(sge, pfrag->page, use, orig_offset); + get_page(pfrag->page); + sk_msg_iter_next(msg, end); + } + + sk_mem_charge(sk, use); + msg->sg.size += use; + pfrag->offset += use; + len -= use; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_alloc); + +int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, + u32 off, u32 len) +{ + int i = src->sg.start; + struct scatterlist *sge = sk_msg_elem(src, i); + u32 sge_len, sge_off; + + if (sk_msg_full(dst)) + return -ENOSPC; + + while (off) { + if (sge->length > off) + break; + off -= sge->length; + sk_msg_iter_var_next(i); + if (i == src->sg.end && off) + return -ENOSPC; + sge = sk_msg_elem(src, i); + } + + while (len) { + sge_len = sge->length - off; + sge_off = sge->offset + off; + if (sge_len > len) + sge_len = len; + off = 0; + len -= sge_len; + sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off); + sk_mem_charge(sk, sge_len); + sk_msg_iter_var_next(i); + if (i == src->sg.end && len) + return -ENOSPC; + sge = sk_msg_elem(src, i); + } + + return 0; +} +EXPORT_SYMBOL_GPL(sk_msg_clone); + +void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes) +{ + int i = msg->sg.start; + + do { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (bytes < sge->length) { + sge->length -= bytes; + sge->offset += bytes; + sk_mem_uncharge(sk, bytes); + break; + } + + sk_mem_uncharge(sk, sge->length); + bytes -= sge->length; + sge->length = 0; + sge->offset = 0; + sk_msg_iter_var_next(i); + } while (bytes && i != msg->sg.end); + msg->sg.start = i; +} +EXPORT_SYMBOL_GPL(sk_msg_return_zero); + +void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes) +{ + int i = msg->sg.start; + + do { + struct scatterlist *sge = &msg->sg.data[i]; + int uncharge = (bytes < sge->length) ? bytes : sge->length; + + sk_mem_uncharge(sk, uncharge); + bytes -= uncharge; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); +} +EXPORT_SYMBOL_GPL(sk_msg_return); + +static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i, + bool charge) +{ + struct scatterlist *sge = sk_msg_elem(msg, i); + u32 len = sge->length; + + if (charge) + sk_mem_uncharge(sk, len); + if (!msg->skb) + put_page(sg_page(sge)); + memset(sge, 0, sizeof(*sge)); + return len; +} + +static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i, + bool charge) +{ + struct scatterlist *sge = sk_msg_elem(msg, i); + int freed = 0; + + while (msg->sg.size) { + msg->sg.size -= sge->length; + freed += sk_msg_free_elem(sk, msg, i, charge); + sk_msg_iter_var_next(i); + sk_msg_check_to_free(msg, i, msg->sg.size); + sge = sk_msg_elem(msg, i); + } + if (msg->skb) + consume_skb(msg->skb); + sk_msg_init(msg); + return freed; +} + +int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg) +{ + return __sk_msg_free(sk, msg, msg->sg.start, false); +} +EXPORT_SYMBOL_GPL(sk_msg_free_nocharge); + +int sk_msg_free(struct sock *sk, struct sk_msg *msg) +{ + return __sk_msg_free(sk, msg, msg->sg.start, true); +} +EXPORT_SYMBOL_GPL(sk_msg_free); + +static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, + u32 bytes, bool charge) +{ + struct scatterlist *sge; + u32 i = msg->sg.start; + + while (bytes) { + sge = sk_msg_elem(msg, i); + if (!sge->length) + break; + if (bytes < sge->length) { + if (charge) + sk_mem_uncharge(sk, bytes); + sge->length -= bytes; + sge->offset += bytes; + msg->sg.size -= bytes; + break; + } + + msg->sg.size -= sge->length; + bytes -= sge->length; + sk_msg_free_elem(sk, msg, i, charge); + sk_msg_iter_var_next(i); + sk_msg_check_to_free(msg, i, bytes); + } + msg->sg.start = i; +} + +void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes) +{ + __sk_msg_free_partial(sk, msg, bytes, true); +} +EXPORT_SYMBOL_GPL(sk_msg_free_partial); + +void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, + u32 bytes) +{ + __sk_msg_free_partial(sk, msg, bytes, false); +} + +void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len) +{ + int trim = msg->sg.size - len; + u32 i = msg->sg.end; + + if (trim <= 0) { + WARN_ON(trim < 0); + return; + } + + sk_msg_iter_var_prev(i); + msg->sg.size = len; + while (msg->sg.data[i].length && + trim >= msg->sg.data[i].length) { + trim -= msg->sg.data[i].length; + sk_msg_free_elem(sk, msg, i, true); + sk_msg_iter_var_prev(i); + if (!trim) + goto out; + } + + msg->sg.data[i].length -= trim; + sk_mem_uncharge(sk, trim); +out: + /* If we trim data before curr pointer update copybreak and current + * so that any future copy operations start at new copy location. + * However trimed data that has not yet been used in a copy op + * does not require an update. + */ + if (msg->sg.curr >= i) { + msg->sg.curr = i; + msg->sg.copybreak = msg->sg.data[i].length; + } + sk_msg_iter_var_next(i); + msg->sg.end = i; +} +EXPORT_SYMBOL_GPL(sk_msg_trim); + +int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes) +{ + int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg); + const int to_max_pages = MAX_MSG_FRAGS; + struct page *pages[MAX_MSG_FRAGS]; + ssize_t orig, copied, use, offset; + + orig = msg->sg.size; + while (bytes > 0) { + i = 0; + maxpages = to_max_pages - num_elems; + if (maxpages == 0) { + ret = -EFAULT; + goto out; + } + + copied = iov_iter_get_pages(from, pages, bytes, maxpages, + &offset); + if (copied <= 0) { + ret = -EFAULT; + goto out; + } + + iov_iter_advance(from, copied); + bytes -= copied; + msg->sg.size += copied; + + while (copied) { + use = min_t(int, copied, PAGE_SIZE - offset); + sg_set_page(&msg->sg.data[msg->sg.end], + pages[i], use, offset); + sg_unmark_end(&msg->sg.data[msg->sg.end]); + sk_mem_charge(sk, use); + + offset = 0; + copied -= use; + sk_msg_iter_next(msg, end); + num_elems++; + i++; + } + /* When zerocopy is mixed with sk_msg_*copy* operations we + * may have a copybreak set in this case clear and prefer + * zerocopy remainder when possible. + */ + msg->sg.copybreak = 0; + msg->sg.curr = msg->sg.end; + } +out: + /* Revert iov_iter updates, msg will need to use 'trim' later if it + * also needs to be cleared. + */ + if (ret) + iov_iter_revert(from, msg->sg.size - orig); + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter); + +int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes) +{ + int ret = -ENOSPC, i = msg->sg.curr; + struct scatterlist *sge; + u32 copy, buf_size; + void *to; + + do { + sge = sk_msg_elem(msg, i); + /* This is possible if a trim operation shrunk the buffer */ + if (msg->sg.copybreak >= sge->length) { + msg->sg.copybreak = 0; + sk_msg_iter_var_next(i); + if (i == msg->sg.end) + break; + sge = sk_msg_elem(msg, i); + } + + buf_size = sge->length - msg->sg.copybreak; + copy = (buf_size > bytes) ? bytes : buf_size; + to = sg_virt(sge) + msg->sg.copybreak; + msg->sg.copybreak += copy; + if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) + ret = copy_from_iter_nocache(to, copy, from); + else + ret = copy_from_iter(to, copy, from); + if (ret != copy) { + ret = -EFAULT; + goto out; + } + bytes -= copy; + if (!bytes) + break; + msg->sg.copybreak = 0; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); +out: + msg->sg.curr = i; + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); + +static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) +{ + struct sock *sk = psock->sk; + int copied = 0, num_sge; + struct sk_msg *msg; + + msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); + if (unlikely(!msg)) + return -EAGAIN; + if (!sk_rmem_schedule(sk, skb, skb->len)) { + kfree(msg); + return -EAGAIN; + } + + sk_msg_init(msg); + num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len); + if (unlikely(num_sge < 0)) { + kfree(msg); + return num_sge; + } + + sk_mem_charge(sk, skb->len); + copied = skb->len; + msg->sg.start = 0; + msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge; + msg->skb = skb; + + sk_psock_queue_msg(psock, msg); + sk->sk_data_ready(sk); + return copied; +} + +static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, + u32 off, u32 len, bool ingress) +{ + if (ingress) + return sk_psock_skb_ingress(psock, skb); + else + return skb_send_sock_locked(psock->sk, skb, off, len); +} + +static void sk_psock_backlog(struct work_struct *work) +{ + struct sk_psock *psock = container_of(work, struct sk_psock, work); + struct sk_psock_work_state *state = &psock->work_state; + struct sk_buff *skb; + bool ingress; + u32 len, off; + int ret; + + /* Lock sock to avoid losing sk_socket during loop. */ + lock_sock(psock->sk); + if (state->skb) { + skb = state->skb; + len = state->len; + off = state->off; + state->skb = NULL; + goto start; + } + + while ((skb = skb_dequeue(&psock->ingress_skb))) { + len = skb->len; + off = 0; +start: + ingress = tcp_skb_bpf_ingress(skb); + do { + ret = -EIO; + if (likely(psock->sk->sk_socket)) + ret = sk_psock_handle_skb(psock, skb, off, + len, ingress); + if (ret <= 0) { + if (ret == -EAGAIN) { + state->skb = skb; + state->len = len; + state->off = off; + goto end; + } + /* Hard errors break pipe and stop xmit. */ + sk_psock_report_error(psock, ret ? -ret : EPIPE); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + kfree_skb(skb); + goto end; + } + off += ret; + len -= ret; + } while (len); + + if (!ingress) + kfree_skb(skb); + } +end: + release_sock(psock->sk); +} + +struct sk_psock *sk_psock_init(struct sock *sk, int node) +{ + struct sk_psock *psock = kzalloc_node(sizeof(*psock), + GFP_ATOMIC | __GFP_NOWARN, + node); + if (!psock) + return NULL; + + psock->sk = sk; + psock->eval = __SK_NONE; + + INIT_LIST_HEAD(&psock->link); + spin_lock_init(&psock->link_lock); + + INIT_WORK(&psock->work, sk_psock_backlog); + INIT_LIST_HEAD(&psock->ingress_msg); + skb_queue_head_init(&psock->ingress_skb); + + sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); + refcount_set(&psock->refcnt, 1); + + rcu_assign_sk_user_data(sk, psock); + sock_hold(sk); + + return psock; +} +EXPORT_SYMBOL_GPL(sk_psock_init); + +struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) +{ + struct sk_psock_link *link; + + spin_lock_bh(&psock->link_lock); + link = list_first_entry_or_null(&psock->link, struct sk_psock_link, + list); + if (link) + list_del(&link->list); + spin_unlock_bh(&psock->link_lock); + return link; +} + +void __sk_psock_purge_ingress_msg(struct sk_psock *psock) +{ + struct sk_msg *msg, *tmp; + + list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { + list_del(&msg->list); + sk_msg_free(psock->sk, msg); + kfree(msg); + } +} + +static void sk_psock_zap_ingress(struct sk_psock *psock) +{ + __skb_queue_purge(&psock->ingress_skb); + __sk_psock_purge_ingress_msg(psock); +} + +static void sk_psock_link_destroy(struct sk_psock *psock) +{ + struct sk_psock_link *link, *tmp; + + list_for_each_entry_safe(link, tmp, &psock->link, list) { + list_del(&link->list); + sk_psock_free_link(link); + } +} + +static void sk_psock_destroy_deferred(struct work_struct *gc) +{ + struct sk_psock *psock = container_of(gc, struct sk_psock, gc); + + /* No sk_callback_lock since already detached. */ + if (psock->parser.enabled) + strp_done(&psock->parser.strp); + + cancel_work_sync(&psock->work); + + psock_progs_drop(&psock->progs); + + sk_psock_link_destroy(psock); + sk_psock_cork_free(psock); + sk_psock_zap_ingress(psock); + + if (psock->sk_redir) + sock_put(psock->sk_redir); + sock_put(psock->sk); + kfree(psock); +} + +void sk_psock_destroy(struct rcu_head *rcu) +{ + struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu); + + INIT_WORK(&psock->gc, sk_psock_destroy_deferred); + schedule_work(&psock->gc); +} +EXPORT_SYMBOL_GPL(sk_psock_destroy); + +void sk_psock_drop(struct sock *sk, struct sk_psock *psock) +{ + rcu_assign_sk_user_data(sk, NULL); + sk_psock_cork_free(psock); + sk_psock_restore_proto(sk, psock); + + write_lock_bh(&sk->sk_callback_lock); + if (psock->progs.skb_parser) + sk_psock_stop_strp(sk, psock); + write_unlock_bh(&sk->sk_callback_lock); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + + call_rcu_sched(&psock->rcu, sk_psock_destroy); +} +EXPORT_SYMBOL_GPL(sk_psock_drop); + +static int sk_psock_map_verd(int verdict, bool redir) +{ + switch (verdict) { + case SK_PASS: + return redir ? __SK_REDIRECT : __SK_PASS; + case SK_DROP: + default: + break; + } + + return __SK_DROP; +} + +int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, + struct sk_msg *msg) +{ + struct bpf_prog *prog; + int ret; + + preempt_disable(); + rcu_read_lock(); + prog = READ_ONCE(psock->progs.msg_parser); + if (unlikely(!prog)) { + ret = __SK_PASS; + goto out; + } + + sk_msg_compute_data_pointers(msg); + msg->sk = sk; + ret = BPF_PROG_RUN(prog, msg); + ret = sk_psock_map_verd(ret, msg->sk_redir); + psock->apply_bytes = msg->apply_bytes; + if (ret == __SK_REDIRECT) { + if (psock->sk_redir) + sock_put(psock->sk_redir); + psock->sk_redir = msg->sk_redir; + if (!psock->sk_redir) { + ret = __SK_DROP; + goto out; + } + sock_hold(psock->sk_redir); + } +out: + rcu_read_unlock(); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); + +static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, + struct sk_buff *skb) +{ + int ret; + + skb->sk = psock->sk; + bpf_compute_data_end_sk_skb(skb); + preempt_disable(); + ret = BPF_PROG_RUN(prog, skb); + preempt_enable(); + /* strparser clones the skb before handing it to a upper layer, + * meaning skb_orphan has been called. We NULL sk on the way out + * to ensure we don't trigger a BUG_ON() in skb/sk operations + * later and because we are not charging the memory of this skb + * to any socket yet. + */ + skb->sk = NULL; + return ret; +} + +static struct sk_psock *sk_psock_from_strp(struct strparser *strp) +{ + struct sk_psock_parser *parser; + + parser = container_of(strp, struct sk_psock_parser, strp); + return container_of(parser, struct sk_psock, parser); +} + +static void sk_psock_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + struct sk_psock *psock_other; + struct sock *sk_other; + bool ingress; + + switch (verdict) { + case __SK_REDIRECT: + sk_other = tcp_skb_bpf_redirect_fetch(skb); + if (unlikely(!sk_other)) + goto out_free; + psock_other = sk_psock(sk_other); + if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) + goto out_free; + ingress = tcp_skb_bpf_ingress(skb); + if ((!ingress && sock_writeable(sk_other)) || + (ingress && + atomic_read(&sk_other->sk_rmem_alloc) <= + sk_other->sk_rcvbuf)) { + if (!ingress) + skb_set_owner_w(skb, sk_other); + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_work(&psock_other->work); + break; + } + /* fall-through */ + case __SK_DROP: + /* fall-through */ + default: +out_free: + kfree_skb(skb); + } +} + +static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) +{ + struct sk_psock *psock = sk_psock_from_strp(strp); + struct bpf_prog *prog; + int ret = __SK_DROP; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + skb_orphan(skb); + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + rcu_read_unlock(); + sk_psock_verdict_apply(psock, skb, ret); +} + +static int sk_psock_strp_read_done(struct strparser *strp, int err) +{ + return err; +} + +static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) +{ + struct sk_psock *psock = sk_psock_from_strp(strp); + struct bpf_prog *prog; + int ret = skb->len; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.skb_parser); + if (likely(prog)) + ret = sk_psock_bpf_run(psock, prog, skb); + rcu_read_unlock(); + return ret; +} + +/* Called with socket lock held. */ +static void sk_psock_data_ready(struct sock *sk) +{ + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + write_lock_bh(&sk->sk_callback_lock); + strp_data_ready(&psock->parser.strp); + write_unlock_bh(&sk->sk_callback_lock); + } + rcu_read_unlock(); +} + +static void sk_psock_write_space(struct sock *sk) +{ + struct sk_psock *psock; + void (*write_space)(struct sock *sk); + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))) + schedule_work(&psock->work); + write_space = psock->saved_write_space; + rcu_read_unlock(); + write_space(sk); +} + +int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + static const struct strp_callbacks cb = { + .rcv_msg = sk_psock_strp_read, + .read_sock_done = sk_psock_strp_read_done, + .parse_msg = sk_psock_strp_parse, + }; + + psock->parser.enabled = false; + return strp_init(&psock->parser.strp, sk, &cb); +} + +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (parser->enabled) + return; + + parser->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_data_ready; + sk->sk_write_space = sk_psock_write_space; + parser->enabled = true; +} + +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (!parser->enabled) + return; + + sk->sk_data_ready = parser->saved_data_ready; + parser->saved_data_ready = NULL; + strp_stop(&parser->strp); + parser->enabled = false; +} diff --git a/net/core/sock.c b/net/core/sock.c index 3730eb855095..080a880a1761 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -998,7 +998,7 @@ set_rcvbuf: cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); - sk->sk_max_pacing_rate = val; + sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val; sk->sk_pacing_rate = min(sk->sk_pacing_rate, sk->sk_max_pacing_rate); break; @@ -1336,7 +1336,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname, #endif case SO_MAX_PACING_RATE: - v.val = sk->sk_max_pacing_rate; + /* 32bit version */ + v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); break; case SO_INCOMING_CPU: @@ -2238,67 +2239,6 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); -int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int sg_start, int *sg_curr_index, unsigned int *sg_curr_size, - int first_coalesce) -{ - int sg_curr = *sg_curr_index, use = 0, rc = 0; - unsigned int size = *sg_curr_size; - struct page_frag *pfrag; - struct scatterlist *sge; - - len -= size; - pfrag = sk_page_frag(sk); - - while (len > 0) { - unsigned int orig_offset; - - if (!sk_page_frag_refill(sk, pfrag)) { - rc = -ENOMEM; - goto out; - } - - use = min_t(int, len, pfrag->size - pfrag->offset); - - if (!sk_wmem_schedule(sk, use)) { - rc = -ENOMEM; - goto out; - } - - sk_mem_charge(sk, use); - size += use; - orig_offset = pfrag->offset; - pfrag->offset += use; - - sge = sg + sg_curr - 1; - if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page && - sge->offset + sge->length == orig_offset) { - sge->length += use; - } else { - sge = sg + sg_curr; - sg_unmark_end(sge); - sg_set_page(sge, pfrag->page, use, orig_offset); - get_page(pfrag->page); - sg_curr++; - - if (sg_curr == MAX_SKB_FRAGS) - sg_curr = 0; - - if (sg_curr == sg_start) { - rc = -ENOSPC; - break; - } - } - - len -= use; - } -out: - *sg_curr_size = size; - *sg_curr_index = sg_curr; - return rc; -} -EXPORT_SYMBOL(sk_alloc_sg); - static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) @@ -2317,7 +2257,7 @@ static void __lock_sock(struct sock *sk) finish_wait(&sk->sk_lock.wq, &wait); } -static void __release_sock(struct sock *sk) +void __release_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { @@ -2332,7 +2272,7 @@ static void __release_sock(struct sock *sk) next = skb->next; prefetch(next); WARN_ON_ONCE(skb_dst_is_noref(skb)); - skb->next = NULL; + skb_mark_not_on_list(skb); sk_backlog_rcv(sk, skb); cond_resched(); @@ -2810,8 +2750,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_ll_usec = sysctl_net_busy_read; #endif - sk->sk_max_pacing_rate = ~0U; - sk->sk_pacing_rate = ~0U; + sk->sk_max_pacing_rate = ~0UL; + sk->sk_pacing_rate = ~0UL; sk->sk_pacing_shift = 10; sk->sk_incoming_cpu = -1; @@ -3339,6 +3279,7 @@ int sock_load_diag_module(int family, int protocol) #ifdef CONFIG_INET if (family == AF_INET && + protocol != IPPROTO_RAW && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif diff --git a/net/core/sock_map.c b/net/core/sock_map.c new file mode 100644 index 000000000000..be6092ac69f8 --- /dev/null +++ b/net/core/sock_map.c @@ -0,0 +1,1003 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct bpf_stab { + struct bpf_map map; + struct sock **sks; + struct sk_psock_progs progs; + raw_spinlock_t lock; +}; + +#define SOCK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + +static struct bpf_map *sock_map_alloc(union bpf_attr *attr) +{ + struct bpf_stab *stab; + u64 cost; + int err; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->max_entries == 0 || + attr->key_size != 4 || + attr->value_size != 4 || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + stab = kzalloc(sizeof(*stab), GFP_USER); + if (!stab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&stab->map, attr); + raw_spin_lock_init(&stab->lock); + + /* Make sure page count doesn't overflow. */ + cost = (u64) stab->map.max_entries * sizeof(struct sock *); + if (cost >= U32_MAX - PAGE_SIZE) { + err = -EINVAL; + goto free_stab; + } + + stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + err = bpf_map_precharge_memlock(stab->map.pages); + if (err) + goto free_stab; + + stab->sks = bpf_map_area_alloc(stab->map.max_entries * + sizeof(struct sock *), + stab->map.numa_node); + if (stab->sks) + return &stab->map; + err = -ENOMEM; +free_stab: + kfree(stab); + return ERR_PTR(err); +} + +int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) +{ + u32 ufd = attr->target_fd; + struct bpf_map *map; + struct fd f; + int ret; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + ret = sock_map_prog_update(map, prog, attr->attach_type); + fdput(f); + return ret; +} + +static void sock_map_sk_acquire(struct sock *sk) + __acquires(&sk->sk_lock.slock) +{ + lock_sock(sk); + preempt_disable(); + rcu_read_lock(); +} + +static void sock_map_sk_release(struct sock *sk) + __releases(&sk->sk_lock.slock) +{ + rcu_read_unlock(); + preempt_enable(); + release_sock(sk); +} + +static void sock_map_add_link(struct sk_psock *psock, + struct sk_psock_link *link, + struct bpf_map *map, void *link_raw) +{ + link->link_raw = link_raw; + link->map = map; + spin_lock_bh(&psock->link_lock); + list_add_tail(&link->list, &psock->link); + spin_unlock_bh(&psock->link_lock); +} + +static void sock_map_del_link(struct sock *sk, + struct sk_psock *psock, void *link_raw) +{ + struct sk_psock_link *link, *tmp; + bool strp_stop = false; + + spin_lock_bh(&psock->link_lock); + list_for_each_entry_safe(link, tmp, &psock->link, list) { + if (link->link_raw == link_raw) { + struct bpf_map *map = link->map; + struct bpf_stab *stab = container_of(map, struct bpf_stab, + map); + if (psock->parser.enabled && stab->progs.skb_parser) + strp_stop = true; + list_del(&link->list); + sk_psock_free_link(link); + } + } + spin_unlock_bh(&psock->link_lock); + if (strp_stop) { + write_lock_bh(&sk->sk_callback_lock); + sk_psock_stop_strp(sk, psock); + write_unlock_bh(&sk->sk_callback_lock); + } +} + +static void sock_map_unref(struct sock *sk, void *link_raw) +{ + struct sk_psock *psock = sk_psock(sk); + + if (likely(psock)) { + sock_map_del_link(sk, psock, link_raw); + sk_psock_put(sk, psock); + } +} + +static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, + struct sock *sk) +{ + struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; + bool skb_progs, sk_psock_is_new = false; + struct sk_psock *psock; + int ret; + + skb_verdict = READ_ONCE(progs->skb_verdict); + skb_parser = READ_ONCE(progs->skb_parser); + skb_progs = skb_parser && skb_verdict; + if (skb_progs) { + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); + if (IS_ERR(skb_verdict)) + return PTR_ERR(skb_verdict); + skb_parser = bpf_prog_inc_not_zero(skb_parser); + if (IS_ERR(skb_parser)) { + bpf_prog_put(skb_verdict); + return PTR_ERR(skb_parser); + } + } + + msg_parser = READ_ONCE(progs->msg_parser); + if (msg_parser) { + msg_parser = bpf_prog_inc_not_zero(msg_parser); + if (IS_ERR(msg_parser)) { + ret = PTR_ERR(msg_parser); + goto out; + } + } + + psock = sk_psock_get_checked(sk); + if (IS_ERR(psock)) { + ret = PTR_ERR(psock); + goto out_progs; + } + + if (psock) { + if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || + (skb_progs && READ_ONCE(psock->progs.skb_parser))) { + sk_psock_put(sk, psock); + ret = -EBUSY; + goto out_progs; + } + } else { + psock = sk_psock_init(sk, map->numa_node); + if (!psock) { + ret = -ENOMEM; + goto out_progs; + } + sk_psock_is_new = true; + } + + if (msg_parser) + psock_set_prog(&psock->progs.msg_parser, msg_parser); + if (sk_psock_is_new) { + ret = tcp_bpf_init(sk); + if (ret < 0) + goto out_drop; + } else { + tcp_bpf_reinit(sk); + } + + write_lock_bh(&sk->sk_callback_lock); + if (skb_progs && !psock->parser.enabled) { + ret = sk_psock_init_strp(sk, psock); + if (ret) { + write_unlock_bh(&sk->sk_callback_lock); + goto out_drop; + } + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + psock_set_prog(&psock->progs.skb_parser, skb_parser); + sk_psock_start_strp(sk, psock); + } + write_unlock_bh(&sk->sk_callback_lock); + return 0; +out_drop: + sk_psock_put(sk, psock); +out_progs: + if (msg_parser) + bpf_prog_put(msg_parser); +out: + if (skb_progs) { + bpf_prog_put(skb_verdict); + bpf_prog_put(skb_parser); + } + return ret; +} + +static void sock_map_free(struct bpf_map *map) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + int i; + + synchronize_rcu(); + rcu_read_lock(); + raw_spin_lock_bh(&stab->lock); + for (i = 0; i < stab->map.max_entries; i++) { + struct sock **psk = &stab->sks[i]; + struct sock *sk; + + sk = xchg(psk, NULL); + if (sk) + sock_map_unref(sk, psk); + } + raw_spin_unlock_bh(&stab->lock); + rcu_read_unlock(); + + bpf_map_area_free(stab->sks); + kfree(stab); +} + +static void sock_map_release_progs(struct bpf_map *map) +{ + psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs); +} + +static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (unlikely(key >= map->max_entries)) + return NULL; + return READ_ONCE(stab->sks[key]); +} + +static void *sock_map_lookup(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, + struct sock **psk) +{ + struct sock *sk; + + raw_spin_lock_bh(&stab->lock); + sk = *psk; + if (!sk_test || sk_test == sk) + *psk = NULL; + raw_spin_unlock_bh(&stab->lock); + if (unlikely(!sk)) + return -EINVAL; + sock_map_unref(sk, psk); + return 0; +} + +static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, + void *link_raw) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + __sock_map_delete(stab, sk, link_raw); +} + +static int sock_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + u32 i = *(u32 *)key; + struct sock **psk; + + if (unlikely(i >= map->max_entries)) + return -EINVAL; + + psk = &stab->sks[i]; + return __sock_map_delete(stab, NULL, psk); +} + +static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + u32 i = key ? *(u32 *)key : U32_MAX; + u32 *key_next = next; + + if (i == stab->map.max_entries - 1) + return -ENOENT; + if (i >= stab->map.max_entries) + *key_next = 0; + else + *key_next = i + 1; + return 0; +} + +static int sock_map_update_common(struct bpf_map *map, u32 idx, + struct sock *sk, u64 flags) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct sk_psock_link *link; + struct sk_psock *psock; + struct sock *osk; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(idx >= map->max_entries)) + return -E2BIG; + + link = sk_psock_init_link(); + if (!link) + return -ENOMEM; + + ret = sock_map_link(map, &stab->progs, sk); + if (ret < 0) + goto out_free; + + psock = sk_psock(sk); + WARN_ON_ONCE(!psock); + + raw_spin_lock_bh(&stab->lock); + osk = stab->sks[idx]; + if (osk && flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out_unlock; + } else if (!osk && flags == BPF_EXIST) { + ret = -ENOENT; + goto out_unlock; + } + + sock_map_add_link(psock, link, map, &stab->sks[idx]); + stab->sks[idx] = sk; + if (osk) + sock_map_unref(osk, &stab->sks[idx]); + raw_spin_unlock_bh(&stab->lock); + return 0; +out_unlock: + raw_spin_unlock_bh(&stab->lock); + if (psock) + sk_psock_put(sk, psock); +out_free: + sk_psock_free_link(link); + return ret; +} + +static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) +{ + return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; +} + +static bool sock_map_sk_is_suitable(const struct sock *sk) +{ + return sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP; +} + +static int sock_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + u32 ufd = *(u32 *)value; + u32 idx = *(u32 *)key; + struct socket *sock; + struct sock *sk; + int ret; + + sock = sockfd_lookup(ufd, &ret); + if (!sock) + return ret; + sk = sock->sk; + if (!sk) { + ret = -EINVAL; + goto out; + } + if (!sock_map_sk_is_suitable(sk) || + sk->sk_state != TCP_ESTABLISHED) { + ret = -EOPNOTSUPP; + goto out; + } + + sock_map_sk_acquire(sk); + ret = sock_map_update_common(map, idx, sk, flags); + sock_map_sk_release(sk); +out: + fput(sock->file); + return ret; +} + +BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (likely(sock_map_sk_is_suitable(sops->sk) && + sock_map_op_okay(sops))) + return sock_map_update_common(map, *(u32 *)key, sops->sk, + flags); + return -EOPNOTSUPP; +} + +const struct bpf_func_proto bpf_sock_map_update_proto = { + .func = bpf_sock_map_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, + struct bpf_map *, map, u32, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_sk_redirect_map_proto = { + .func = bpf_sk_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, + struct bpf_map *, map, u32, key, u64, flags) +{ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + msg->flags = flags; + msg->sk_redir = __sock_map_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_msg_redirect_map_proto = { + .func = bpf_msg_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_map_ops sock_map_ops = { + .map_alloc = sock_map_alloc, + .map_free = sock_map_free, + .map_get_next_key = sock_map_get_next_key, + .map_update_elem = sock_map_update_elem, + .map_delete_elem = sock_map_delete_elem, + .map_lookup_elem = sock_map_lookup, + .map_release_uref = sock_map_release_progs, + .map_check_btf = map_check_no_btf, +}; + +struct bpf_htab_elem { + struct rcu_head rcu; + u32 hash; + struct sock *sk; + struct hlist_node node; + u8 key[0]; +}; + +struct bpf_htab_bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + +struct bpf_htab { + struct bpf_map map; + struct bpf_htab_bucket *buckets; + u32 buckets_num; + u32 elem_size; + struct sk_psock_progs progs; + atomic_t count; +}; + +static inline u32 sock_hash_bucket_hash(const void *key, u32 len) +{ + return jhash(key, len, 0); +} + +static struct bpf_htab_bucket *sock_hash_select_bucket(struct bpf_htab *htab, + u32 hash) +{ + return &htab->buckets[hash & (htab->buckets_num - 1)]; +} + +static struct bpf_htab_elem * +sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, + u32 key_size) +{ + struct bpf_htab_elem *elem; + + hlist_for_each_entry_rcu(elem, head, node) { + if (elem->hash == hash && + !memcmp(&elem->key, key, key_size)) + return elem; + } + + return NULL; +} + +static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 key_size = map->key_size, hash; + struct bpf_htab_bucket *bucket; + struct bpf_htab_elem *elem; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + + return elem ? elem->sk : NULL; +} + +static void sock_hash_free_elem(struct bpf_htab *htab, + struct bpf_htab_elem *elem) +{ + atomic_dec(&htab->count); + kfree_rcu(elem, rcu); +} + +static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, + void *link_raw) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_htab_elem *elem_probe, *elem = link_raw; + struct bpf_htab_bucket *bucket; + + WARN_ON_ONCE(!rcu_read_lock_held()); + bucket = sock_hash_select_bucket(htab, elem->hash); + + /* elem may be deleted in parallel from the map, but access here + * is okay since it's going away only after RCU grace period. + * However, we need to check whether it's still present. + */ + raw_spin_lock_bh(&bucket->lock); + elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, + elem->key, map->key_size); + if (elem_probe && elem_probe == elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + } + raw_spin_unlock_bh(&bucket->lock); +} + +static int sock_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 hash, key_size = map->key_size; + struct bpf_htab_bucket *bucket; + struct bpf_htab_elem *elem; + int ret = -ENOENT; + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + + raw_spin_lock_bh(&bucket->lock); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + if (elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + ret = 0; + } + raw_spin_unlock_bh(&bucket->lock); + return ret; +} + +static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab, + void *key, u32 key_size, + u32 hash, struct sock *sk, + struct bpf_htab_elem *old) +{ + struct bpf_htab_elem *new; + + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + if (!old) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + } + + new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); + if (!new) { + atomic_dec(&htab->count); + return ERR_PTR(-ENOMEM); + } + memcpy(new->key, key, key_size); + new->sk = sk; + new->hash = hash; + return new; +} + +static int sock_hash_update_common(struct bpf_map *map, void *key, + struct sock *sk, u64 flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 key_size = map->key_size, hash; + struct bpf_htab_elem *elem, *elem_new; + struct bpf_htab_bucket *bucket; + struct sk_psock_link *link; + struct sk_psock *psock; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + link = sk_psock_init_link(); + if (!link) + return -ENOMEM; + + ret = sock_map_link(map, &htab->progs, sk); + if (ret < 0) + goto out_free; + + psock = sk_psock(sk); + WARN_ON_ONCE(!psock); + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + + raw_spin_lock_bh(&bucket->lock); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + if (elem && flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out_unlock; + } else if (!elem && flags == BPF_EXIST) { + ret = -ENOENT; + goto out_unlock; + } + + elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem); + if (IS_ERR(elem_new)) { + ret = PTR_ERR(elem_new); + goto out_unlock; + } + + sock_map_add_link(psock, link, map, elem_new); + /* Add new element to the head of the list, so that + * concurrent search will find it before old elem. + */ + hlist_add_head_rcu(&elem_new->node, &bucket->head); + if (elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + } + raw_spin_unlock_bh(&bucket->lock); + return 0; +out_unlock: + raw_spin_unlock_bh(&bucket->lock); + sk_psock_put(sk, psock); +out_free: + sk_psock_free_link(link); + return ret; +} + +static int sock_hash_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + u32 ufd = *(u32 *)value; + struct socket *sock; + struct sock *sk; + int ret; + + sock = sockfd_lookup(ufd, &ret); + if (!sock) + return ret; + sk = sock->sk; + if (!sk) { + ret = -EINVAL; + goto out; + } + if (!sock_map_sk_is_suitable(sk) || + sk->sk_state != TCP_ESTABLISHED) { + ret = -EOPNOTSUPP; + goto out; + } + + sock_map_sk_acquire(sk); + ret = sock_hash_update_common(map, key, sk, flags); + sock_map_sk_release(sk); +out: + fput(sock->file); + return ret; +} + +static int sock_hash_get_next_key(struct bpf_map *map, void *key, + void *key_next) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_htab_elem *elem, *elem_next; + u32 hash, key_size = map->key_size; + struct hlist_head *head; + int i = 0; + + if (!key) + goto find_first_elem; + hash = sock_hash_bucket_hash(key, key_size); + head = &sock_hash_select_bucket(htab, hash)->head; + elem = sock_hash_lookup_elem_raw(head, hash, key, key_size); + if (!elem) + goto find_first_elem; + + elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)), + struct bpf_htab_elem, node); + if (elem_next) { + memcpy(key_next, elem_next->key, key_size); + return 0; + } + + i = hash & (htab->buckets_num - 1); + i++; +find_first_elem: + for (; i < htab->buckets_num; i++) { + head = &sock_hash_select_bucket(htab, i)->head; + elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct bpf_htab_elem, node); + if (elem_next) { + memcpy(key_next, elem_next->key, key_size); + return 0; + } + } + + return -ENOENT; +} + +static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int i, err; + u64 cost; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->max_entries == 0 || + attr->key_size == 0 || + attr->value_size != 4 || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + if (attr->key_size > MAX_BPF_STACK) + return ERR_PTR(-E2BIG); + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&htab->map, attr); + + htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); + htab->elem_size = sizeof(struct bpf_htab_elem) + + round_up(htab->map.key_size, 8); + if (htab->buckets_num == 0 || + htab->buckets_num > U32_MAX / sizeof(struct bpf_htab_bucket)) { + err = -EINVAL; + goto free_htab; + } + + cost = (u64) htab->buckets_num * sizeof(struct bpf_htab_bucket) + + (u64) htab->elem_size * htab->map.max_entries; + if (cost >= U32_MAX - PAGE_SIZE) { + err = -EINVAL; + goto free_htab; + } + + htab->buckets = bpf_map_area_alloc(htab->buckets_num * + sizeof(struct bpf_htab_bucket), + htab->map.numa_node); + if (!htab->buckets) { + err = -ENOMEM; + goto free_htab; + } + + for (i = 0; i < htab->buckets_num; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } + + return &htab->map; +free_htab: + kfree(htab); + return ERR_PTR(err); +} + +static void sock_hash_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_htab_bucket *bucket; + struct bpf_htab_elem *elem; + struct hlist_node *node; + int i; + + synchronize_rcu(); + rcu_read_lock(); + for (i = 0; i < htab->buckets_num; i++) { + bucket = sock_hash_select_bucket(htab, i); + raw_spin_lock_bh(&bucket->lock); + hlist_for_each_entry_safe(elem, node, &bucket->head, node) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + } + raw_spin_unlock_bh(&bucket->lock); + } + rcu_read_unlock(); + + bpf_map_area_free(htab->buckets); + kfree(htab); +} + +static void sock_hash_release_progs(struct bpf_map *map) +{ + psock_progs_drop(&container_of(map, struct bpf_htab, map)->progs); +} + +BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (likely(sock_map_sk_is_suitable(sops->sk) && + sock_map_op_okay(sops))) + return sock_hash_update_common(map, key, sops->sk, flags); + return -EOPNOTSUPP; +} + +const struct bpf_func_proto bpf_sock_hash_update_proto = { + .func = bpf_sock_hash_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, + struct bpf_map *, map, void *, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_sk_redirect_hash_proto = { + .func = bpf_sk_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, + struct bpf_map *, map, void *, key, u64, flags) +{ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + msg->flags = flags; + msg->sk_redir = __sock_hash_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_msg_redirect_hash_proto = { + .func = bpf_msg_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_map_ops sock_hash_ops = { + .map_alloc = sock_hash_alloc, + .map_free = sock_hash_free, + .map_get_next_key = sock_hash_get_next_key, + .map_update_elem = sock_hash_update_elem, + .map_delete_elem = sock_hash_delete_elem, + .map_lookup_elem = sock_map_lookup, + .map_release_uref = sock_hash_release_progs, + .map_check_btf = map_check_no_btf, +}; + +static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) +{ + switch (map->map_type) { + case BPF_MAP_TYPE_SOCKMAP: + return &container_of(map, struct bpf_stab, map)->progs; + case BPF_MAP_TYPE_SOCKHASH: + return &container_of(map, struct bpf_htab, map)->progs; + default: + break; + } + + return NULL; +} + +int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + u32 which) +{ + struct sk_psock_progs *progs = sock_map_progs(map); + + if (!progs) + return -EOPNOTSUPP; + + switch (which) { + case BPF_SK_MSG_VERDICT: + psock_set_prog(&progs->msg_parser, prog); + break; + case BPF_SK_SKB_STREAM_PARSER: + psock_set_prog(&progs->skb_parser, prog); + break; + case BPF_SK_SKB_STREAM_VERDICT: + psock_set_prog(&progs->skb_verdict, prog); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link) +{ + switch (link->map->map_type) { + case BPF_MAP_TYPE_SOCKMAP: + return sock_map_delete_from_link(link->map, sk, + link->link_raw); + case BPF_MAP_TYPE_SOCKHASH: + return sock_hash_delete_from_link(link->map, sk, + link->link_raw); + default: + break; + } +} diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index b1a2c5e38530..37b4667128a3 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -279,7 +279,6 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, return ret; } -# ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -290,7 +289,6 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } -# endif #endif static struct ctl_table net_core_table[] = { @@ -397,6 +395,14 @@ static struct ctl_table net_core_table[] = { .extra2 = &one, }, # endif + { + .procname = "bpf_jit_limit", + .data = &bpf_jit_limit, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = &one, + }, #endif { .procname = "netdev_tstamp_prequeue", diff --git a/net/core/xdp.c b/net/core/xdp.c index 89b6785cef2a..4b2b194f4f1f 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -94,11 +94,21 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) kfree(xa); } -static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; int id = xdp_rxq->mem.id; + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return; + } + + if (xdp_rxq->mem.type != MEM_TYPE_PAGE_POOL && + xdp_rxq->mem.type != MEM_TYPE_ZERO_COPY) { + return; + } + if (id == 0) return; @@ -110,6 +120,7 @@ static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) mutex_unlock(&mem_id_lock); } +EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { @@ -119,7 +130,7 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); - __xdp_rxq_info_unreg_mem_model(xdp_rxq); + xdp_rxq_info_unreg_mem_model(xdp_rxq); xdp_rxq->reg_state = REG_STATE_UNREGISTERED; xdp_rxq->dev = NULL; @@ -398,3 +409,41 @@ void xdp_attachment_setup(struct xdp_attachment_info *info, info->flags = bpf->flags; } EXPORT_SYMBOL_GPL(xdp_attachment_setup); + +struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) +{ + unsigned int metasize, totsize; + void *addr, *data_to_copy; + struct xdp_frame *xdpf; + struct page *page; + + /* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */ + metasize = xdp_data_meta_unsupported(xdp) ? 0 : + xdp->data - xdp->data_meta; + totsize = xdp->data_end - xdp->data + metasize; + + if (sizeof(*xdpf) + totsize > PAGE_SIZE) + return NULL; + + page = dev_alloc_page(); + if (!page) + return NULL; + + addr = page_to_virt(page); + xdpf = addr; + memset(xdpf, 0, sizeof(*xdpf)); + + addr += sizeof(*xdpf); + data_to_copy = metasize ? xdp->data_meta : xdp->data; + memcpy(addr, data_to_copy, totsize); + + xdpf->data = addr + metasize; + xdpf->len = totsize - metasize; + xdpf->headroom = 0; + xdpf->metasize = metasize; + xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + + xdp_return_buff(xdp); + return xdpf; +} +EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); diff --git a/net/dccp/input.c b/net/dccp/input.c index d28d46bff6ab..85d6c879383d 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -606,11 +606,13 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (sk->sk_state == DCCP_LISTEN) { if (dh->dccph_type == DCCP_PKT_REQUEST) { /* It is possible that we process SYN packets from backlog, - * so we need to make sure to disable BH right there. + * so we need to make sure to disable BH and RCU right there. */ + rcu_read_lock(); local_bh_disable(); acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0; local_bh_enable(); + rcu_read_unlock(); if (!acceptable) return 1; consume_skb(skb); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b08feb219b44..8e08cea6f178 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -493,9 +493,11 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); + rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - ireq_opt_deref(ireq)); + rcu_dereference(ireq->ireq_opt)); + rcu_read_unlock(); err = net_xmit_eval(err); } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 875858c8b059..43733accf58e 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -325,7 +325,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock, __poll_t mask; struct sock *sk = sock->sk; - sock_poll_wait(file, wait); + sock_poll_wait(file, sock, wait); if (sk->sk_state == DCCP_LISTEN) return inet_csk_listen_poll(sk); diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index bfd43e8f2c06..d0b3e69c6b39 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -1363,7 +1363,7 @@ static int dn_dev_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-8s %1s %04u %04u %04lu %04lu" " %04hu %03d %02x %-10s %-7s %-7s\n", - dev->name ? dev->name : "???", + dev->name, dn_type2asc(dn_db->parms.mode), 0, 0, dn_db->t3, dn_db->parms.t3, diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 7f4534828f6c..a65d553e730d 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -48,27 +49,86 @@ const struct cred *dns_resolver_cache; /* * Preparse instantiation data for a dns_resolver key. * - * The data must be a NUL-terminated string, with the NUL char accounted in - * datalen. + * For normal hostname lookups, the data must be a NUL-terminated string, with + * the NUL char accounted in datalen. * * If the data contains a '#' characters, then we take the clause after each * one to be an option of the form 'key=value'. The actual data of interest is * the string leading up to the first '#'. For instance: * * "ip1,ip2,...#foo=bar" + * + * For server list requests, the data must begin with a NUL char and be + * followed by a byte indicating the version of the data format. Version 1 + * looks something like (note this is packed): + * + * u8 Non-string marker (ie. 0) + * u8 Content (DNS_PAYLOAD_IS_*) + * u8 Version (e.g. 1) + * u8 Source of server list + * u8 Lookup status of server list + * u8 Number of servers + * foreach-server { + * __le16 Name length + * __le16 Priority (as per SRV record, low first) + * __le16 Weight (as per SRV record, higher first) + * __le16 Port + * u8 Source of address list + * u8 Lookup status of address list + * u8 Protocol (DNS_SERVER_PROTOCOL_*) + * u8 Number of addresses + * char[] Name (not NUL-terminated) + * foreach-address { + * u8 Family (DNS_ADDRESS_IS_*) + * union { + * u8[4] ipv4_addr + * u8[16] ipv6_addr + * } + * } + * } + * */ static int dns_resolver_preparse(struct key_preparsed_payload *prep) { + const struct dns_payload_header *bin; struct user_key_payload *upayload; unsigned long derrno; int ret; int datalen = prep->datalen, result_len = 0; const char *data = prep->data, *end, *opt; + if (datalen <= 1 || !data) + return -EINVAL; + + if (data[0] == 0) { + /* It may be a server list. */ + if (datalen <= sizeof(*bin)) + return -EINVAL; + + bin = (const struct dns_payload_header *)data; + kenter("[%u,%u],%u", bin->content, bin->version, datalen); + if (bin->content != DNS_PAYLOAD_IS_SERVER_LIST) { + pr_warn_ratelimited( + "dns_resolver: Unsupported content type (%u)\n", + bin->content); + return -EINVAL; + } + + if (bin->version != 1) { + pr_warn_ratelimited( + "dns_resolver: Unsupported server list version (%u)\n", + bin->version); + return -EINVAL; + } + + result_len = datalen; + goto store_result; + } + kenter("'%*.*s',%u", datalen, datalen, data, datalen); - if (datalen <= 1 || !data || data[datalen - 1] != '\0') + if (!data || data[datalen - 1] != '\0') return -EINVAL; datalen--; @@ -144,6 +204,7 @@ dns_resolver_preparse(struct key_preparsed_payload *prep) return 0; } +store_result: kdebug("store result"); prep->quotalen = result_len; diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 49da67034f29..76338c38738a 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -148,12 +148,9 @@ int dns_query(const char *type, const char *name, size_t namelen, if (_result) { ret = -ENOMEM; - *_result = kmalloc(len + 1, GFP_KERNEL); + *_result = kmemdup_nul(upayload->data, len, GFP_KERNEL); if (!*_result) goto put; - - memcpy(*_result, upayload->data, len); - (*_result)[len] = '\0'; } if (_expiry) diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 4183e4ba27a5..48c41918fb35 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -38,6 +38,9 @@ config NET_DSA_TAG_DSA config NET_DSA_TAG_EDSA bool +config NET_DSA_TAG_GSWIP + bool + config NET_DSA_TAG_KSZ bool diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 9e4d3536f977..6e721f7a2947 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -9,6 +9,7 @@ dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_BRCM_PREPEND) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o +dsa_core-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o dsa_core-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o dsa_core-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o dsa_core-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 9f3209ff7ffd..a69c1790bbfc 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -52,6 +52,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { #ifdef CONFIG_NET_DSA_TAG_EDSA [DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops, #endif +#ifdef CONFIG_NET_DSA_TAG_GSWIP + [DSA_TAG_PROTO_GSWIP] = &gswip_netdev_ops, +#endif #ifdef CONFIG_NET_DSA_TAG_KSZ [DSA_TAG_PROTO_KSZ] = &ksz_netdev_ops, #endif @@ -70,6 +73,52 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { [DSA_TAG_PROTO_NONE] = &none_ops, }; +const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops) +{ + const char *protocol_name[DSA_TAG_LAST] = { +#ifdef CONFIG_NET_DSA_TAG_BRCM + [DSA_TAG_PROTO_BRCM] = "brcm", +#endif +#ifdef CONFIG_NET_DSA_TAG_BRCM_PREPEND + [DSA_TAG_PROTO_BRCM_PREPEND] = "brcm-prepend", +#endif +#ifdef CONFIG_NET_DSA_TAG_DSA + [DSA_TAG_PROTO_DSA] = "dsa", +#endif +#ifdef CONFIG_NET_DSA_TAG_EDSA + [DSA_TAG_PROTO_EDSA] = "edsa", +#endif +#ifdef CONFIG_NET_DSA_TAG_GSWIP + [DSA_TAG_PROTO_GSWIP] = "gswip", +#endif +#ifdef CONFIG_NET_DSA_TAG_KSZ + [DSA_TAG_PROTO_KSZ] = "ksz", +#endif +#ifdef CONFIG_NET_DSA_TAG_LAN9303 + [DSA_TAG_PROTO_LAN9303] = "lan9303", +#endif +#ifdef CONFIG_NET_DSA_TAG_MTK + [DSA_TAG_PROTO_MTK] = "mtk", +#endif +#ifdef CONFIG_NET_DSA_TAG_QCA + [DSA_TAG_PROTO_QCA] = "qca", +#endif +#ifdef CONFIG_NET_DSA_TAG_TRAILER + [DSA_TAG_PROTO_TRAILER] = "trailer", +#endif + [DSA_TAG_PROTO_NONE] = "none", + }; + unsigned int i; + + BUILD_BUG_ON(ARRAY_SIZE(protocol_name) != DSA_TAG_LAST); + + for (i = 0; i < ARRAY_SIZE(dsa_device_ops); i++) + if (ops == dsa_device_ops[i]) + return protocol_name[i]; + + return protocol_name[DSA_TAG_PROTO_NONE]; +}; + const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) { const struct dsa_device_ops *ops; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 3964c6f7a7c0..9e4fd04ab53c 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -86,6 +86,7 @@ struct dsa_slave_priv { /* dsa.c */ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); bool dsa_schedule_work(struct work_struct *work); +const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); /* legacy.c */ #if IS_ENABLED(CONFIG_NET_DSA_LEGACY) @@ -205,6 +206,9 @@ extern const struct dsa_device_ops dsa_netdev_ops; /* tag_edsa.c */ extern const struct dsa_device_ops edsa_netdev_ops; +/* tag_gswip.c */ +extern const struct dsa_device_ops gswip_netdev_ops; + /* tag_ksz.c */ extern const struct dsa_device_ops ksz_netdev_ops; diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 42a7b85b84e1..cb42939db776 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -392,8 +392,7 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd) } /* Drop our reference to the MDIO bus device */ - if (pd->chip[i].host_dev) - put_device(pd->chip[i].host_dev); + put_device(pd->chip[i].host_dev); } kfree(pd->chip); } @@ -687,8 +686,7 @@ static void dsa_shutdown(struct platform_device *pdev) #ifdef CONFIG_PM_SLEEP static int dsa_suspend(struct device *d) { - struct platform_device *pdev = to_platform_device(d); - struct dsa_switch_tree *dst = platform_get_drvdata(pdev); + struct dsa_switch_tree *dst = dev_get_drvdata(d); int i, ret = 0; for (i = 0; i < dst->pd->nr_chips; i++) { @@ -703,8 +701,7 @@ static int dsa_suspend(struct device *d) static int dsa_resume(struct device *d) { - struct platform_device *pdev = to_platform_device(d); - struct dsa_switch_tree *dst = platform_get_drvdata(pdev); + struct dsa_switch_tree *dst = dev_get_drvdata(d); int i, ret = 0; for (i = 0; i < dst->pd->nr_chips; i++) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 1c45c1d6d241..7d0c19e7edcf 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -722,7 +722,7 @@ static void dsa_slave_netpoll_cleanup(struct net_device *dev) p->netpoll = NULL; - __netpoll_free_async(netpoll); + __netpoll_free(netpoll); } static void dsa_slave_poll_controller(struct net_device *dev) @@ -1058,6 +1058,27 @@ static struct device_type dsa_type = { .name = "dsa", }; +static ssize_t tagging_show(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(d); + struct dsa_port *dp = dsa_slave_to_port(dev); + + return sprintf(buf, "%s\n", + dsa_tag_protocol_to_str(dp->cpu_dp->tag_ops)); +} +static DEVICE_ATTR_RO(tagging); + +static struct attribute *dsa_slave_attrs[] = { + &dev_attr_tagging.attr, + NULL +}; + +static const struct attribute_group dsa_group = { + .name = "dsa", + .attrs = dsa_slave_attrs, +}; + static void dsa_slave_phylink_validate(struct net_device *dev, unsigned long *supported, struct phylink_link_state *state) @@ -1353,8 +1374,14 @@ int dsa_slave_create(struct dsa_port *port) goto out_phy; } + ret = sysfs_create_group(&slave_dev->dev.kobj, &dsa_group); + if (ret) + goto out_unreg; + return 0; +out_unreg: + unregister_netdev(slave_dev); out_phy: rtnl_lock(); phylink_disconnect_phy(p->dp->pl); @@ -1378,6 +1405,7 @@ void dsa_slave_destroy(struct net_device *slave_dev) rtnl_unlock(); dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); + sysfs_remove_group(&slave_dev->dev.kobj, &dsa_group); unregister_netdev(slave_dev); phylink_destroy(dp->pl); free_percpu(p->stats64); @@ -1450,6 +1478,7 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) netdev_dbg(dev, "fdb add failed err=%d\n", err); break; } + fdb_info->offloaded = true; call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, dev, &fdb_info->info); break; diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c new file mode 100644 index 000000000000..49e9b73f1be3 --- /dev/null +++ b/net/dsa/tag_gswip.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel / Lantiq GSWIP V2.0 PMAC tag support + * + * Copyright (C) 2017 - 2018 Hauke Mehrtens + */ + +#include +#include +#include +#include + +#include "dsa_priv.h" + +#define GSWIP_TX_HEADER_LEN 4 + +/* special tag in TX path header */ +/* Byte 0 */ +#define GSWIP_TX_SLPID_SHIFT 0 /* source port ID */ +#define GSWIP_TX_SLPID_CPU 2 +#define GSWIP_TX_SLPID_APP1 3 +#define GSWIP_TX_SLPID_APP2 4 +#define GSWIP_TX_SLPID_APP3 5 +#define GSWIP_TX_SLPID_APP4 6 +#define GSWIP_TX_SLPID_APP5 7 + +/* Byte 1 */ +#define GSWIP_TX_CRCGEN_DIS BIT(7) +#define GSWIP_TX_DPID_SHIFT 0 /* destination group ID */ +#define GSWIP_TX_DPID_ELAN 0 +#define GSWIP_TX_DPID_EWAN 1 +#define GSWIP_TX_DPID_CPU 2 +#define GSWIP_TX_DPID_APP1 3 +#define GSWIP_TX_DPID_APP2 4 +#define GSWIP_TX_DPID_APP3 5 +#define GSWIP_TX_DPID_APP4 6 +#define GSWIP_TX_DPID_APP5 7 + +/* Byte 2 */ +#define GSWIP_TX_PORT_MAP_EN BIT(7) +#define GSWIP_TX_PORT_MAP_SEL BIT(6) +#define GSWIP_TX_LRN_DIS BIT(5) +#define GSWIP_TX_CLASS_EN BIT(4) +#define GSWIP_TX_CLASS_SHIFT 0 +#define GSWIP_TX_CLASS_MASK GENMASK(3, 0) + +/* Byte 3 */ +#define GSWIP_TX_DPID_EN BIT(0) +#define GSWIP_TX_PORT_MAP_SHIFT 1 +#define GSWIP_TX_PORT_MAP_MASK GENMASK(6, 1) + +#define GSWIP_RX_HEADER_LEN 8 + +/* special tag in RX path header */ +/* Byte 7 */ +#define GSWIP_RX_SPPID_SHIFT 4 +#define GSWIP_RX_SPPID_MASK GENMASK(6, 4) + +static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + int err; + u8 *gswip_tag; + + err = skb_cow_head(skb, GSWIP_TX_HEADER_LEN); + if (err) + return NULL; + + skb_push(skb, GSWIP_TX_HEADER_LEN); + + gswip_tag = skb->data; + gswip_tag[0] = GSWIP_TX_SLPID_CPU; + gswip_tag[1] = GSWIP_TX_DPID_ELAN; + gswip_tag[2] = GSWIP_TX_PORT_MAP_EN | GSWIP_TX_PORT_MAP_SEL; + gswip_tag[3] = BIT(dp->index + GSWIP_TX_PORT_MAP_SHIFT) & GSWIP_TX_PORT_MAP_MASK; + gswip_tag[3] |= GSWIP_TX_DPID_EN; + + return skb; +} + +static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt) +{ + int port; + u8 *gswip_tag; + + if (unlikely(!pskb_may_pull(skb, GSWIP_RX_HEADER_LEN))) + return NULL; + + gswip_tag = skb->data - ETH_HLEN; + + /* Get source port information */ + port = (gswip_tag[7] & GSWIP_RX_SPPID_MASK) >> GSWIP_RX_SPPID_SHIFT; + skb->dev = dsa_master_find_slave(dev, 0, port); + if (!skb->dev) + return NULL; + + /* remove GSWIP tag */ + skb_pull_rcsum(skb, GSWIP_RX_HEADER_LEN); + + return skb; +} + +const struct dsa_device_ops gswip_netdev_ops = { + .xmit = gswip_tag_xmit, + .rcv = gswip_tag_rcv, +}; diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index e7857a8ac86d..d14226ecfde4 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -260,7 +260,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, } sub_frag_mem_limit(fq->q.net, sum_truesize); - head->next = NULL; + skb_mark_not_on_list(head); head->dev = ldev; head->tstamp = fq->q.stamp; @@ -463,7 +463,6 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) table[0].data = &ieee802154_lowpan->frags.high_thresh; table[0].extra1 = &ieee802154_lowpan->frags.low_thresh; - table[0].extra2 = &init_net.ieee802154_lowpan.frags.high_thresh; table[1].data = &ieee802154_lowpan->frags.low_thresh; table[1].extra2 = &ieee802154_lowpan->frags.high_thresh; table[2].data = &ieee802154_lowpan->frags.timeout; diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 7446b98661d8..58629314eae9 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o +obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 4dd95cdd8070..c01fa791260d 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -461,9 +461,9 @@ static int ah4_err(struct sk_buff *skb, u32 info) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) - ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); + ipv4_update_pmtu(skb, net, info, 0, IPPROTO_AH); else - ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0); + ipv4_redirect(skb, net, 0, IPPROTO_AH); xfrm_state_put(x); return 0; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index e90c89ef8c08..850a6f13a082 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1255,6 +1255,8 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&arp_tbl, dev); + if (!netif_carrier_ok(dev)) + neigh_carrier_down(&arp_tbl, dev); break; default: break; diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 82178cc69c96..777fa3b7fb13 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1512,7 +1512,7 @@ static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def, * * Description: * Parse the packet's IP header looking for a CIPSO option. Returns a pointer - * to the start of the CIPSO option on success, NULL if one if not found. + * to the start of the CIPSO option on success, NULL if one is not found. * */ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) @@ -1522,10 +1522,8 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) int optlen; int taglen; - for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) { + for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) { switch (optptr[0]) { - case IPOPT_CIPSO: - return optptr; case IPOPT_END: return NULL; case IPOPT_NOOP: @@ -1534,6 +1532,11 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) default: taglen = optptr[1]; } + if (!taglen || taglen > optlen) + return NULL; + if (optptr[0] == IPOPT_CIPSO) + return optptr; + optlen -= taglen; optptr += taglen; } diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index f915abff1350..300921417f89 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -42,7 +42,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len oif = sk->sk_bound_dev_if; saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { - if (!oif) + if (!oif || netif_index_is_l3_master(sock_net(sk), oif)) oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ea4bd8a52422..a34602ae27de 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -100,6 +100,16 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, [IFA_FLAGS] = { .type = NLA_U32 }, [IFA_RT_PRIORITY] = { .type = NLA_U32 }, + [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, +}; + +struct inet_fill_args { + u32 portid; + u32 seq; + int event; + unsigned int flags; + int netnsid; + int ifindex; }; #define IN4_ADDR_HSIZE_SHIFT 8 @@ -773,7 +783,8 @@ static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, } static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, - __u32 *pvalid_lft, __u32 *pprefered_lft) + __u32 *pvalid_lft, __u32 *pprefered_lft, + struct netlink_ext_ack *extack) { struct nlattr *tb[IFA_MAX+1]; struct in_ifaddr *ifa; @@ -783,7 +794,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, int err; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, - NULL); + extack); if (err < 0) goto errout; @@ -888,7 +899,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, ASSERT_RTNL(); - ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft); + ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft, extack); if (IS_ERR(ifa)) return PTR_ERR(ifa); @@ -1584,13 +1595,14 @@ static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, } static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, - u32 portid, u32 seq, int event, unsigned int flags) + struct inet_fill_args *args) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; u32 preferred, valid; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags); + nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm), + args->flags); if (!nlh) return -EMSGSIZE; @@ -1601,6 +1613,10 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; + if (args->netnsid >= 0 && + nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) + goto nla_put_failure; + if (!(ifm->ifa_flags & IFA_F_PERMANENT)) { preferred = ifa->ifa_preferred_lft; valid = ifa->ifa_valid_lft; @@ -1645,27 +1661,142 @@ nla_put_failure: return -EMSGSIZE; } +static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, + struct inet_fill_args *fillargs, + struct net **tgt_net, struct sock *sk, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr *tb[IFA_MAX+1]; + struct ifaddrmsg *ifm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request"); + return -EINVAL; + } + + fillargs->ifindex = ifm->ifa_index; + if (fillargs->ifindex) { + cb->answer_flags |= NLM_F_DUMP_FILTERED; + fillargs->flags |= NLM_F_DUMP_FILTERED; + } + + err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv4_policy, extack); + if (err < 0) + return err; + + for (i = 0; i <= IFA_MAX; ++i) { + if (!tb[i]) + continue; + + if (i == IFA_TARGET_NETNSID) { + struct net *net; + + fillargs->netnsid = nla_get_s32(tb[i]); + + net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); + if (IS_ERR(net)) { + fillargs->netnsid = -1; + NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id"); + return PTR_ERR(net); + } + *tgt_net = net; + } else { + NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request"); + return -EINVAL; + } + } + + return 0; +} + +static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, + struct netlink_callback *cb, int s_ip_idx, + struct inet_fill_args *fillargs) +{ + struct in_ifaddr *ifa; + int ip_idx = 0; + int err; + + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + + err = inet_fill_ifaddr(skb, ifa, fillargs); + if (err < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + } + err = 0; + +done: + cb->args[2] = ip_idx; + + return err; +} + static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; + struct inet_fill_args fillargs = { + .portid = NETLINK_CB(cb->skb).portid, + .seq = nlh->nlmsg_seq, + .event = RTM_NEWADDR, + .flags = NLM_F_MULTI, + .netnsid = -1, + }; struct net *net = sock_net(skb->sk); + struct net *tgt_net = net; int h, s_h; int idx, s_idx; - int ip_idx, s_ip_idx; + int s_ip_idx; struct net_device *dev; struct in_device *in_dev; - struct in_ifaddr *ifa; struct hlist_head *head; + int err = 0; s_h = cb->args[0]; s_idx = idx = cb->args[1]; - s_ip_idx = ip_idx = cb->args[2]; + s_ip_idx = cb->args[2]; + + if (cb->strict_check) { + err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, + skb->sk, cb); + if (err < 0) + goto put_tgt_net; + + err = 0; + if (fillargs.ifindex) { + dev = __dev_get_by_index(tgt_net, fillargs.ifindex); + if (!dev) { + err = -ENODEV; + goto put_tgt_net; + } + + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) { + err = in_dev_dump_addr(in_dev, skb, cb, s_ip_idx, + &fillargs); + } + goto put_tgt_net; + } + } for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; - head = &net->dev_index_head[h]; + head = &tgt_net->dev_index_head[h]; rcu_read_lock(); - cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^ - net->dev_base_seq; + cb->seq = atomic_read(&tgt_net->ipv4.dev_addr_genid) ^ + tgt_net->dev_base_seq; hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; @@ -1675,18 +1806,11 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) if (!in_dev) goto cont; - for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; - ifa = ifa->ifa_next, ip_idx++) { - if (ip_idx < s_ip_idx) - continue; - if (inet_fill_ifaddr(skb, ifa, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWADDR, NLM_F_MULTI) < 0) { - rcu_read_unlock(); - goto done; - } - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + err = in_dev_dump_addr(in_dev, skb, cb, s_ip_idx, + &fillargs); + if (err < 0) { + rcu_read_unlock(); + goto done; } cont: idx++; @@ -1697,16 +1821,24 @@ cont: done: cb->args[0] = h; cb->args[1] = idx; - cb->args[2] = ip_idx; +put_tgt_net: + if (fillargs.netnsid >= 0) + put_net(tgt_net); - return skb->len; + return err < 0 ? err : skb->len; } static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid) { + struct inet_fill_args fillargs = { + .portid = portid, + .seq = nlh ? nlh->nlmsg_seq : 0, + .event = event, + .flags = 0, + .netnsid = -1, + }; struct sk_buff *skb; - u32 seq = nlh ? nlh->nlmsg_seq : 0; int err = -ENOBUFS; struct net *net; @@ -1715,7 +1847,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, if (!skb) goto errout; - err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0); + err = inet_fill_ifaddr(skb, ifa, &fillargs); if (err < 0) { /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -1995,6 +2127,7 @@ errout: static int inet_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); int h, s_h; int idx, s_idx; @@ -2002,6 +2135,21 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb, struct in_device *in_dev; struct hlist_head *head; + if (cb->strict_check) { + struct netlink_ext_ack *extack = cb->extack; + struct netconfmsg *ncm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ncm))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request"); + return -EINVAL; + } + } + s_h = cb->args[0]; s_idx = idx = cb->args[1]; @@ -2021,7 +2169,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb, if (inet_netconf_fill_devconf(skb, dev->ifindex, &in_dev->cnf, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) { @@ -2038,7 +2186,7 @@ cont: if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) goto done; @@ -2049,7 +2197,7 @@ cont: if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) goto done; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 97689012b357..9e1c840596c5 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -683,12 +683,11 @@ static void esp_input_done_esn(struct crypto_async_request *base, int err) */ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) { - struct ip_esp_hdr *esph; struct crypto_aead *aead = x->data; struct aead_request *req; struct sk_buff *trailer; int ivlen = crypto_aead_ivsize(aead); - int elen = skb->len - sizeof(*esph) - ivlen; + int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen; int nfrags; int assoclen; int seqhilen; @@ -698,13 +697,13 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) struct scatterlist *sg; int err = -EINVAL; - if (!pskb_may_pull(skb, sizeof(*esph) + ivlen)) + if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen)) goto out; if (elen <= 0) goto out; - assoclen = sizeof(*esph); + assoclen = sizeof(struct ip_esp_hdr); seqhilen = 0; if (x->props.flags & XFRM_STATE_ESN) { @@ -820,9 +819,9 @@ static int esp4_err(struct sk_buff *skb, u32 info) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) - ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); + ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ESP); else - ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0); + ipv4_redirect(skb, net, 0, IPPROTO_ESP); xfrm_state_put(x); return 0; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 2998b0e47d4b..6df95be96311 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -315,6 +315,32 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } +bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) +{ + bool dev_match = false; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + int ret; + + for (ret = 0; ret < fi->fib_nhs; ret++) { + struct fib_nh *nh = &fi->fib_nh[ret]; + + if (nh->nh_dev == dev) { + dev_match = true; + break; + } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { + dev_match = true; + break; + } + } +#else + if (fi->fib_nh[0].nh_dev == dev) + dev_match = true; +#endif + + return dev_match; +} +EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev); + /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. @@ -361,24 +387,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; fib_combine_itag(itag, &res); - dev_match = false; - -#ifdef CONFIG_IP_ROUTE_MULTIPATH - for (ret = 0; ret < res.fi->fib_nhs; ret++) { - struct fib_nh *nh = &res.fi->fib_nh[ret]; - if (nh->nh_dev == dev) { - dev_match = true; - break; - } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { - dev_match = true; - break; - } - } -#else - if (FIB_RES_DEV(res) == dev) - dev_match = true; -#endif + dev_match = fib_info_nh_uses_dev(res.fi, dev); if (dev_match) { ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; return ret; @@ -792,19 +802,115 @@ errout: return err; } +int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, + struct fib_dump_filter *filter, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr *tb[RTA_MAX + 1]; + struct rtmsg *rtm; + int err, i; + + ASSERT_RTNL(); + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); + return -EINVAL; + } + + rtm = nlmsg_data(nlh); + if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || + rtm->rtm_scope) { + NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); + return -EINVAL; + } + if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) { + NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request"); + return -EINVAL; + } + + filter->dump_all_families = (rtm->rtm_family == AF_UNSPEC); + filter->flags = rtm->rtm_flags; + filter->protocol = rtm->rtm_protocol; + filter->rt_type = rtm->rtm_type; + filter->table_id = rtm->rtm_table; + + err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + if (err < 0) + return err; + + for (i = 0; i <= RTA_MAX; ++i) { + int ifindex; + + if (!tb[i]) + continue; + + switch (i) { + case RTA_TABLE: + filter->table_id = nla_get_u32(tb[i]); + break; + case RTA_OIF: + ifindex = nla_get_u32(tb[i]); + filter->dev = __dev_get_by_index(net, ifindex); + if (!filter->dev) + return -ENODEV; + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + if (filter->flags || filter->protocol || filter->rt_type || + filter->table_id || filter->dev) { + filter->filter_set = 1; + cb->answer_flags = NLM_F_DUMP_FILTERED; + } + + return 0; +} +EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); + static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); + struct fib_dump_filter filter = {}; unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_head *head; int dumped = 0, err; - if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && - ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) + if (cb->strict_check) { + err = ip_valid_fib_dump_req(net, nlh, &filter, cb); + if (err < 0) + return err; + } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { + struct rtmsg *rtm = nlmsg_data(nlh); + + filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED); + } + + /* fib entries are never clones and ipv4 does not use prefix flag */ + if (filter.flags & (RTM_F_PREFIX | RTM_F_CLONED)) return skb->len; + if (filter.table_id) { + tb = fib_get_table(net, filter.table_id); + if (!tb) { + if (filter.dump_all_families) + return skb->len; + + NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist"); + return -ENOENT; + } + + err = fib_table_dump(tb, skb, cb, &filter); + return skb->len ? : err; + } + s_h = cb->args[0]; s_e = cb->args[1]; @@ -819,7 +925,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); - err = fib_table_dump(tb, skb, cb); + err = fib_table_dump(tb, skb, cb, &filter); if (err < 0) { if (likely(skb->len)) goto out; @@ -1243,7 +1349,8 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct netdev_notifier_changeupper_info *info; + struct netdev_notifier_changeupper_info *upper_info = ptr; + struct netdev_notifier_info_ext *info_ext = ptr; struct in_device *in_dev; struct net *net = dev_net(dev); unsigned int flags; @@ -1278,16 +1385,19 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo fib_sync_up(dev, RTNH_F_LINKDOWN); else fib_sync_down_dev(dev, event, false); - /* fall through */ + rt_cache_flush(net); + break; case NETDEV_CHANGEMTU: + fib_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(net); break; case NETDEV_CHANGEUPPER: - info = ptr; + upper_info = ptr; /* flush all routes if dev is linked to or unlinked from * an L3 master device (e.g., VRF) */ - if (info->upper_dev && netif_is_l3_master(info->upper_dev)) + if (upper_info->upper_dev && + netif_is_l3_master(upper_info->upper_dev)) fib_disable_ip(dev, NETDEV_DOWN, true); break; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f3c89ccf14c5..b5c3937ca6ec 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -208,7 +208,6 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); - struct dst_metrics *m; change_nexthops(fi) { if (nexthop_nh->nh_dev) @@ -219,9 +218,8 @@ static void free_fib_info_rcu(struct rcu_head *head) rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); - m = fi->fib_metrics; - if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) - kfree(m); + ip_fib_metrics_put(fi->fib_metrics); + kfree(fi); } @@ -797,8 +795,10 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, return -EINVAL; } dev = __dev_get_by_index(net, nh->nh_oif); - if (!dev) + if (!dev) { + NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); return -ENODEV; + } if (!(dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); @@ -1018,13 +1018,6 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) return true; } -static int -fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) -{ - return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len, - fi->fib_metrics->metrics); -} - struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack) { @@ -1082,16 +1075,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (!fi) goto failure; - if (cfg->fc_mx) { - fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); - if (unlikely(!fi->fib_metrics)) { - kfree(fi); - return ERR_PTR(err); - } - refcount_set(&fi->fib_metrics->refcnt, 1); - } else { - fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; + fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx, + cfg->fc_mx_len); + if (unlikely(IS_ERR(fi->fib_metrics))) { + err = PTR_ERR(fi->fib_metrics); + kfree(fi); + return ERR_PTR(err); } + fib_info_cnt++; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; @@ -1110,10 +1101,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto failure; } endfor_nexthops(fi) - err = fib_convert_metrics(fi, cfg); - if (err) - goto failure; - if (cfg->fc_mp) { #ifdef CONFIG_IP_ROUTE_MULTIPATH err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); @@ -1470,6 +1457,56 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh, return NOTIFY_DONE; } +/* Update the PMTU of exceptions when: + * - the new MTU of the first hop becomes smaller than the PMTU + * - the old MTU was the same as the PMTU, and it limited discovery of + * larger MTUs on the path. With that limit raised, we can now + * discover larger MTUs + * A special case is locked exceptions, for which the PMTU is smaller + * than the minimal accepted PMTU: + * - if the new MTU is greater than the PMTU, don't make any change + * - otherwise, unlock and set PMTU + */ +static void nh_update_mtu(struct fib_nh *nh, u32 new, u32 orig) +{ + struct fnhe_hash_bucket *bucket; + int i; + + bucket = rcu_dereference_protected(nh->nh_exceptions, 1); + if (!bucket) + return; + + for (i = 0; i < FNHE_HASH_SIZE; i++) { + struct fib_nh_exception *fnhe; + + for (fnhe = rcu_dereference_protected(bucket[i].chain, 1); + fnhe; + fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) { + if (fnhe->fnhe_mtu_locked) { + if (new <= fnhe->fnhe_pmtu) { + fnhe->fnhe_pmtu = new; + fnhe->fnhe_mtu_locked = false; + } + } else if (new < fnhe->fnhe_pmtu || + orig == fnhe->fnhe_pmtu) { + fnhe->fnhe_pmtu = new; + } + } + } +} + +void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) +{ + unsigned int hash = fib_devindex_hashfn(dev->ifindex); + struct hlist_head *head = &fib_info_devhash[hash]; + struct fib_nh *nh; + + hlist_for_each_entry(nh, head, nh_hash) { + if (nh->nh_dev == dev) + nh_update_mtu(nh, dev->mtu, orig_mtu); + } +} + /* Event force Flags Description * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5bc0c89e81e4..237c9f72b265 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2003,12 +2003,17 @@ void fib_free_table(struct fib_table *tb) } static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, - struct sk_buff *skb, struct netlink_callback *cb) + struct sk_buff *skb, struct netlink_callback *cb, + struct fib_dump_filter *filter) { + unsigned int flags = NLM_F_MULTI; __be32 xkey = htonl(l->key); struct fib_alias *fa; int i, s_i; + if (filter->filter_set) + flags |= NLM_F_DUMP_FILTERED; + s_i = cb->args[4]; i = 0; @@ -2016,25 +2021,35 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { int err; - if (i < s_i) { - i++; - continue; - } + if (i < s_i) + goto next; - if (tb->tb_id != fa->tb_id) { - i++; - continue; + if (tb->tb_id != fa->tb_id) + goto next; + + if (filter->filter_set) { + if (filter->rt_type && fa->fa_type != filter->rt_type) + goto next; + + if ((filter->protocol && + fa->fa_info->fib_protocol != filter->protocol)) + goto next; + + if (filter->dev && + !fib_info_nh_uses_dev(fa->fa_info, filter->dev)) + goto next; } err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, tb->tb_id, fa->fa_type, xkey, KEYLENGTH - fa->fa_slen, - fa->fa_tos, fa->fa_info, NLM_F_MULTI); + fa->fa_tos, fa->fa_info, flags); if (err < 0) { cb->args[4] = i; return err; } +next: i++; } @@ -2044,7 +2059,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, /* rcu_read_lock needs to be hold by caller from readside */ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, - struct netlink_callback *cb) + struct netlink_callback *cb, struct fib_dump_filter *filter) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; @@ -2057,7 +2072,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, while ((l = leaf_walk_rcu(&tp, key)) != NULL) { int err; - err = fn_trie_dump_leaf(l, tb, skb, cb); + err = fn_trie_dump_leaf(l, tb, skb, cb, filter); if (err < 0) { cb->args[3] = key; cb->args[2] = count; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index b798862b6be5..7efe740c06eb 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -86,13 +86,14 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, options = (__be32 *)(greh + 1); if (greh->flags & GRE_CSUM) { - if (skb_checksum_simple_validate(skb)) { + if (!skb_checksum_simple_validate(skb)) { + skb_checksum_try_convert(skb, IPPROTO_GRE, 0, + null_compute_pseudo); + } else if (csum_err) { *csum_err = true; return -EINVAL; } - skb_checksum_try_convert(skb, IPPROTO_GRE, 0, - null_compute_pseudo); options++; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 695979b7ef6d..d832beed6e3a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1098,9 +1098,9 @@ void icmp_err(struct sk_buff *skb, u32 info) } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) - ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0); + ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP); else if (type == ICMP_REDIRECT) - ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0); + ipv4_redirect(skb, net, 0, IPPROTO_ICMP); } /* diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 4da39446da2d..765b2b32c4a4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -111,13 +111,10 @@ #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ -#define IGMP_V1_ROUTER_PRESENT_TIMEOUT (400*HZ) -#define IGMP_V2_ROUTER_PRESENT_TIMEOUT (400*HZ) #define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ) #define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ) +#define IGMP_QUERY_INTERVAL (125*HZ) #define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ) -#define IGMP_QUERY_ROBUSTNESS_VARIABLE 2 - #define IGMP_INITIAL_REPORT_DELAY (1) @@ -935,13 +932,15 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, max_delay = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_v1_seen = jiffies + - IGMP_V1_ROUTER_PRESENT_TIMEOUT; + (in_dev->mr_qrv * in_dev->mr_qi) + + in_dev->mr_qri; group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); in_dev->mr_v2_seen = jiffies + - IGMP_V2_ROUTER_PRESENT_TIMEOUT; + (in_dev->mr_qrv * in_dev->mr_qi) + + in_dev->mr_qri; } /* cancel the interface change timer */ in_dev->mr_ifc_count = 0; @@ -981,8 +980,21 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ in_dev->mr_maxdelay = max_delay; - if (ih3->qrv) - in_dev->mr_qrv = ih3->qrv; + + /* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently + * received value was zero, use the default or statically + * configured value. + */ + in_dev->mr_qrv = ih3->qrv ?: net->ipv4.sysctl_igmp_qrv; + in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; + + /* RFC3376, 8.3. Query Response Interval: + * The number of seconds represented by the [Query Response + * Interval] must be less than the [Query Interval]. + */ + if (in_dev->mr_qri >= in_dev->mr_qi) + in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ; + if (!group) { /* general query */ if (ih3->nsrcs) return true; /* no sources allowed */ @@ -1723,18 +1735,30 @@ void ip_mc_down(struct in_device *in_dev) ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); } -void ip_mc_init_dev(struct in_device *in_dev) -{ #ifdef CONFIG_IP_MULTICAST +static void ip_mc_reset(struct in_device *in_dev) +{ struct net *net = dev_net(in_dev->dev); + + in_dev->mr_qi = IGMP_QUERY_INTERVAL; + in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL; + in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; +} +#else +static void ip_mc_reset(struct in_device *in_dev) +{ +} #endif + +void ip_mc_init_dev(struct in_device *in_dev) +{ ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0); timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0); - in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; #endif + ip_mc_reset(in_dev); spin_lock_init(&in_dev->mc_tomb_lock); } @@ -1744,15 +1768,10 @@ void ip_mc_init_dev(struct in_device *in_dev) void ip_mc_up(struct in_device *in_dev) { struct ip_mc_list *pmc; -#ifdef CONFIG_IP_MULTICAST - struct net *net = dev_net(in_dev->dev); -#endif ASSERT_RTNL(); -#ifdef CONFIG_IP_MULTICAST - in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; -#endif + ip_mc_reset(in_dev); ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); for_each_pmc_rtnl(in_dev, pmc) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index dfd5009f96ef..15e7f7915a21 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -544,7 +544,8 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, struct ip_options_rcu *opt; struct rtable *rt; - opt = ireq_opt_deref(ireq); + rcu_read_lock(); + opt = rcu_dereference(ireq->ireq_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, @@ -558,11 +559,13 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; + rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: + rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index f5c9ef2586de..411dd7a90046 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e7227128df2c..d6ee343fdb86 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -260,8 +260,7 @@ out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); - if (head) - kfree_skb(head); + kfree_skb(head); ipq_put(qp); } @@ -382,7 +381,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) */ if (end < qp->q.len || ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len)) - goto err; + goto discard_qp; qp->q.flags |= INET_FRAG_LAST_IN; qp->q.len = end; } else { @@ -394,20 +393,20 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (end > qp->q.len) { /* Some bits beyond end -> corruption. */ if (qp->q.flags & INET_FRAG_LAST_IN) - goto err; + goto discard_qp; qp->q.len = end; } } if (end == offset) - goto err; + goto discard_qp; err = -ENOMEM; if (!pskb_pull(skb, skb_network_offset(skb) + ihl)) - goto err; + goto discard_qp; err = pskb_trim_rcsum(skb, end - offset); if (err) - goto err; + goto discard_qp; /* Note : skb->rbnode and skb->dev share the same location. */ dev = skb->dev; @@ -423,6 +422,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) * We do the same here for IPv4 (and increment an snmp counter). */ + err = -EINVAL; /* Find out where to put this fragment. */ prev_tail = qp->q.fragments_tail; if (!prev_tail) @@ -431,7 +431,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) /* This is the common case: skb goes to the end. */ /* Detect and discard overlaps. */ if (offset < prev_tail->ip_defrag_offset + prev_tail->len) - goto discard_qp; + goto overlap; if (offset == prev_tail->ip_defrag_offset + prev_tail->len) ip4_frag_append_to_last_run(&qp->q, skb); else @@ -450,7 +450,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) FRAG_CB(skb1)->frag_run_len) rbn = &parent->rb_right; else /* Found an overlap with skb1. */ - goto discard_qp; + goto overlap; } while (*rbn); /* Here we have parent properly set, and rbn pointing to * one of its NULL left/right children. Insert skb. @@ -487,16 +487,18 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) skb->_skb_refdst = 0UL; err = ip_frag_reasm(qp, skb, prev_tail, dev); skb->_skb_refdst = orefdst; + if (err) + inet_frag_kill(&qp->q); return err; } skb_dst_drop(skb); return -EINPROGRESS; +overlap: + __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); discard_qp: inet_frag_kill(&qp->q); - err = -EINVAL; - __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); err: kfree_skb(skb); return err; @@ -621,7 +623,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, sub_frag_mem_limit(qp->q.net, head->truesize); *nextp = NULL; - head->next = NULL; + skb_mark_not_on_list(head); head->prev = NULL; head->dev = dev; head->tstamp = qp->q.stamp; @@ -720,10 +722,14 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) if (ip_is_fragment(&iph)) { skb = skb_share_check(skb, GFP_ATOMIC); if (skb) { - if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) - return skb; - if (pskb_trim_rcsum(skb, netoff + len)) - return skb; + if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) { + kfree_skb(skb); + return NULL; + } + if (pskb_trim_rcsum(skb, netoff + len)) { + kfree_skb(skb); + return NULL; + } memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(net, skb, user)) return NULL; @@ -820,7 +826,6 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) table[0].data = &net->ipv4.frags.high_thresh; table[0].extra1 = &net->ipv4.frags.low_thresh; - table[0].extra2 = &init_net.ipv4.frags.high_thresh; table[1].data = &net->ipv4.frags.low_thresh; table[1].extra2 = &net->ipv4.frags.high_thresh; table[2].data = &net->ipv4.frags.timeout; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 8cce0e9ea08c..38befe829caf 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -232,22 +232,19 @@ static void gre_err(struct sk_buff *skb, u32 info) const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct tnl_ptk_info tpi; - bool csum_err = false; - if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), - iph->ihl * 4) < 0) { - if (!csum_err) /* ignore csum errors. */ - return; - } + if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP), + iph->ihl * 4) < 0) + return; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, - skb->dev->ifindex, 0, IPPROTO_GRE, 0); + skb->dev->ifindex, IPPROTO_GRE); return; } if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0, - IPPROTO_GRE, 0); + ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, + IPPROTO_GRE); return; } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 3196cf58f418..35a786c0aaa0 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -531,11 +531,7 @@ static void ip_sublist_rcv_finish(struct list_head *head) struct sk_buff *skb, *next; list_for_each_entry_safe(skb, next, head, list) { - list_del(&skb->list); - /* Handle ip{6}_forward case, as sch_direct_xmit have - * another kind of SKB-list usage (see validate_xmit_skb_list) - */ - skb->next = NULL; + skb_list_del_init(skb); dst_input(skb); } } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9c4e72e9c60a..c09219e7f230 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -278,7 +278,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *nskb = segs->next; int err; - segs->next = NULL; + skb_mark_not_on_list(segs); err = ip_fragment(net, sk, segs, mtu, ip_finish_output2); if (err && ret == 0) @@ -684,7 +684,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, skb = frag; frag = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); } if (err == 0) { diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c0fe5ad996f2..fffcc130900e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -149,7 +149,6 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { struct sockaddr_in sin; - const struct iphdr *iph = ip_hdr(skb); __be16 *ports; int end; @@ -164,7 +163,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) ports = (__be16 *)skb_transport_header(skb); sin.sin_family = AF_INET; - sin.sin_addr.s_addr = iph->daddr; + sin.sin_addr.s_addr = ip_hdr(skb)->daddr; sin.sin_port = ports[1]; memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); @@ -1247,7 +1246,7 @@ int ip_setsockopt(struct sock *sk, int level, return -ENOPROTOOPT; err = do_ip_setsockopt(sk, level, optname, optval, optlen); -#ifdef CONFIG_BPFILTER +#if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_SET_REPLACE && optname < BPFILTER_IPT_SET_MAX) err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen); @@ -1560,7 +1559,7 @@ int ip_getsockopt(struct sock *sk, int level, int err; err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0); -#ifdef CONFIG_BPFILTER +#if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_GET_INFO && optname < BPFILTER_IPT_GET_MAX) err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); @@ -1597,7 +1596,7 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, err = do_ip_getsockopt(sk, level, optname, optval, optlen, MSG_CMSG_COMPAT); -#ifdef CONFIG_BPFILTER +#if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_GET_INFO && optname < BPFILTER_IPT_GET_MAX) err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index f38cb21d773d..de31b302d69c 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -318,9 +318,9 @@ static int vti4_err(struct sk_buff *skb, u32 info) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) - ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0); + ipv4_update_pmtu(skb, net, info, 0, protocol); else - ipv4_redirect(skb, net, 0, 0, protocol, 0); + ipv4_redirect(skb, net, 0, protocol); xfrm_state_put(x); return 0; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index d97f4f2787f5..9119d012ba46 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -48,9 +48,9 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) - ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); + ipv4_update_pmtu(skb, net, info, 0, IPPROTO_COMP); else - ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0); + ipv4_redirect(skb, net, 0, IPPROTO_COMP); xfrm_state_put(x); return 0; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index c891235b4966..e65287c27e3d 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -175,13 +175,12 @@ static int ipip_err(struct sk_buff *skb, u32 info) } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - ipv4_update_pmtu(skb, net, info, t->parms.link, 0, - iph->protocol, 0); + ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol); goto out; } if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, net, t->parms.link, 0, iph->protocol, 0); + ipv4_redirect(skb, net, t->parms.link, iph->protocol); goto out; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 5660adcf7a04..a6defbec4f1b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2527,8 +2527,34 @@ errout_free: static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { + struct fib_dump_filter filter = {}; + int err; + + if (cb->strict_check) { + err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh, + &filter, cb); + if (err < 0) + return err; + } + + if (filter.table_id) { + struct mr_table *mrt; + + mrt = ipmr_get_table(sock_net(skb->sk), filter.table_id); + if (!mrt) { + if (filter.dump_all_families) + return skb->len; + + NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist"); + return -ENOENT; + } + err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute, + &mfc_unres_lock, &filter); + return skb->len ? : err; + } + return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, - _ipmr_fill_mroute, &mfc_unres_lock); + _ipmr_fill_mroute, &mfc_unres_lock, &filter); } static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = { @@ -2710,6 +2736,31 @@ static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb) return true; } +static int ipmr_valid_dumplink(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct ifinfomsg *ifm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in ipmr link dump"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change || ifm->ifi_index) { + NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request"); + return -EINVAL; + } + + return 0; +} + static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); @@ -2718,6 +2769,13 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) unsigned int e = 0, s_e; struct mr_table *mrt; + if (cb->strict_check) { + int err = ipmr_valid_dumplink(cb->nlh, cb->extack); + + if (err < 0) + return err; + } + s_t = cb->args[0]; s_e = cb->args[1]; diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 1ad9aa62a97b..3e614cc824f7 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -268,6 +268,81 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, } EXPORT_SYMBOL(mr_fill_mroute); +static bool mr_mfc_uses_dev(const struct mr_table *mrt, + const struct mr_mfc *c, + const struct net_device *dev) +{ + int ct; + + for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { + if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { + const struct vif_device *vif; + + vif = &mrt->vif_table[ct]; + if (vif->dev == dev) + return true; + } + } + return false; +} + +int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb, + struct netlink_callback *cb, + int (*fill)(struct mr_table *mrt, struct sk_buff *skb, + u32 portid, u32 seq, struct mr_mfc *c, + int cmd, int flags), + spinlock_t *lock, struct fib_dump_filter *filter) +{ + unsigned int e = 0, s_e = cb->args[1]; + unsigned int flags = NLM_F_MULTI; + struct mr_mfc *mfc; + int err; + + if (filter->filter_set) + flags |= NLM_F_DUMP_FILTERED; + + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { + if (e < s_e) + goto next_entry; + if (filter->dev && + !mr_mfc_uses_dev(mrt, mfc, filter->dev)) + goto next_entry; + + err = fill(mrt, skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags); + if (err < 0) + goto out; +next_entry: + e++; + } + + spin_lock_bh(lock); + list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { + if (e < s_e) + goto next_entry2; + if (filter->dev && + !mr_mfc_uses_dev(mrt, mfc, filter->dev)) + goto next_entry2; + + err = fill(mrt, skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags); + if (err < 0) { + spin_unlock_bh(lock); + goto out; + } +next_entry2: + e++; + } + spin_unlock_bh(lock); + err = 0; + e = 0; + +out: + cb->args[1] = e; + return err; +} +EXPORT_SYMBOL(mr_table_dump); + int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, struct mr_table *(*iter)(struct net *net, struct mr_table *mrt), @@ -275,53 +350,35 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), - spinlock_t *lock) + spinlock_t *lock, struct fib_dump_filter *filter) { - unsigned int t = 0, e = 0, s_t = cb->args[0], s_e = cb->args[1]; + unsigned int t = 0, s_t = cb->args[0]; struct net *net = sock_net(skb->sk); struct mr_table *mrt; - struct mr_mfc *mfc; + int err; + + /* multicast does not track protocol or have route type other + * than RTN_MULTICAST + */ + if (filter->filter_set) { + if (filter->protocol || filter->flags || + (filter->rt_type && filter->rt_type != RTN_MULTICAST)) + return skb->len; + } rcu_read_lock(); for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) { if (t < s_t) goto next_table; - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { - if (e < s_e) - goto next_entry; - if (fill(mrt, skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, mfc, - RTM_NEWROUTE, NLM_F_MULTI) < 0) - goto done; -next_entry: - e++; - } - e = 0; - s_e = 0; - - spin_lock_bh(lock); - list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { - if (e < s_e) - goto next_entry2; - if (fill(mrt, skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, mfc, - RTM_NEWROUTE, NLM_F_MULTI) < 0) { - spin_unlock_bh(lock); - goto done; - } -next_entry2: - e++; - } - spin_unlock_bh(lock); - e = 0; - s_e = 0; + + err = mr_table_dump(mrt, skb, cb, fill, lock, filter); + if (err < 0) + break; next_table: t++; } -done: rcu_read_unlock(); - cb->args[1] = e; cb->args[0] = t; return skb->len; diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 04311f7067e2..6d218f5a2e71 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -5,8 +5,8 @@ #include #include -int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, - u32 *metrics) +static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, + int fc_mx_len, u32 *metrics) { bool ecn_ca = false; struct nlattr *nla; @@ -52,4 +52,28 @@ int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, return 0; } -EXPORT_SYMBOL_GPL(ip_metrics_convert); + +struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, + int fc_mx_len) +{ + struct dst_metrics *fib_metrics; + int err; + + if (!fc_mx) + return (struct dst_metrics *)&dst_default_metrics; + + fib_metrics = kzalloc(sizeof(*fib_metrics), GFP_KERNEL); + if (unlikely(!fib_metrics)) + return ERR_PTR(-ENOMEM); + + err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics); + if (!err) { + refcount_set(&fib_metrics->refcnt, 1); + } else { + kfree(fib_metrics); + fib_metrics = ERR_PTR(err); + } + + return fib_metrics; +} +EXPORT_SYMBOL_GPL(ip_fib_metrics_init); diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 12843c9ef142..0b10d8812828 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -36,7 +36,6 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4, const struct net_device *dev, u8 flags) { struct fib_result res; - bool dev_match; int ret __maybe_unused; if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) @@ -46,21 +45,7 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4, if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL)) return false; } - dev_match = false; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - for (ret = 0; ret < res.fi->fib_nhs; ret++) { - struct fib_nh *nh = &res.fi->fib_nh[ret]; - - if (nh->nh_dev == dev) { - dev_match = true; - break; - } - } -#else - if (FIB_RES_DEV(res) == dev) - dev_match = true; -#endif - return dev_match || flags & XT_RPFILTER_LOOSE; + return fib_info_nh_uses_dev(res.fi, dev) || flags & XT_RPFILTER_LOOSE; } static bool diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 6115bf1ff6f0..78a67f961d86 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -264,7 +264,6 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, return nf_nat_inet_fn(priv, skb, state); } -EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn); static unsigned int nf_nat_ipv4_in(void *priv, struct sk_buff *skb, diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index ad3aeff152ed..a9d5e013e555 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -104,12 +104,26 @@ static int masq_device_event(struct notifier_block *this, return NOTIFY_DONE; } +static int inet_cmp(struct nf_conn *ct, void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct net_device *dev = ifa->ifa_dev->dev; + struct nf_conntrack_tuple *tuple; + + if (!device_cmp(ct, (void *)(long)dev->ifindex)) + return 0; + + tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + + return ifa->ifa_address == tuple->dst.u3.ip; +} + static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev; - struct netdev_notifier_info info; + struct net *net = dev_net(idev->dev); /* The masq_dev_notifier will catch the case of the device going * down. So if the inetdev is dead and being destroyed we have @@ -119,8 +133,10 @@ static int masq_inet_event(struct notifier_block *this, if (idev->dead) return NOTIFY_DONE; - netdev_notifier_info_init(&info, idev->dev); - return masq_device_event(this, event, &info); + if (event == NETDEV_DOWN) + nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0); + + return NOTIFY_DONE; } static struct notifier_block masq_dev_notifier = { diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c index ac110c1d55b5..a0aa13bcabda 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c @@ -60,6 +60,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris "); MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway"); MODULE_ALIAS("ip_nat_snmp_basic"); +MODULE_ALIAS_NFCT_HELPER("snmp_trap"); #define SNMP_PORT 161 #define SNMP_TRAP_PORT 162 diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index e50976e3c213..94eb25bc8d7e 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -76,10 +76,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, .flowi4_iif = LOOPBACK_IFINDEX, }; const struct net_device *oif; - struct net_device *found; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - int i; -#endif + const struct net_device *found; /* * Do not set flowi4_oif, it restricts results (for example, asking @@ -146,25 +143,13 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (!oif) { found = FIB_RES_DEV(res); - goto ok; - } - -#ifdef CONFIG_IP_ROUTE_MULTIPATH - for (i = 0; i < res.fi->fib_nhs; i++) { - struct fib_nh *nh = &res.fi->fib_nh[i]; + } else { + if (!fib_info_nh_uses_dev(res.fi, oif)) + return; - if (nh->nh_dev == oif) { - found = nh->nh_dev; - goto ok; - } + found = oif; } - return; -#else - found = FIB_RES_DEV(res); - if (found != oif) - return; -#endif -ok: + switch (priv->result) { case NFT_FIB_RESULT_OIF: *dest = found->ifindex; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 8d7aaf118a30..7ccb5f87f70b 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -779,7 +779,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 33df4d76db2d..8ca3eb06ba04 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -608,7 +608,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) tos |= RTO_ONLINK; if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b678466da451..c0a9d26c06ce 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1001,21 +1001,22 @@ out: kfree_skb(skb); static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; + u32 old_mtu = ipv4_mtu(dst); struct fib_result res; bool lock = false; if (ip_mtu_locked(dst)) return; - if (ipv4_mtu(dst) < mtu) + if (old_mtu < mtu) return; if (mtu < ip_rt_min_pmtu) { lock = true; - mtu = ip_rt_min_pmtu; + mtu = min(old_mtu, ip_rt_min_pmtu); } - if (rt->rt_pmtu == mtu && + if (rt->rt_pmtu == mtu && !lock && time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2)) return; @@ -1040,17 +1041,15 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, - int oif, u32 mark, u8 protocol, int flow_flags) + int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; - - if (!mark) - mark = IP4_REPLY_MARK(net, skb->mark); + u32 mark = IP4_REPLY_MARK(net, skb->mark); __build_flow_key(net, &fl4, NULL, iph, oif, - RT_TOS(iph->tos), protocol, mark, flow_flags); + RT_TOS(iph->tos), protocol, mark, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); @@ -1132,14 +1131,14 @@ out: EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, - int oif, u32 mark, u8 protocol, int flow_flags) + int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; __build_flow_key(net, &fl4, NULL, iph, oif, - RT_TOS(iph->tos), protocol, mark, flow_flags); + RT_TOS(iph->tos), protocol, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); @@ -1219,18 +1218,15 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) src = ip_hdr(skb)->saddr; else { struct fib_result res; - struct flowi4 fl4; - struct iphdr *iph; - - iph = ip_hdr(skb); - - memset(&fl4, 0, sizeof(fl4)); - fl4.daddr = iph->daddr; - fl4.saddr = iph->saddr; - fl4.flowi4_tos = RT_TOS(iph->tos); - fl4.flowi4_oif = rt->dst.dev->ifindex; - fl4.flowi4_iif = skb->dev->ifindex; - fl4.flowi4_mark = skb->mark; + struct iphdr *iph = ip_hdr(skb); + struct flowi4 fl4 = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowi4_tos = RT_TOS(iph->tos), + .flowi4_oif = rt->dst.dev->ifindex, + .flowi4_iif = skb->dev->ifindex, + .flowi4_mark = skb->mark, + }; rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) @@ -1481,12 +1477,9 @@ void rt_del_uncached_list(struct rtable *rt) static void ipv4_dst_destroy(struct dst_entry *dst) { - struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rtable *rt = (struct rtable *)dst; - if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) - kfree(p); - + ip_dst_metrics_put(dst); rt_del_uncached_list(rt); } @@ -1533,11 +1526,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, rt->rt_gateway = nh->nh_gw; rt->rt_uses_gateway = 1; } - dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); - if (fi->fib_metrics != &dst_default_metrics) { - rt->dst._metrics |= DST_METRICS_REFCOUNTED; - refcount_inc(&fi->fib_metrics->refcnt); - } + ip_dst_init_metrics(&rt->dst, fi->fib_metrics); + #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif @@ -2785,7 +2775,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct rtable *rt = NULL; struct sk_buff *skb; struct rtmsg *rtm; - struct flowi4 fl4; + struct flowi4 fl4 = {}; __be32 dst = 0; __be32 src = 0; kuid_t uid; @@ -2825,7 +2815,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!skb) return -ENOBUFS; - memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; fl4.saddr = src; fl4.flowi4_tos = rtm->rtm_tos; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index c3387dfd725b..606f868d9f3f 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -88,7 +88,7 @@ u64 cookie_init_timestamp(struct request_sock *req) ts <<= TSBITS; ts |= options; } - return (u64)ts * (USEC_PER_SEC / TCP_TS_HZ); + return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b92f422f2fa8..891ed2f91467 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -48,6 +48,7 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static int comp_sack_nr_max = 255; +static u32 u32_max_div_HZ = UINT_MAX / HZ; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -745,9 +746,10 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_probe_interval", .data = &init_net.ipv4.sysctl_tcp_probe_interval, - .maxlen = sizeof(int), + .maxlen = sizeof(u32), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_douintvec_minmax, + .extra2 = &u32_max_div_HZ, }, { .procname = "igmp_link_local_mcast_reports", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 10c6246396cc..9e6bc4d6daa7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -262,7 +262,7 @@ #include #include #include -#include +#include #include #include #include @@ -507,7 +507,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) const struct tcp_sock *tp = tcp_sk(sk); int state; - sock_poll_wait(file, wait); + sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); if (state == TCP_LISTEN) @@ -1295,7 +1295,7 @@ new_segment: copy = size_goal; /* All packets are restored as if they have - * already been sent. skb_mstamp isn't set to + * already been sent. skb_mstamp_ns isn't set to * avoid wrong rtt estimation. */ if (tp->repair) @@ -1753,6 +1753,7 @@ static int tcp_zerocopy_receive(struct sock *sk, struct vm_area_struct *vma; struct sk_buff *skb = NULL; struct tcp_sock *tp; + int inq; int ret; if (address & (PAGE_SIZE - 1) || address != zc->address) @@ -1773,12 +1774,15 @@ static int tcp_zerocopy_receive(struct sock *sk, tp = tcp_sk(sk); seq = tp->copied_seq; - zc->length = min_t(u32, zc->length, tcp_inq(sk)); + inq = tcp_inq(sk); + zc->length = min_t(u32, zc->length, inq); zc->length &= ~(PAGE_SIZE - 1); - - zap_page_range(vma, address, zc->length); - - zc->recv_skip_hint = 0; + if (zc->length) { + zap_page_range(vma, address, zc->length); + zc->recv_skip_hint = 0; + } else { + zc->recv_skip_hint = inq; + } ret = 0; while (length + PAGE_SIZE <= zc->length) { if (zc->recv_skip_hint < PAGE_SIZE) { @@ -1801,8 +1805,17 @@ static int tcp_zerocopy_receive(struct sock *sk, frags++; } } - if (frags->size != PAGE_SIZE || frags->page_offset) + if (frags->size != PAGE_SIZE || frags->page_offset) { + int remaining = zc->recv_skip_hint; + + while (remaining && (frags->size != PAGE_SIZE || + frags->page_offset)) { + remaining -= frags->size; + frags++; + } + zc->recv_skip_hint -= remaining; break; + } ret = vm_insert_page(vma, address + length, skb_frag_page(frags)); if (ret) @@ -2403,16 +2416,10 @@ adjudge_to_death: sock_hold(sk); sock_orphan(sk); - /* It is the last release_sock in its life. It will remove backlog. */ - release_sock(sk); - - - /* Now socket is owned by kernel and we acquire BH lock - * to finish close. No need to check for user refs. - */ local_bh_disable(); bh_lock_sock(sk); - WARN_ON(sock_owned_by_user(sk)); + /* remove backlog if any, without releasing ownership. */ + __release_sock(sk); percpu_counter_inc(sk->sk_prot->orphan_count); @@ -2481,6 +2488,7 @@ adjudge_to_death: out: bh_unlock_sock(sk); local_bh_enable(); + release_sock(sk); sock_put(sk); } EXPORT_SYMBOL(tcp_close); @@ -2595,6 +2603,8 @@ int tcp_disconnect(struct sock *sk, int flags) tp->compressed_ack = 0; tp->bytes_sent = 0; tp->bytes_retrans = 0; + tp->duplicate_sack[0].start_seq = 0; + tp->duplicate_sack[0].end_seq = 0; tp->dsack_dups = 0; tp->reord_seen = 0; @@ -3101,10 +3111,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); + unsigned long rate; u32 now; u64 rate64; bool slow; - u32 rate; memset(info, 0, sizeof(*info)); if (sk->sk_type != SOCK_STREAM) @@ -3114,11 +3124,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) /* Report meaningful fields for all TCP states, including listeners */ rate = READ_ONCE(sk->sk_pacing_rate); - rate64 = rate != ~0U ? rate : ~0ULL; + rate64 = (rate != ~0UL) ? rate : ~0ULL; info->tcpi_pacing_rate = rate64; rate = READ_ONCE(sk->sk_max_pacing_rate); - rate64 = rate != ~0U ? rate : ~0ULL; + rate64 = (rate != ~0UL) ? rate : ~0ULL; info->tcpi_max_pacing_rate = rate64; info->tcpi_reordering = tp->reordering; @@ -3244,8 +3254,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; struct tcp_info info; + unsigned long rate; u64 rate64; - u32 rate; stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); if (!stats) @@ -3264,7 +3274,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) tp->total_retrans, TCP_NLA_PAD); rate = READ_ONCE(sk->sk_pacing_rate); - rate64 = rate != ~0U ? rate : ~0ULL; + rate64 = (rate != ~0UL) ? rate : ~0ULL; nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); rate64 = tcp_compute_delivery_rate(tp); @@ -3894,8 +3904,8 @@ void __init tcp_init(void) init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; - init_net.ipv4.sysctl_tcp_rmem[1] = 87380; - init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare); + init_net.ipv4.sysctl_tcp_rmem[1] = 131072; + init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 02ff2dde9609..9277abdd822a 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -128,6 +128,9 @@ static const u32 bbr_probe_rtt_mode_ms = 200; /* Skip TSO below the following bandwidth (bits/sec): */ static const int bbr_min_tso_rate = 1200000; +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */ +static const int bbr_pacing_margin_percent = 1; + /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain * that will allow a smoothly increasing pacing rate that will double each RTT * and send the same number of packets per RTT that an un-paced, slow-starting @@ -208,17 +211,15 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) { unsigned int mss = tcp_sk(sk)->mss_cache; - if (!tcp_needs_internal_pacing(sk)) - mss = tcp_mss_to_mtu(sk, mss); rate *= mss; rate *= gain; rate >>= BBR_SCALE; - rate *= USEC_PER_SEC; + rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); return rate >> BW_SCALE; } /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ -static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) +static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) { u64 rate = bw; @@ -257,7 +258,7 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); + unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) bbr_init_pacing_rate_from_rtt(sk); @@ -279,7 +280,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk) /* Sort of tcp_tso_autosize() but ignoring * driver provided sk_gso_max_size. */ - bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift, + bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift, GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); @@ -368,6 +369,39 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) return cwnd; } +/* With pacing at lower layers, there's often less data "in the network" than + * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), + * we often have several skbs queued in the pacing layer with a pre-scheduled + * earliest departure time (EDT). BBR adapts its pacing rate based on the + * inflight level that it estimates has already been "baked in" by previous + * departure time decisions. We calculate a rough estimate of the number of our + * packets that might be in the network at the earliest departure time for the + * next skb scheduled: + * in_network_at_edt = inflight_at_edt - (EDT - now) * bw + * If we're increasing inflight, then we want to know if the transmit of the + * EDT skb will push inflight above the target, so inflight_at_edt includes + * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, + * then estimate if inflight will sink too low just before the EDT transmit. + */ +static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 now_ns, edt_ns, interval_us; + u32 interval_delivered, inflight_at_edt; + + now_ns = tp->tcp_clock_cache; + edt_ns = max(tp->tcp_wstamp_ns, now_ns); + interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); + interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; + inflight_at_edt = inflight_now; + if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ + inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ + if (interval_delivered >= inflight_at_edt) + return 0; + return inflight_at_edt - interval_delivered; +} + /* An optimization in BBR to reduce losses: On the first round of recovery, we * follow the packet conservation principle: send P packets per P packets acked. * After that, we slow-start and send at most 2*P packets per P packets acked. @@ -459,7 +493,7 @@ static bool bbr_is_next_cycle_phase(struct sock *sk, if (bbr->pacing_gain == BBR_UNIT) return is_full_length; /* just use wall clock time */ - inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ + inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); bw = bbr_max_bw(sk); /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at @@ -487,8 +521,6 @@ static void bbr_advance_cycle_phase(struct sock *sk) bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); bbr->cycle_mstamp = tp->delivered_mstamp; - bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT : - bbr_pacing_gain[bbr->cycle_idx]; } /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ @@ -506,8 +538,6 @@ static void bbr_reset_startup_mode(struct sock *sk) struct bbr *bbr = inet_csk_ca(sk); bbr->mode = BBR_STARTUP; - bbr->pacing_gain = bbr_high_gain; - bbr->cwnd_gain = bbr_high_gain; } static void bbr_reset_probe_bw_mode(struct sock *sk) @@ -515,8 +545,6 @@ static void bbr_reset_probe_bw_mode(struct sock *sk) struct bbr *bbr = inet_csk_ca(sk); bbr->mode = BBR_PROBE_BW; - bbr->pacing_gain = BBR_UNIT; - bbr->cwnd_gain = bbr_cwnd_gain; bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ } @@ -734,13 +762,11 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { bbr->mode = BBR_DRAIN; /* drain queue we created */ - bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ - bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ tcp_sk(sk)->snd_ssthresh = bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT); } /* fall through to check if in-flight is already small: */ if (bbr->mode == BBR_DRAIN && - tcp_packets_in_flight(tcp_sk(sk)) <= + bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ } @@ -797,8 +823,6 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) if (bbr_probe_rtt_mode_ms > 0 && filter_expired && !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ - bbr->pacing_gain = BBR_UNIT; - bbr->cwnd_gain = BBR_UNIT; bbr_save_cwnd(sk); /* note cwnd so we can restore it */ bbr->probe_rtt_done_stamp = 0; } @@ -826,6 +850,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) bbr->idle_restart = 0; } +static void bbr_update_gains(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + switch (bbr->mode) { + case BBR_STARTUP: + bbr->pacing_gain = bbr_high_gain; + bbr->cwnd_gain = bbr_high_gain; + break; + case BBR_DRAIN: + bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ + bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ + break; + case BBR_PROBE_BW: + bbr->pacing_gain = (bbr->lt_use_bw ? + BBR_UNIT : + bbr_pacing_gain[bbr->cycle_idx]); + bbr->cwnd_gain = bbr_cwnd_gain; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = BBR_UNIT; + break; + default: + WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); + break; + } +} + static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) { bbr_update_bw(sk, rs); @@ -833,6 +886,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) bbr_check_full_bw_reached(sk, rs); bbr_check_drain(sk, rs); bbr_update_min_rtt(sk, rs); + bbr_update_gains(sk); } static void bbr_main(struct sock *sk, const struct rate_sample *rs) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c new file mode 100644 index 000000000000..3b45fe530f91 --- /dev/null +++ b/net/ipv4/tcp_bpf.c @@ -0,0 +1,669 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#include +#include +#include +#include +#include + +#include + +static bool tcp_bpf_stream_read(const struct sock *sk) +{ + struct sk_psock *psock; + bool empty = true; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) + empty = list_empty(&psock->ingress_msg); + rcu_read_unlock(); + return !empty; +} + +static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, + int flags, long timeo, int *err) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = sk_wait_event(sk, &timeo, + !list_empty(&psock->ingress_msg) || + !skb_queue_empty(&sk->sk_receive_queue), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} + +int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, + struct msghdr *msg, int len, int flags) +{ + struct iov_iter *iter = &msg->msg_iter; + int peek = flags & MSG_PEEK; + int i, ret, copied = 0; + struct sk_msg *msg_rx; + + msg_rx = list_first_entry_or_null(&psock->ingress_msg, + struct sk_msg, list); + + while (copied != len) { + struct scatterlist *sge; + + if (unlikely(!msg_rx)) + break; + + i = msg_rx->sg.start; + do { + struct page *page; + int copy; + + sge = sk_msg_elem(msg_rx, i); + copy = sge->length; + page = sg_page(sge); + if (copied + copy > len) + copy = len - copied; + ret = copy_page_to_iter(page, sge->offset, copy, iter); + if (ret != copy) { + msg_rx->sg.start = i; + return -EFAULT; + } + + copied += copy; + if (likely(!peek)) { + sge->offset += copy; + sge->length -= copy; + sk_mem_uncharge(sk, copy); + msg_rx->sg.size -= copy; + + if (!sge->length) { + sk_msg_iter_var_next(i); + if (!msg_rx->skb) + put_page(page); + } + } else { + sk_msg_iter_var_next(i); + } + + if (copied == len) + break; + } while (i != msg_rx->sg.end); + + if (unlikely(peek)) { + msg_rx = list_next_entry(msg_rx, list); + continue; + } + + msg_rx->sg.start = i; + if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { + list_del(&msg_rx->list); + if (msg_rx->skb) + consume_skb(msg_rx->skb); + kfree(msg_rx); + } + msg_rx = list_first_entry_or_null(&psock->ingress_msg, + struct sk_msg, list); + } + + return copied; +} +EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg); + +int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct sk_psock *psock; + int copied, ret; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + lock_sock(sk); +msg_bytes_ready: + copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); + if (!copied) { + int data, err = 0; + long timeo; + + timeo = sock_rcvtimeo(sk, nonblock); + data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); + if (data) { + if (skb_queue_empty(&sk->sk_receive_queue)) + goto msg_bytes_ready; + release_sock(sk); + sk_psock_put(sk, psock); + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + } + if (err) { + ret = err; + goto out; + } + copied = -EAGAIN; + } + ret = copied; +out: + release_sock(sk); + sk_psock_put(sk, psock); + return ret; +} + +static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, + struct sk_msg *msg, u32 apply_bytes, int flags) +{ + bool apply = apply_bytes; + struct scatterlist *sge; + u32 size, copied = 0; + struct sk_msg *tmp; + int i, ret = 0; + + tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL); + if (unlikely(!tmp)) + return -ENOMEM; + + lock_sock(sk); + tmp->sg.start = msg->sg.start; + i = msg->sg.start; + do { + sge = sk_msg_elem(msg, i); + size = (apply && apply_bytes < sge->length) ? + apply_bytes : sge->length; + if (!sk_wmem_schedule(sk, size)) { + if (!copied) + ret = -ENOMEM; + break; + } + + sk_mem_charge(sk, size); + sk_msg_xfer(tmp, msg, i, size); + copied += size; + if (sge->length) + get_page(sk_msg_page(tmp, i)); + sk_msg_iter_var_next(i); + tmp->sg.end = i; + if (apply) { + apply_bytes -= size; + if (!apply_bytes) + break; + } + } while (i != msg->sg.end); + + if (!ret) { + msg->sg.start = i; + msg->sg.size -= apply_bytes; + sk_psock_queue_msg(psock, tmp); + sk->sk_data_ready(sk); + } else { + sk_msg_free(sk, tmp); + kfree(tmp); + } + + release_sock(sk); + return ret; +} + +static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, + int flags, bool uncharge) +{ + bool apply = apply_bytes; + struct scatterlist *sge; + struct page *page; + int size, ret = 0; + u32 off; + + while (1) { + sge = sk_msg_elem(msg, msg->sg.start); + size = (apply && apply_bytes < sge->length) ? + apply_bytes : sge->length; + off = sge->offset; + page = sg_page(sge); + + tcp_rate_check_app_limited(sk); +retry: + ret = do_tcp_sendpages(sk, page, off, size, flags); + if (ret <= 0) + return ret; + if (apply) + apply_bytes -= ret; + msg->sg.size -= ret; + sge->offset += ret; + sge->length -= ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + if (ret != size) { + size -= ret; + off += ret; + goto retry; + } + if (!sge->length) { + put_page(page); + sk_msg_iter_next(msg, start); + sg_init_table(sge, 1); + if (msg->sg.start == msg->sg.end) + break; + } + if (apply && !apply_bytes) + break; + } + + return 0; +} + +static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, + u32 apply_bytes, int flags, bool uncharge) +{ + int ret; + + lock_sock(sk); + ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge); + release_sock(sk); + return ret; +} + +int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, + u32 bytes, int flags) +{ + bool ingress = sk_msg_to_ingress(msg); + struct sk_psock *psock = sk_psock_get(sk); + int ret; + + if (unlikely(!psock)) { + sk_msg_free(sk, msg); + return 0; + } + ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) : + tcp_bpf_push_locked(sk, msg, bytes, flags, false); + sk_psock_put(sk, psock); + return ret; +} +EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); + +static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, + struct sk_msg *msg, int *copied, int flags) +{ + bool cork = false, enospc = msg->sg.start == msg->sg.end; + struct sock *sk_redir; + u32 tosend; + int ret; + +more_data: + if (psock->eval == __SK_NONE) + psock->eval = sk_psock_msg_verdict(sk, psock, msg); + + if (msg->cork_bytes && + msg->cork_bytes > msg->sg.size && !enospc) { + psock->cork_bytes = msg->cork_bytes - msg->sg.size; + if (!psock->cork) { + psock->cork = kzalloc(sizeof(*psock->cork), + GFP_ATOMIC | __GFP_NOWARN); + if (!psock->cork) + return -ENOMEM; + } + memcpy(psock->cork, msg, sizeof(*msg)); + return 0; + } + + tosend = msg->sg.size; + if (psock->apply_bytes && psock->apply_bytes < tosend) + tosend = psock->apply_bytes; + + switch (psock->eval) { + case __SK_PASS: + ret = tcp_bpf_push(sk, msg, tosend, flags, true); + if (unlikely(ret)) { + *copied -= sk_msg_free(sk, msg); + break; + } + sk_msg_apply_bytes(psock, tosend); + break; + case __SK_REDIRECT: + sk_redir = psock->sk_redir; + sk_msg_apply_bytes(psock, tosend); + if (psock->cork) { + cork = true; + psock->cork = NULL; + } + sk_msg_return(sk, msg, tosend); + release_sock(sk); + ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + lock_sock(sk); + if (unlikely(ret < 0)) { + int free = sk_msg_free_nocharge(sk, msg); + + if (!cork) + *copied -= free; + } + if (cork) { + sk_msg_free(sk, msg); + kfree(msg); + msg = NULL; + ret = 0; + } + break; + case __SK_DROP: + default: + sk_msg_free_partial(sk, msg, tosend); + sk_msg_apply_bytes(psock, tosend); + *copied -= tosend; + return -EACCES; + } + + if (likely(!ret)) { + if (!psock->apply_bytes) { + psock->eval = __SK_NONE; + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + } + if (msg && + msg->sg.data[msg->sg.start].page_link && + msg->sg.data[msg->sg.start].length) + goto more_data; + } + return ret; +} + +static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + struct sk_msg tmp, *msg_tx = NULL; + int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; + int copied = 0, err = 0; + struct sk_psock *psock; + long timeo; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_sendmsg(sk, msg, size); + + lock_sock(sk); + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + while (msg_data_left(msg)) { + bool enospc = false; + u32 copy, osize; + + if (sk->sk_err) { + err = -sk->sk_err; + goto out_err; + } + + copy = msg_data_left(msg); + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + if (psock->cork) { + msg_tx = psock->cork; + } else { + msg_tx = &tmp; + sk_msg_init(msg_tx); + } + + osize = msg_tx->sg.size; + err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1); + if (err) { + if (err != -ENOSPC) + goto wait_for_memory; + enospc = true; + copy = msg_tx->sg.size - osize; + } + + err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, + copy); + if (err < 0) { + sk_msg_trim(sk, msg_tx, osize); + goto out_err; + } + + copied += copy; + if (psock->cork_bytes) { + if (size > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= size; + if (psock->cork_bytes && !enospc) + goto out_err; + /* All cork bytes are accounted, rerun the prog. */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags); + if (unlikely(err < 0)) + goto out_err; + continue; +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + err = sk_stream_wait_memory(sk, &timeo); + if (err) { + if (msg_tx && msg_tx != psock->cork) + sk_msg_free(sk, msg_tx); + goto out_err; + } + } +out_err: + if (err < 0) + err = sk_stream_error(sk, msg->msg_flags, err); + release_sock(sk); + sk_psock_put(sk, psock); + return copied ? copied : err; +} + +static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + struct sk_msg tmp, *msg = NULL; + int err = 0, copied = 0; + struct sk_psock *psock; + bool enospc = false; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_sendpage(sk, page, offset, size, flags); + + lock_sock(sk); + if (psock->cork) { + msg = psock->cork; + } else { + msg = &tmp; + sk_msg_init(msg); + } + + /* Catch case where ring is full and sendpage is stalled. */ + if (unlikely(sk_msg_full(msg))) + goto out_err; + + sk_msg_page_add(msg, page, size, offset); + sk_mem_charge(sk, size); + copied = size; + if (sk_msg_full(msg)) + enospc = true; + if (psock->cork_bytes) { + if (size > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= size; + if (psock->cork_bytes && !enospc) + goto out_err; + /* All cork bytes are accounted, rerun the prog. */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags); +out_err: + release_sock(sk); + sk_psock_put(sk, psock); + return copied ? copied : err; +} + +static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_link *link; + + sk_psock_cork_free(psock); + __sk_psock_purge_ingress_msg(psock); + while ((link = sk_psock_link_pop(psock))) { + sk_psock_unlink(sk, link); + sk_psock_free_link(link); + } +} + +static void tcp_bpf_unhash(struct sock *sk) +{ + void (*saved_unhash)(struct sock *sk); + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + if (sk->sk_prot->unhash) + sk->sk_prot->unhash(sk); + return; + } + + saved_unhash = psock->saved_unhash; + tcp_bpf_remove(sk, psock); + rcu_read_unlock(); + saved_unhash(sk); +} + +static void tcp_bpf_close(struct sock *sk, long timeout) +{ + void (*saved_close)(struct sock *sk, long timeout); + struct sk_psock *psock; + + lock_sock(sk); + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + release_sock(sk); + return sk->sk_prot->close(sk, timeout); + } + + saved_close = psock->saved_close; + tcp_bpf_remove(sk, psock); + rcu_read_unlock(); + release_sock(sk); + saved_close(sk, timeout); +} + +enum { + TCP_BPF_IPV4, + TCP_BPF_IPV6, + TCP_BPF_NUM_PROTS, +}; + +enum { + TCP_BPF_BASE, + TCP_BPF_TX, + TCP_BPF_NUM_CFGS, +}; + +static struct proto *tcpv6_prot_saved __read_mostly; +static DEFINE_SPINLOCK(tcpv6_prot_lock); +static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS]; + +static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], + struct proto *base) +{ + prot[TCP_BPF_BASE] = *base; + prot[TCP_BPF_BASE].unhash = tcp_bpf_unhash; + prot[TCP_BPF_BASE].close = tcp_bpf_close; + prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; + prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read; + + prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; + prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; + prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; +} + +static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops) +{ + if (sk->sk_family == AF_INET6 && + unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { + spin_lock_bh(&tcpv6_prot_lock); + if (likely(ops != tcpv6_prot_saved)) { + tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops); + smp_store_release(&tcpv6_prot_saved, ops); + } + spin_unlock_bh(&tcpv6_prot_lock); + } +} + +static int __init tcp_bpf_v4_build_proto(void) +{ + tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot); + return 0; +} +core_initcall(tcp_bpf_v4_build_proto); + +static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock) +{ + int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; + int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; + + sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]); +} + +static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock) +{ + int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; + int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; + + /* Reinit occurs when program types change e.g. TCP_BPF_TX is removed + * or added requiring sk_prot hook updates. We keep original saved + * hooks in this case. + */ + sk->sk_prot = &tcp_bpf_prots[family][config]; +} + +static int tcp_bpf_assert_proto_ops(struct proto *ops) +{ + /* In order to avoid retpoline, we make assumptions when we call + * into ops if e.g. a psock is not present. Make sure they are + * indeed valid assumptions. + */ + return ops->recvmsg == tcp_recvmsg && + ops->sendmsg == tcp_sendmsg && + ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; +} + +void tcp_bpf_reinit(struct sock *sk) +{ + struct sk_psock *psock; + + sock_owned_by_me(sk); + + rcu_read_lock(); + psock = sk_psock(sk); + tcp_bpf_reinit_sk_prot(sk, psock); + rcu_read_unlock(); +} + +int tcp_bpf_init(struct sock *sk) +{ + struct proto *ops = READ_ONCE(sk->sk_prot); + struct sk_psock *psock; + + sock_owned_by_me(sk); + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock || psock->sk_proto || + tcp_bpf_assert_proto_ops(ops))) { + rcu_read_unlock(); + return -EINVAL; + } + tcp_bpf_check_v6_needs_rebuild(sk, ops); + tcp_bpf_update_sk_prot(sk, psock); + rcu_read_unlock(); + return 0; +} diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 06fbe102a425..37eebd910396 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -146,7 +146,7 @@ static void tcp_cdg_hystart_update(struct sock *sk) return; if (hystart_detect & HYSTART_ACK_TRAIN) { - u32 now_us = div_u64(local_clock(), NSEC_PER_USEC); + u32 now_us = tp->tcp_mstamp; if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) { ca->last_ack = now_us; diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index ca61e2a659e7..cd4814f7e962 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -44,6 +44,7 @@ #include #include #include +#include "tcp_dctcp.h" #define DCTCP_MAX_ALPHA 1024U @@ -118,54 +119,6 @@ static u32 dctcp_ssthresh(struct sock *sk) return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); } -/* Minimal DCTP CE state machine: - * - * S: 0 <- last pkt was non-CE - * 1 <- last pkt was CE - */ - -static void dctcp_ce_state_0_to_1(struct sock *sk) -{ - struct dctcp *ca = inet_csk_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - - if (!ca->ce_state) { - /* State has changed from CE=0 to CE=1, force an immediate - * ACK to reflect the new CE state. If an ACK was delayed, - * send that first to reflect the prior CE state. - */ - if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) - __tcp_send_ack(sk, ca->prior_rcv_nxt); - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; - } - - ca->prior_rcv_nxt = tp->rcv_nxt; - ca->ce_state = 1; - - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; -} - -static void dctcp_ce_state_1_to_0(struct sock *sk) -{ - struct dctcp *ca = inet_csk_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - - if (ca->ce_state) { - /* State has changed from CE=1 to CE=0, force an immediate - * ACK to reflect the new CE state. If an ACK was delayed, - * send that first to reflect the prior CE state. - */ - if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) - __tcp_send_ack(sk, ca->prior_rcv_nxt); - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; - } - - ca->prior_rcv_nxt = tp->rcv_nxt; - ca->ce_state = 0; - - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; -} - static void dctcp_update_alpha(struct sock *sk, u32 flags) { const struct tcp_sock *tp = tcp_sk(sk); @@ -230,12 +183,12 @@ static void dctcp_state(struct sock *sk, u8 new_state) static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) { + struct dctcp *ca = inet_csk_ca(sk); + switch (ev) { case CA_EVENT_ECN_IS_CE: - dctcp_ce_state_0_to_1(sk); - break; case CA_EVENT_ECN_NO_CE: - dctcp_ce_state_1_to_0(sk); + dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state); break; default: /* Don't care for the rest. */ diff --git a/net/ipv4/tcp_dctcp.h b/net/ipv4/tcp_dctcp.h new file mode 100644 index 000000000000..d69a77cbd0c7 --- /dev/null +++ b/net/ipv4/tcp_dctcp.h @@ -0,0 +1,40 @@ +#ifndef _TCP_DCTCP_H +#define _TCP_DCTCP_H + +static inline void dctcp_ece_ack_cwr(struct sock *sk, u32 ce_state) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (ce_state == 1) + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + else + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +/* Minimal DCTP CE state machine: + * + * S: 0 <- last pkt was non-CE + * 1 <- last pkt was CE + */ +static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, + u32 *prior_rcv_nxt, u32 *ce_state) +{ + u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0; + + if (*ce_state != new_ce_state) { + /* CE state has changed, force an immediate ACK to + * reflect the new CE state. If an ACK was delayed, + * send that first to reflect the prior CE state. + */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { + dctcp_ece_ack_cwr(sk, *ce_state); + __tcp_send_ack(sk, *prior_rcv_nxt); + } + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + } + *prior_rcv_nxt = tcp_sk(sk)->rcv_nxt; + *ce_state = new_ce_state; + dctcp_ece_ack_cwr(sk, new_ce_state); +} + +#endif diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4cf2f7bb2802..2868ef28ce52 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -426,26 +426,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) } } -/* 3. Tuning rcvbuf, when connection enters established state. */ -static void tcp_fixup_rcvbuf(struct sock *sk) -{ - u32 mss = tcp_sk(sk)->advmss; - int rcvmem; - - rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) * - tcp_default_init_rwnd(mss); - - /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency - * Allow enough cushion so that sender is not limited by our window - */ - if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) - rcvmem <<= 2; - - if (sk->sk_rcvbuf < rcvmem) - sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); -} - -/* 4. Try to fixup all. It is made immediately after connection enters +/* 3. Try to fixup all. It is made immediately after connection enters * established state. */ void tcp_init_buffer_space(struct sock *sk) @@ -454,12 +435,10 @@ void tcp_init_buffer_space(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); int maxwin; - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) - tcp_fixup_rcvbuf(sk); if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) tcp_sndbuf_expand(sk); - tp->rcvq_space.space = tp->rcv_wnd; + tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss); tcp_mstamp_refresh(tp); tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; @@ -485,7 +464,7 @@ void tcp_init_buffer_space(struct sock *sk) tp->snd_cwnd_stamp = tcp_jiffies32; } -/* 5. Recalculate window clamp after socket hit its memory bounds. */ +/* 4. Recalculate window clamp after socket hit its memory bounds. */ static void tcp_clamp_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -1305,7 +1284,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, - skb->skb_mstamp); + tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); if (skb == tp->lost_skb_hint) @@ -1580,7 +1559,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), - skb->skb_mstamp); + tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) list_del_init(&skb->tcp_tsorted_anchor); @@ -3000,8 +2979,8 @@ void tcp_rearm_rto(struct sock *sk) */ rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, - TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, + TCP_RTO_MAX, tcp_rtx_queue_head(sk)); } } @@ -3103,7 +3082,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; } else if (!(sacked & TCPCB_SACKED_ACKED)) { - last_ackt = skb->skb_mstamp; + last_ackt = tcp_skb_timestamp_us(skb); WARN_ON_ONCE(last_ackt == 0); if (!first_ackt) first_ackt = last_ackt; @@ -3121,7 +3100,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, tp->delivered += acked_pcount; if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, - skb->skb_mstamp); + tcp_skb_timestamp_us(skb)); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3215,7 +3194,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } } else if (skb && rtt_update && sack_rtt_us >= 0 && - sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { + sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, + tcp_skb_timestamp_us(skb))) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. @@ -3275,8 +3255,8 @@ static void tcp_ack_probe(struct sock *sk) } else { unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - when, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + when, TCP_RTO_MAX, NULL); } } @@ -4199,6 +4179,17 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) tcp_sack_extend(tp->duplicate_sack, seq, end_seq); } +static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) +{ + /* When the ACK path fails or drops most ACKs, the sender would + * timeout and spuriously retransmit the same segment repeatedly. + * The receiver remembers and reflects via DSACKs. Leverage the + * DSACK state and change the txhash to re-route speculatively. + */ + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) + sk_rethink_txhash(sk); +} + static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -4211,6 +4202,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_spurious_retrans(sk, skb); if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) end_seq = tp->rcv_nxt; tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq); @@ -4755,6 +4747,7 @@ queue_and_out: } if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + tcp_rcv_spurious_retrans(sk, skb); /* A retransmit, 2nd most common case. Force an immediate ack. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); @@ -6009,11 +6002,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (th->fin) goto discard; /* It is possible that we process SYN packets from backlog, - * so we need to make sure to disable BH right there. + * so we need to make sure to disable BH and RCU right there. */ + rcu_read_lock(); local_bh_disable(); acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; local_bh_enable(); + rcu_read_unlock(); if (!acceptable) return 1; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 44c09eddbb78..de47038afdf0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -544,7 +544,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) BUG_ON(!skb); tcp_mstamp_refresh(tp); - delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp); + delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); @@ -943,9 +943,11 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); + rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - ireq_opt_deref(ireq)); + rcu_dereference(ireq->ireq_opt)); + rcu_read_unlock(); err = net_xmit_eval(err); } @@ -2549,7 +2551,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_tw_reuse = 2; cnt = tcp_hashinfo.ehash_mask + 1; - net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2; + net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 597dbd749f05..9c34b97d365d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -45,6 +45,21 @@ #include +/* Refresh clocks of a TCP socket, + * ensuring monotically increasing values. + */ +void tcp_mstamp_refresh(struct tcp_sock *tp) +{ + u64 val = tcp_clock_ns(); + + if (val > tp->tcp_clock_cache) + tp->tcp_clock_cache = val; + + val = div_u64(val, NSEC_PER_USEC); + if (val > tp->tcp_mstamp) + tp->tcp_mstamp = val; +} + static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); @@ -179,21 +194,6 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } - -u32 tcp_default_init_rwnd(u32 mss) -{ - /* Initial receive window should be twice of TCP_INIT_CWND to - * enable proper sending of new unsent data during fast recovery - * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a - * limit when mss is larger than 1460. - */ - u32 init_rwnd = TCP_INIT_CWND * 2; - - if (mss > 1460) - init_rwnd = max((1460 * init_rwnd) / mss, 2U); - return init_rwnd; -} - /* Determine a window scaling and initial window to offer. * Based on the assumption that the given amount of space * will be offered. Store the results in the tp structure. @@ -228,7 +228,10 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else - (*rcv_wnd) = space; + (*rcv_wnd) = min_t(u32, space, U16_MAX); + + if (init_rcv_wnd) + *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); (*rcv_wscale) = 0; if (wscale_ok) { @@ -241,11 +244,6 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, (*rcv_wscale)++; } } - - if (!init_rcv_wnd) /* Use default unless specified otherwise */ - init_rcv_wnd = tcp_default_init_rwnd(mss); - *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); - /* Set the clamp no higher than max representable value */ (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); } @@ -977,28 +975,28 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) +static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, + u64 prior_wstamp) { - u64 len_ns; - u32 rate; + struct tcp_sock *tp = tcp_sk(sk); - if (!tcp_needs_internal_pacing(sk)) - return; - rate = sk->sk_pacing_rate; - if (!rate || rate == ~0U) - return; + skb->skb_mstamp_ns = tp->tcp_wstamp_ns; + if (sk->sk_pacing_status != SK_PACING_NONE) { + unsigned long rate = sk->sk_pacing_rate; - len_ns = (u64)skb->len * NSEC_PER_SEC; - do_div(len_ns, rate); - hrtimer_start(&tcp_sk(sk)->pacing_timer, - ktime_add_ns(ktime_get(), len_ns), - HRTIMER_MODE_ABS_PINNED_SOFT); - sock_hold(sk); -} + /* Original sch_fq does not pace first 10 MSS + * Note that tp->data_segs_out overflows after 2^32 packets, + * this is a minor annoyance. + */ + if (rate != ~0UL && rate && tp->data_segs_out >= 10) { + u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate); + u64 credit = tp->tcp_wstamp_ns - prior_wstamp; -static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) -{ - skb->skb_mstamp = tp->tcp_mstamp; + /* take into account OS jitter */ + len_ns -= min_t(u64, len_ns / 2, credit); + tp->tcp_wstamp_ns += len_ns; + } + } list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } @@ -1025,6 +1023,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, struct sk_buff *oskb = NULL; struct tcp_md5sig_key *md5; struct tcphdr *th; + u64 prior_wstamp; int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); @@ -1045,7 +1044,11 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, if (unlikely(!skb)) return -ENOBUFS; } - skb->skb_mstamp = tp->tcp_mstamp; + + prior_wstamp = tp->tcp_wstamp_ns; + tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); + + skb->skb_mstamp_ns = tp->tcp_wstamp_ns; inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); @@ -1137,7 +1140,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); tp->bytes_sent += skb->len - tcp_header_size; - tcp_internal_pacing(sk, skb); } if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) @@ -1149,8 +1151,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); - /* Our usage of tstamp should remain private */ - skb->tstamp = 0; + /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */ /* Cleanup our debris for IP stacks */ memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), @@ -1163,7 +1164,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, err = net_xmit_eval(err); } if (!err && oskb) { - tcp_update_skb_after_send(tp, oskb); + tcp_update_skb_after_send(sk, oskb, prior_wstamp); tcp_rate_skb_sent(sk, oskb); } return err; @@ -1698,8 +1699,9 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, { u32 bytes, segs; - bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift, - sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); + bytes = min_t(unsigned long, + sk->sk_pacing_rate >> sk->sk_pacing_shift, + sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); /* Goal is to send at least one packet per ms, * not one big TSO packet every 100 ms. @@ -1966,7 +1968,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, head = tcp_rtx_queue_head(sk); if (!head) goto send_now; - age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); + age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head)); /* If next ACK is likely to come too late (half srtt), do not defer */ if (age < (tp->srtt_us >> 4)) goto send_now; @@ -2172,10 +2174,23 @@ static int tcp_mtu_probe(struct sock *sk) return -1; } -static bool tcp_pacing_check(const struct sock *sk) +static bool tcp_pacing_check(struct sock *sk) { - return tcp_needs_internal_pacing(sk) && - hrtimer_is_queued(&tcp_sk(sk)->pacing_timer); + struct tcp_sock *tp = tcp_sk(sk); + + if (!tcp_needs_internal_pacing(sk)) + return false; + + if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache) + return false; + + if (!hrtimer_is_queued(&tp->pacing_timer)) { + hrtimer_start(&tp->pacing_timer, + ns_to_ktime(tp->tcp_wstamp_ns), + HRTIMER_MODE_ABS_PINNED_SOFT); + sock_hold(sk); + } + return true; } /* TCP Small Queues : @@ -2192,10 +2207,12 @@ static bool tcp_pacing_check(const struct sock *sk) static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, unsigned int factor) { - unsigned int limit; + unsigned long limit; - limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift); - limit = min_t(u32, limit, + limit = max_t(unsigned long, + 2 * skb->truesize, + sk->sk_pacing_rate >> sk->sk_pacing_shift); + limit = min_t(unsigned long, limit, sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; @@ -2304,18 +2321,19 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, while ((skb = tcp_send_head(sk))) { unsigned int limit; + if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { + /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ + skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache; + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + goto repair; /* Skip network transmission */ + } + if (tcp_pacing_check(sk)) break; tso_segs = tcp_init_tso_segs(skb, mss_now); BUG_ON(!tso_segs); - if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { - /* "skb_mstamp" is used as a start point for the retransmit timer */ - tcp_update_skb_after_send(tp, skb); - goto repair; /* Skip network transmission */ - } - cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) { if (push_one == 2) @@ -2437,8 +2455,8 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, - TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, + TCP_RTO_MAX, NULL); return true; } @@ -2887,7 +2905,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) } tcp_skb_tsorted_restore(skb); if (!err) { - tcp_update_skb_after_send(tp, skb); + tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns); tcp_rate_skb_sent(sk, skb); } } else { @@ -3002,9 +3020,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX, + skb); } } @@ -3205,10 +3224,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - skb->skb_mstamp = cookie_init_timestamp(req); + skb->skb_mstamp_ns = cookie_init_timestamp(req); else #endif - skb->skb_mstamp = tcp_clock_us(); + skb->skb_mstamp_ns = tcp_clock_ns(); #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); @@ -3424,7 +3443,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - syn->skb_mstamp = syn_data->skb_mstamp; + syn->skb_mstamp_ns = syn_data->skb_mstamp_ns; /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) @@ -3734,9 +3753,10 @@ void tcp_send_probe0(struct sock *sk) icsk->icsk_probes_out = 1; probe_max = TCP_RESOURCE_PROBE_INTERVAL; } - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - tcp_probe0_when(sk, probe_max), - TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + tcp_probe0_when(sk, probe_max), + TCP_RTO_MAX, + NULL); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 4dff40dad4dc..baed2186c7c6 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -55,8 +55,10 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) * bandwidth estimate. */ if (!tp->packets_out) { - tp->first_tx_mstamp = skb->skb_mstamp; - tp->delivered_mstamp = skb->skb_mstamp; + u64 tstamp_us = tcp_skb_timestamp_us(skb); + + tp->first_tx_mstamp = tstamp_us; + tp->delivered_mstamp = tstamp_us; } TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; @@ -88,13 +90,12 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tcp_skb_timestamp_us(skb); /* Find the duration of the "send phase" of this window: */ - rs->interval_us = tcp_stamp_us_delta( - skb->skb_mstamp, - scb->tx.first_tx_mstamp); + rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, + scb->tx.first_tx_mstamp); - /* Record send time of most recently ACKed packet: */ - tp->first_tx_mstamp = skb->skb_mstamp; } /* Mark off the skb delivered once it's sacked to avoid being * used again when it's cumulatively acked. For acked packets diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index c81aadff769b..fdb715bdd2d1 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -50,7 +50,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk) s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd) { return tp->rack.rtt_us + reo_wnd - - tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); + tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb)); } /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): @@ -91,7 +91,8 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) !(scb->sacked & TCPCB_SACKED_RETRANS)) continue; - if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, + if (!tcp_rack_sent_after(tp->rack.mstamp, + tcp_skb_timestamp_us(skb), tp->rack.end_seq, scb->end_seq)) break; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 7fdf222a0bdf..676020663ce8 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -360,7 +360,7 @@ static void tcp_probe_timer(struct sock *sk) */ start_ts = tcp_skb_timestamp(skb); if (!start_ts) - skb->skb_mstamp = tp->tcp_mstamp; + skb->skb_mstamp_ns = tp->tcp_clock_cache; else if (icsk->icsk_user_timeout && (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) goto abort; diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index a5995bb2eaca..95df7f7f6328 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -6,7 +6,7 @@ * */ -#include +#include #include #include #include @@ -29,18 +29,6 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name) return NULL; } -static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp) -{ - struct tcp_ulp_ops *e; - - list_for_each_entry_rcu(e, &tcp_ulp_list, list) { - if (e->uid == ulp) - return e; - } - - return NULL; -} - static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) { const struct tcp_ulp_ops *ulp = NULL; @@ -63,18 +51,6 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) return ulp; } -static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid) -{ - const struct tcp_ulp_ops *ulp; - - rcu_read_lock(); - ulp = tcp_ulp_find_id(uid); - if (!ulp || !try_module_get(ulp->owner)) - ulp = NULL; - rcu_read_unlock(); - return ulp; -} - /* Attach new upper layer protocol to the list * of available protocols. */ @@ -123,6 +99,10 @@ void tcp_cleanup_ulp(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + /* No sock_owned_by_me() check here as at the time the + * stack calls this function, the socket is dead and + * about to be destroyed. + */ if (!icsk->icsk_ulp_ops) return; @@ -133,54 +113,35 @@ void tcp_cleanup_ulp(struct sock *sk) icsk->icsk_ulp_ops = NULL; } -/* Change upper layer protocol for socket */ -int tcp_set_ulp(struct sock *sk, const char *name) +static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops) { struct inet_connection_sock *icsk = inet_csk(sk); - const struct tcp_ulp_ops *ulp_ops; - int err = 0; + int err; + err = -EEXIST; if (icsk->icsk_ulp_ops) - return -EEXIST; - - ulp_ops = __tcp_ulp_find_autoload(name); - if (!ulp_ops) - return -ENOENT; - - if (!ulp_ops->user_visible) { - module_put(ulp_ops->owner); - return -ENOENT; - } + goto out_err; err = ulp_ops->init(sk); - if (err) { - module_put(ulp_ops->owner); - return err; - } + if (err) + goto out_err; icsk->icsk_ulp_ops = ulp_ops; return 0; +out_err: + module_put(ulp_ops->owner); + return err; } -int tcp_set_ulp_id(struct sock *sk, int ulp) +int tcp_set_ulp(struct sock *sk, const char *name) { - struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_ulp_ops *ulp_ops; - int err; - if (icsk->icsk_ulp_ops) - return -EEXIST; + sock_owned_by_me(sk); - ulp_ops = __tcp_ulp_lookup(ulp); + ulp_ops = __tcp_ulp_find_autoload(name); if (!ulp_ops) return -ENOENT; - err = ulp_ops->init(sk); - if (err) { - module_put(ulp_ops->owner); - return err; - } - - icsk->icsk_ulp_ops = ulp_ops; - return 0; + return __tcp_set_ulp(sk, ulp_ops); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7d69dd6fa7e8..1976fddb9e00 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -81,7 +81,7 @@ #include #include -#include +#include #include #include #include @@ -609,8 +609,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) struct net *net = dev_net(skb->dev); sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, - iph->saddr, uh->source, skb->dev->ifindex, 0, - udptable, NULL); + iph->saddr, uh->source, skb->dev->ifindex, + inet_sdif(skb), udptable, NULL); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; /* No socket for error */ @@ -1042,7 +1042,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; @@ -1627,7 +1627,7 @@ busy_check: *err = error; return NULL; } -EXPORT_SYMBOL_GPL(__skb_recv_udp); +EXPORT_SYMBOL(__skb_recv_udp); /* * This should be easy, if there is something there we @@ -1889,7 +1889,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return 0; } -static DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); +DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); void udp_encap_enable(void) { static_branch_enable(&udp_encap_needed_key); @@ -2120,8 +2120,24 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, /* Note, we are only interested in != 0 or == 0, thus the * force to int. */ - return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, - inet_compute_pseudo); + err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, + inet_compute_pseudo); + if (err) + return err; + + if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) { + /* If SW calculated the value, we know it's bad */ + if (skb->csum_complete_sw) + return 1; + + /* HW says the value is bad. Let's validate that. + * skb->csum is no longer the full packet checksum, + * so don't treat it as such. + */ + skb_checksum_complete_unset(skb); + } + + return 0; } /* wrapper for udp_queue_rcv_skb tacking care of csum conversion and diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index d9ad986c7b2c..5cbb9be05295 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -42,6 +42,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, rcu_read_lock(); if (req->sdiag_family == AF_INET) + /* src and dst are swapped for historical reasons */ sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 0c0522b79b43..802f2bc00d69 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -405,7 +405,7 @@ static struct sk_buff *udp4_gro_receive(struct list_head *head, { struct udphdr *uh = udp_gro_udphdr(skb); - if (unlikely(!uh)) + if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key)) goto flush; /* Don't bother verifying checksum if we're going to flush anyway. */ diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index bcfc00e88756..f8de2482a529 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -67,6 +67,7 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); + skb_reset_transport_header(skb); return 0; } diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index 3d36644890bb..1ad2c2c4e250 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -46,7 +46,6 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { int ihl = skb->data - skb_transport_header(skb); - struct xfrm_offload *xo = xfrm_offload(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), @@ -54,8 +53,7 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) skb->network_header = skb->transport_header; } ip_hdr(skb)->tot_len = htons(skb->len + ihl); - if (!xo || !(xo->flags & XFRM_GRO)) - skb_reset_transport_header(skb); + skb_reset_transport_header(skb); return 0; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c63ccce6425f..63a808d5af15 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -666,6 +666,7 @@ errout: static int inet6_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); int h, s_h; int idx, s_idx; @@ -673,6 +674,21 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb, struct inet6_dev *idev; struct hlist_head *head; + if (cb->strict_check) { + struct netlink_ext_ack *extack = cb->extack; + struct netconfmsg *ncm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ncm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in netconf dump request"); + return -EINVAL; + } + } + s_h = cb->args[0]; s_idx = idx = cb->args[1]; @@ -692,7 +708,7 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb, if (inet6_netconf_fill_devconf(skb, dev->ifindex, &idev->cnf, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) { @@ -709,7 +725,7 @@ cont: if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) goto done; @@ -720,7 +736,7 @@ cont: if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) goto done; @@ -997,6 +1013,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, if (addr_type == IPV6_ADDR_ANY || addr_type & IPV6_ADDR_MULTICAST || (!(idev->dev->flags & IFF_LOOPBACK) && + !netif_is_l3_master(idev->dev) && addr_type & IPV6_ADDR_LOOPBACK)) return ERR_PTR(-EADDRNOTAVAIL); @@ -4489,6 +4506,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = { [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, [IFA_FLAGS] = { .len = sizeof(u32) }, [IFA_RT_PRIORITY] = { .len = sizeof(u32) }, + [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, }; static int @@ -4791,19 +4809,40 @@ static inline int inet6_ifaddr_msgsize(void) + nla_total_size(4) /* IFA_RT_PRIORITY */; } +enum addr_type_t { + UNICAST_ADDR, + MULTICAST_ADDR, + ANYCAST_ADDR, +}; + +struct inet6_fill_args { + u32 portid; + u32 seq; + int event; + unsigned int flags; + int netnsid; + int ifindex; + enum addr_type_t type; +}; + static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, - u32 portid, u32 seq, int event, unsigned int flags) + struct inet6_fill_args *args) { struct nlmsghdr *nlh; u32 preferred, valid; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); + nlh = nlmsg_put(skb, args->portid, args->seq, args->event, + sizeof(struct ifaddrmsg), args->flags); if (!nlh) return -EMSGSIZE; put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope), ifa->idev->dev->ifindex); + if (args->netnsid >= 0 && + nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) + goto error; + if (!((ifa->flags&IFA_F_PERMANENT) && (ifa->prefered_lft == INFINITY_LIFE_TIME))) { preferred = ifa->prefered_lft; @@ -4853,7 +4892,7 @@ error: } static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, - u32 portid, u32 seq, int event, u16 flags) + struct inet6_fill_args *args) { struct nlmsghdr *nlh; u8 scope = RT_SCOPE_UNIVERSE; @@ -4862,10 +4901,15 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); + nlh = nlmsg_put(skb, args->portid, args->seq, args->event, + sizeof(struct ifaddrmsg), args->flags); if (!nlh) return -EMSGSIZE; + if (args->netnsid >= 0 && + nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) + return -EMSGSIZE; + put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 || put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp, @@ -4879,7 +4923,7 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, } static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, - u32 portid, u32 seq, int event, unsigned int flags) + struct inet6_fill_args *args) { struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt); int ifindex = dev ? dev->ifindex : 1; @@ -4889,10 +4933,15 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); + nlh = nlmsg_put(skb, args->portid, args->seq, args->event, + sizeof(struct ifaddrmsg), args->flags); if (!nlh) return -EMSGSIZE; + if (args->netnsid >= 0 && + nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) + return -EMSGSIZE; + put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 || put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp, @@ -4905,68 +4954,56 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, return 0; } -enum addr_type_t { - UNICAST_ADDR, - MULTICAST_ADDR, - ANYCAST_ADDR, -}; - /* called with rcu_read_lock() */ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, - struct netlink_callback *cb, enum addr_type_t type, - int s_ip_idx, int *p_ip_idx) + struct netlink_callback *cb, int s_ip_idx, + struct inet6_fill_args *fillargs) { struct ifmcaddr6 *ifmca; struct ifacaddr6 *ifaca; + int ip_idx = 0; int err = 1; - int ip_idx = *p_ip_idx; read_lock_bh(&idev->lock); - switch (type) { + switch (fillargs->type) { case UNICAST_ADDR: { struct inet6_ifaddr *ifa; + fillargs->event = RTM_NEWADDR; /* unicast address incl. temp addr */ list_for_each_entry(ifa, &idev->addr_list, if_list) { - if (++ip_idx < s_ip_idx) - continue; - err = inet6_fill_ifaddr(skb, ifa, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWADDR, - NLM_F_MULTI); + if (ip_idx < s_ip_idx) + goto next; + err = inet6_fill_ifaddr(skb, ifa, fillargs); if (err < 0) break; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +next: + ip_idx++; } break; } case MULTICAST_ADDR: + fillargs->event = RTM_GETMULTICAST; + /* multicast address */ for (ifmca = idev->mc_list; ifmca; ifmca = ifmca->next, ip_idx++) { if (ip_idx < s_ip_idx) continue; - err = inet6_fill_ifmcaddr(skb, ifmca, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_GETMULTICAST, - NLM_F_MULTI); + err = inet6_fill_ifmcaddr(skb, ifmca, fillargs); if (err < 0) break; } break; case ANYCAST_ADDR: + fillargs->event = RTM_GETANYCAST; /* anycast address */ for (ifaca = idev->ac_list; ifaca; ifaca = ifaca->aca_next, ip_idx++) { if (ip_idx < s_ip_idx) continue; - err = inet6_fill_ifacaddr(skb, ifaca, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_GETANYCAST, - NLM_F_MULTI); + err = inet6_fill_ifacaddr(skb, ifaca, fillargs); if (err < 0) break; } @@ -4975,42 +5012,128 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, break; } read_unlock_bh(&idev->lock); - *p_ip_idx = ip_idx; + cb->args[2] = ip_idx; return err; } +static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, + struct inet6_fill_args *fillargs, + struct net **tgt_net, struct sock *sk, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr *tb[IFA_MAX+1]; + struct ifaddrmsg *ifm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for address dump request"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { + NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request"); + return -EINVAL; + } + + fillargs->ifindex = ifm->ifa_index; + if (fillargs->ifindex) { + cb->answer_flags |= NLM_F_DUMP_FILTERED; + fillargs->flags |= NLM_F_DUMP_FILTERED; + } + + err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv6_policy, extack); + if (err < 0) + return err; + + for (i = 0; i <= IFA_MAX; ++i) { + if (!tb[i]) + continue; + + if (i == IFA_TARGET_NETNSID) { + struct net *net; + + fillargs->netnsid = nla_get_s32(tb[i]); + net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); + if (IS_ERR(net)) { + fillargs->netnsid = -1; + NL_SET_ERR_MSG_MOD(extack, "Invalid target network namespace id"); + return PTR_ERR(net); + } + *tgt_net = net; + } else { + NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + return 0; +} + static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, enum addr_type_t type) { + const struct nlmsghdr *nlh = cb->nlh; + struct inet6_fill_args fillargs = { + .portid = NETLINK_CB(cb->skb).portid, + .seq = cb->nlh->nlmsg_seq, + .flags = NLM_F_MULTI, + .netnsid = -1, + .type = type, + }; struct net *net = sock_net(skb->sk); + struct net *tgt_net = net; + int idx, s_idx, s_ip_idx; int h, s_h; - int idx, ip_idx; - int s_idx, s_ip_idx; struct net_device *dev; struct inet6_dev *idev; struct hlist_head *head; + int err = 0; s_h = cb->args[0]; s_idx = idx = cb->args[1]; - s_ip_idx = ip_idx = cb->args[2]; + s_ip_idx = cb->args[2]; + + if (cb->strict_check) { + err = inet6_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, + skb->sk, cb); + if (err < 0) + goto put_tgt_net; + + err = 0; + if (fillargs.ifindex) { + dev = __dev_get_by_index(tgt_net, fillargs.ifindex); + if (!dev) { + err = -ENODEV; + goto put_tgt_net; + } + idev = __in6_dev_get(dev); + if (idev) { + err = in6_dump_addrs(idev, skb, cb, s_ip_idx, + &fillargs); + } + goto put_tgt_net; + } + } rcu_read_lock(); - cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ net->dev_base_seq; + cb->seq = atomic_read(&tgt_net->ipv6.dev_addr_genid) ^ tgt_net->dev_base_seq; for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; - head = &net->dev_index_head[h]; + head = &tgt_net->dev_index_head[h]; hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; if (h > s_h || idx > s_idx) s_ip_idx = 0; - ip_idx = 0; idev = __in6_dev_get(dev); if (!idev) goto cont; - if (in6_dump_addrs(idev, skb, cb, type, - s_ip_idx, &ip_idx) < 0) + if (in6_dump_addrs(idev, skb, cb, s_ip_idx, + &fillargs) < 0) goto done; cont: idx++; @@ -5020,9 +5143,11 @@ done: rcu_read_unlock(); cb->args[0] = h; cb->args[1] = idx; - cb->args[2] = ip_idx; +put_tgt_net: + if (fillargs.netnsid >= 0) + put_net(tgt_net); - return skb->len; + return err < 0 ? err : skb->len; } static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) @@ -5051,6 +5176,14 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); + struct inet6_fill_args fillargs = { + .portid = NETLINK_CB(in_skb).portid, + .seq = nlh->nlmsg_seq, + .event = RTM_NEWADDR, + .flags = 0, + .netnsid = -1, + }; + struct net *tgt_net = net; struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; struct in6_addr *addr = NULL, *peer; @@ -5064,15 +5197,24 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (tb[IFA_TARGET_NETNSID]) { + fillargs.netnsid = nla_get_s32(tb[IFA_TARGET_NETNSID]); + + tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(in_skb).sk, + fillargs.netnsid); + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); + } + addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer); if (!addr) return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifa_index) - dev = dev_get_by_index(net, ifm->ifa_index); + dev = dev_get_by_index(tgt_net, ifm->ifa_index); - ifa = ipv6_get_ifaddr(net, addr, dev, 1); + ifa = ipv6_get_ifaddr(tgt_net, addr, dev, 1); if (!ifa) { err = -EADDRNOTAVAIL; goto errout; @@ -5084,20 +5226,22 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout_ifa; } - err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, RTM_NEWADDR, 0); + err = inet6_fill_ifaddr(skb, ifa, &fillargs); if (err < 0) { /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout_ifa; } - err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); + err = rtnl_unicast(skb, tgt_net, NETLINK_CB(in_skb).portid); errout_ifa: in6_ifa_put(ifa); errout: if (dev) dev_put(dev); + if (fillargs.netnsid >= 0) + put_net(tgt_net); + return err; } @@ -5105,13 +5249,20 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) { struct sk_buff *skb; struct net *net = dev_net(ifa->idev->dev); + struct inet6_fill_args fillargs = { + .portid = 0, + .seq = 0, + .event = event, + .flags = 0, + .netnsid = -1, + }; int err = -ENOBUFS; skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); if (!skb) goto errout; - err = inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0); + err = inet6_fill_ifaddr(skb, ifa, &fillargs); if (err < 0) { /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */ WARN_ON(err == -EMSGSIZE); @@ -5527,6 +5678,31 @@ nla_put_failure: return -EMSGSIZE; } +static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct ifinfomsg *ifm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for link dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ifm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid data after header"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change || ifm->ifi_index) { + NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for dump request"); + return -EINVAL; + } + + return 0; +} + static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); @@ -5536,6 +5712,16 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct inet6_dev *idev; struct hlist_head *head; + /* only requests using strict checking can pass data to + * influence the dump + */ + if (cb->strict_check) { + int err = inet6_valid_dump_ifinfo(cb->nlh, cb->extack); + + if (err < 0) + return err; + } + s_h = cb->args[0]; s_idx = cb->args[1]; diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 1d6ced37ad71..0d1ee82ee55b 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -458,20 +458,52 @@ static int ip6addrlbl_fill(struct sk_buff *skb, return 0; } +static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct ifaddrlblmsg *ifal; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for address label dump request"); + return -EINVAL; + } + + ifal = nlmsg_data(nlh); + if (ifal->__ifal_reserved || ifal->ifal_prefixlen || + ifal->ifal_flags || ifal->ifal_index || ifal->ifal_seq) { + NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address label dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ifal))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump requewst"); + return -EINVAL; + } + + return 0; +} + static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct ip6addrlbl_entry *p; int idx = 0, s_idx = cb->args[0]; int err; + if (cb->strict_check) { + err = ip6addrlbl_valid_dump_req(nlh, cb->extack); + if (err < 0) + return err; + } + rcu_read_lock(); hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { if (idx >= s_idx) { err = ip6addrlbl_fill(skb, p, net->ipv6.ip6addrlbl_table.seq, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWADDRLABEL, NLM_F_MULTI); if (err < 0) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 9a4261e50272..f0cd291034f0 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -209,6 +209,7 @@ lookup_protocol: np->hop_limit = -1; np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; + np->mc_all = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; np->repflow = net->ipv6.sysctl.flowlabel_reflect; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; @@ -467,12 +468,10 @@ void inet6_destroy_sock(struct sock *sk) /* Release rx options */ skb = xchg(&np->pktoptions, NULL); - if (skb) - kfree_skb(skb); + kfree_skb(skb); skb = xchg(&np->rxpmtu, NULL); - if (skb) - kfree_skb(skb); + kfree_skb(skb); /* Free flowlabels */ fl6_free_socklist(sk); @@ -902,6 +901,7 @@ static const struct ipv6_stub ipv6_stub_impl = { static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { .inet6_bind = __inet6_bind, + .udp6_lib_lookup = __udp6_lib_lookup, }; static int __init inet6_init(void) @@ -1001,6 +1001,9 @@ static int __init inet6_init(void) err = ip6_flowlabel_init(); if (err) goto ip6_flowlabel_fail; + err = ipv6_anycast_init(); + if (err) + goto ipv6_anycast_fail; err = addrconf_init(); if (err) goto addrconf_fail; @@ -1091,6 +1094,8 @@ ipv6_frag_fail: ipv6_exthdrs_fail: addrconf_cleanup(); addrconf_fail: + ipv6_anycast_cleanup(); +ipv6_anycast_fail: ip6_flowlabel_cleanup(); ip6_flowlabel_fail: ndisc_late_cleanup(); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 4e0ff7031edd..94999058e110 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -44,8 +44,22 @@ #include +#define IN6_ADDR_HSIZE_SHIFT 8 +#define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT) +/* anycast address hash table + */ +static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE]; +static DEFINE_SPINLOCK(acaddr_hash_lock); + static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); +static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr) +{ + u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net); + + return hash_32(val, IN6_ADDR_HSIZE_SHIFT); +} + /* * socket join an anycast group */ @@ -204,16 +218,39 @@ void ipv6_sock_ac_close(struct sock *sk) rtnl_unlock(); } +static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca) +{ + unsigned int hash = inet6_acaddr_hash(net, &aca->aca_addr); + + spin_lock(&acaddr_hash_lock); + hlist_add_head_rcu(&aca->aca_addr_lst, &inet6_acaddr_lst[hash]); + spin_unlock(&acaddr_hash_lock); +} + +static void ipv6_del_acaddr_hash(struct ifacaddr6 *aca) +{ + spin_lock(&acaddr_hash_lock); + hlist_del_init_rcu(&aca->aca_addr_lst); + spin_unlock(&acaddr_hash_lock); +} + static void aca_get(struct ifacaddr6 *aca) { refcount_inc(&aca->aca_refcnt); } +static void aca_free_rcu(struct rcu_head *h) +{ + struct ifacaddr6 *aca = container_of(h, struct ifacaddr6, rcu); + + fib6_info_release(aca->aca_rt); + kfree(aca); +} + static void aca_put(struct ifacaddr6 *ac) { if (refcount_dec_and_test(&ac->aca_refcnt)) { - fib6_info_release(ac->aca_rt); - kfree(ac); + call_rcu(&ac->rcu, aca_free_rcu); } } @@ -229,6 +266,7 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, aca->aca_addr = *addr; fib6_info_hold(f6i); aca->aca_rt = f6i; + INIT_HLIST_NODE(&aca->aca_addr_lst); aca->aca_users = 1; /* aca_tstamp should be updated upon changes */ aca->aca_cstamp = aca->aca_tstamp = jiffies; @@ -285,6 +323,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) aca_get(aca); write_unlock_bh(&idev->lock); + ipv6_add_acaddr_hash(net, aca); + ip6_ins_rt(net, f6i); addrconf_join_solict(idev->dev, &aca->aca_addr); @@ -325,6 +365,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) else idev->ac_list = aca->aca_next; write_unlock_bh(&idev->lock); + ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); ip6_del_rt(dev_net(idev->dev), aca->aca_rt); @@ -352,6 +393,8 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) idev->ac_list = aca->aca_next; write_unlock_bh(&idev->lock); + ipv6_del_acaddr_hash(aca); + addrconf_leave_solict(idev, &aca->aca_addr); ip6_del_rt(dev_net(idev->dev), aca->aca_rt); @@ -390,17 +433,25 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *ad bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr) { + unsigned int hash = inet6_acaddr_hash(net, addr); + struct net_device *nh_dev; + struct ifacaddr6 *aca; bool found = false; rcu_read_lock(); if (dev) found = ipv6_chk_acast_dev(dev, addr); else - for_each_netdev_rcu(net, dev) - if (ipv6_chk_acast_dev(dev, addr)) { + hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash], + aca_addr_lst) { + nh_dev = fib6_info_nh_dev(aca->aca_rt); + if (!nh_dev || !net_eq(dev_net(nh_dev), net)) + continue; + if (ipv6_addr_equal(&aca->aca_addr, addr)) { found = true; break; } + } rcu_read_unlock(); return found; } @@ -540,3 +591,24 @@ void ac6_proc_exit(struct net *net) remove_proc_entry("anycast6", net->proc_net); } #endif + +/* Init / cleanup code + */ +int __init ipv6_anycast_init(void) +{ + int i; + + for (i = 0; i < IN6_ADDR_HSIZE; i++) + INIT_HLIST_HEAD(&inet6_acaddr_lst[i]); + return 0; +} + +void ipv6_anycast_cleanup(void) +{ + int i; + + spin_lock(&acaddr_hash_lock); + for (i = 0; i < IN6_ADDR_HSIZE; i++) + WARN_ON(!hlist_empty(&inet6_acaddr_lst[i])); + spin_unlock(&acaddr_hash_lock); +} diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 88a7579c23bd..63b2b66f9dfa 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -601,12 +601,11 @@ static void esp_input_done_esn(struct crypto_async_request *base, int err) static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) { - struct ip_esp_hdr *esph; struct crypto_aead *aead = x->data; struct aead_request *req; struct sk_buff *trailer; int ivlen = crypto_aead_ivsize(aead); - int elen = skb->len - sizeof(*esph) - ivlen; + int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen; int nfrags; int assoclen; int seqhilen; @@ -616,7 +615,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) u8 *iv; struct scatterlist *sg; - if (!pskb_may_pull(skb, sizeof(*esph) + ivlen)) { + if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen)) { ret = -EINVAL; goto out; } @@ -626,7 +625,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) goto out; } - assoclen = sizeof(*esph); + assoclen = sizeof(struct ip_esp_hdr); seqhilen = 0; if (x->props.flags & XFRM_STATE_ESN) { diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c index 547515e8450a..377717045f8f 100644 --- a/net/ipv6/ip6_checksum.c +++ b/net/ipv6/ip6_checksum.c @@ -88,8 +88,24 @@ int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) * Note, we are only interested in != 0 or == 0, thus the * force to int. */ - return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, - ip6_compute_pseudo); + err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, + ip6_compute_pseudo); + if (err) + return err; + + if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) { + /* If SW calculated the value, we know it's bad */ + if (skb->csum_complete_sw) + return 1; + + /* HW says the value is bad. Let's validate that. + * skb->csum is no longer the full packet checksum, + * so don't treat is as such. + */ + skb_checksum_complete_unset(skb); + } + + return 0; } EXPORT_SYMBOL(udp6_csum_init); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5516f55e214b..ae3786132c23 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -46,6 +47,7 @@ struct fib6_cleaner { int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; + bool skip_notify; }; #ifdef CONFIG_IPV6_SUBTREES @@ -160,8 +162,6 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) } INIT_LIST_HEAD(&f6i->fib6_siblings); - f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; - atomic_inc(&f6i->fib6_ref); return f6i; @@ -171,7 +171,6 @@ void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); struct rt6_exception_bucket *bucket; - struct dst_metrics *m; WARN_ON(f6i->fib6_node); @@ -196,6 +195,8 @@ void fib6_info_destroy_rcu(struct rcu_head *head) *ppcpu_rt = NULL; } } + + free_percpu(f6i->rt6i_pcpu); } lwtstate_put(f6i->fib6_nh.nh_lwtstate); @@ -203,9 +204,7 @@ void fib6_info_destroy_rcu(struct rcu_head *head) if (f6i->fib6_nh.nh_dev) dev_put(f6i->fib6_nh.nh_dev); - m = f6i->fib6_metrics; - if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) - kfree(m); + ip_fib_metrics_put(f6i->fib6_metrics); kfree(f6i); } @@ -568,17 +567,31 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); + struct rt6_rtnl_dump_arg arg = {}; unsigned int h, s_h; unsigned int e = 0, s_e; - struct rt6_rtnl_dump_arg arg; struct fib6_walker *w; struct fib6_table *tb; struct hlist_head *head; int res = 0; - s_h = cb->args[0]; - s_e = cb->args[1]; + if (cb->strict_check) { + int err; + + err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); + if (err < 0) + return err; + } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { + struct rtmsg *rtm = nlmsg_data(nlh); + + arg.filter.flags = rtm->rtm_flags & (RTM_F_PREFIX|RTM_F_CLONED); + } + + /* fib entries are never clones */ + if (arg.filter.flags & RTM_F_CLONED) + goto out; w = (void *)cb->args[2]; if (!w) { @@ -604,6 +617,23 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) arg.net = net; w->args = &arg; + if (arg.filter.table_id) { + tb = fib6_get_table(net, arg.filter.table_id); + if (!tb) { + if (arg.filter.dump_all_families) + goto out; + + NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); + return -ENOENT; + } + + res = fib6_dump_table(tb, skb, cb); + goto out; + } + + s_h = cb->args[0]; + s_e = cb->args[1]; + rcu_read_lock(); for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; @@ -613,16 +643,16 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) goto next; res = fib6_dump_table(tb, skb, cb); if (res != 0) - goto out; + goto out_unlock; next: e++; } } -out: +out_unlock: rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; - +out: res = res < 0 ? res : skb->len; if (res <= 0) fib6_dump_end(cb); @@ -1952,6 +1982,7 @@ static int fib6_clean_node(struct fib6_walker *w) struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, + .skip_notify = c->skip_notify, }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && @@ -2003,7 +2034,7 @@ static int fib6_clean_node(struct fib6_walker *w) static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct fib6_info *, void *arg), - int sernum, void *arg) + int sernum, void *arg, bool skip_notify) { struct fib6_cleaner c; @@ -2015,13 +2046,14 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, c.sernum = sernum; c.arg = arg; c.net = net; + c.skip_notify = skip_notify; fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), - int sernum, void *arg) + int sernum, void *arg, bool skip_notify) { struct fib6_table *table; struct hlist_head *head; @@ -2033,7 +2065,7 @@ static void __fib6_clean_all(struct net *net, hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, - func, sernum, arg); + func, sernum, arg, skip_notify); spin_unlock_bh(&table->tb6_lock); } } @@ -2043,14 +2075,21 @@ static void __fib6_clean_all(struct net *net, void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { - __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); + __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false); +} + +void fib6_clean_all_skip_notify(struct net *net, + int (*func)(struct fib6_info *, void *), + void *arg) +{ + __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true); } static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); - __fib6_clean_all(net, NULL, new_sernum, NULL); + __fib6_clean_all(net, NULL, new_sernum, NULL, false); } /* diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index e493b041d4ac..515adbdba1d2 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -427,35 +427,17 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); - const struct gre_base_hdr *greh; const struct ipv6hdr *ipv6h; - int grehlen = sizeof(*greh); + struct tnl_ptk_info tpi; struct ip6_tnl *t; - int key_off = 0; - __be16 flags; - __be32 key; - if (!pskb_may_pull(skb, offset + grehlen)) - return; - greh = (const struct gre_base_hdr *)(skb->data + offset); - flags = greh->flags; - if (flags & (GRE_VERSION | GRE_ROUTING)) + if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6), + offset) < 0) return; - if (flags & GRE_CSUM) - grehlen += 4; - if (flags & GRE_KEY) { - key_off = grehlen + offset; - grehlen += 4; - } - if (!pskb_may_pull(skb, offset + grehlen)) - return; ipv6h = (const struct ipv6hdr *)skb->data; - greh = (const struct gre_base_hdr *)(skb->data + offset); - key = key_off ? *(__be32 *)(skb->data + key_off) : 0; - t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr, - key, greh->protocol); + tpi.key, tpi.proto); if (!t) return; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 6242682be876..96577e742afd 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -178,7 +178,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, */ if ((ipv6_addr_loopback(&hdr->saddr) || ipv6_addr_loopback(&hdr->daddr)) && - !(dev->flags & IFF_LOOPBACK)) + !(dev->flags & IFF_LOOPBACK) && + !netif_is_l3_master(dev)) goto err; /* RFC4291 Errata ID: 3480 diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f9f8f554d141..89e0d5118afe 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -725,7 +725,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, skb = frag; frag = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); } kfree(tmp_hdr); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a0b6932c3afd..a9d06d4dd057 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1184,11 +1184,6 @@ route_lookup: } skb_dst_set(skb, dst); - if (encap_limit >= 0) { - init_tel_txopt(&opt, encap_limit); - ipv6_push_frag_opts(skb, &opt.ops, &proto); - } - if (hop_limit == 0) { if (skb->protocol == htons(ETH_P_IP)) hop_limit = ip_hdr(skb)->ttl; @@ -1210,6 +1205,11 @@ route_lookup: if (err) return err; + if (encap_limit >= 0) { + init_tel_txopt(&opt, encap_limit); + ipv6_push_frag_opts(skb, &opt.ops, &proto); + } + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index d0b7e0249c13..e2ea691e42c6 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -85,7 +85,8 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id); static void ip6mr_free_table(struct mr_table *mrt); static void ip6_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc6_cache *cache); + struct net_device *dev, struct sk_buff *skb, + struct mfc6_cache *cache); static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, @@ -138,6 +139,9 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, .flags = FIB_LOOKUP_NOREF, }; + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi6_to_flowi(flp6)); + err = fib_rules_lookup(net->ipv6.mr6_rules_ops, flowi6_to_flowi(flp6), 0, &arg); if (err < 0) @@ -164,7 +168,9 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, return -EINVAL; } - mrt = ip6mr_get_table(rule->fr_net, rule->table); + arg->table = fib_rule_get_table(rule, arg); + + mrt = ip6mr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; @@ -1014,7 +1020,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else - ip6_mr_forward(net, mrt, skb, c); + ip6_mr_forward(net, mrt, skb->dev, skb, c); } } @@ -1120,7 +1126,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, /* Queue a packet for resolution. It gets locked cache entry! */ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, - struct sk_buff *skb) + struct sk_buff *skb, struct net_device *dev) { struct mfc6_cache *c; bool found = false; @@ -1180,6 +1186,10 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, kfree_skb(skb); err = -ENOBUFS; } else { + if (dev) { + skb->dev = dev; + skb->skb_iif = dev->ifindex; + } skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } @@ -2043,11 +2053,12 @@ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) } static void ip6_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc6_cache *c) + struct net_device *dev, struct sk_buff *skb, + struct mfc6_cache *c) { int psend = -1; int vif, ct; - int true_vifi = ip6mr_find_vif(mrt, skb->dev); + int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; c->_c.mfc_un.res.pkt++; @@ -2073,7 +2084,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, /* * Wrong interface: drop packet and (maybe) send PIM assert. */ - if (mrt->vif_table[vif].dev != skb->dev) { + if (mrt->vif_table[vif].dev != dev) { c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && @@ -2154,6 +2165,19 @@ int ip6_mr_input(struct sk_buff *skb) .flowi6_mark = skb->mark, }; int err; + struct net_device *dev; + + /* skb->dev passed in is the master dev for vrfs. + * Get the proper interface that does have a vif associated with it. + */ + dev = skb->dev; + if (netif_is_l3_master(skb->dev)) { + dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); + if (!dev) { + kfree_skb(skb); + return -ENODEV; + } + } err = ip6mr_fib_lookup(net, &fl6, &mrt); if (err < 0) { @@ -2165,7 +2189,7 @@ int ip6_mr_input(struct sk_buff *skb) cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); if (!cache) { - int vif = ip6mr_find_vif(mrt, skb->dev); + int vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, @@ -2179,9 +2203,9 @@ int ip6_mr_input(struct sk_buff *skb) if (!cache) { int vif; - vif = ip6mr_find_vif(mrt, skb->dev); + vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) { - int err = ip6mr_cache_unresolved(mrt, vif, skb); + int err = ip6mr_cache_unresolved(mrt, vif, skb, dev); read_unlock(&mrt_lock); return err; @@ -2191,7 +2215,7 @@ int ip6_mr_input(struct sk_buff *skb) return -ENODEV; } - ip6_mr_forward(net, mrt, skb, cache); + ip6_mr_forward(net, mrt, dev, skb, cache); read_unlock(&mrt_lock); @@ -2257,7 +2281,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, iph->saddr = rt->rt6i_src.addr; iph->daddr = rt->rt6i_dst.addr; - err = ip6mr_cache_unresolved(mrt, vif, skb2); + err = ip6mr_cache_unresolved(mrt, vif, skb2, dev); read_unlock(&mrt_lock); return err; @@ -2433,6 +2457,33 @@ errout: static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; + struct fib_dump_filter filter = {}; + int err; + + if (cb->strict_check) { + err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh, + &filter, cb); + if (err < 0) + return err; + } + + if (filter.table_id) { + struct mr_table *mrt; + + mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id); + if (!mrt) { + if (filter.dump_all_families) + return skb->len; + + NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist"); + return -ENOENT; + } + err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute, + &mfc_unres_lock, &filter); + return skb->len ? : err; + } + return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter, - _ip6mr_fill_mroute, &mfc_unres_lock); + _ip6mr_fill_mroute, &mfc_unres_lock, &filter); } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index c0cac9cc3a28..381ce38940ae 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -674,6 +674,13 @@ done: retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); break; } + case IPV6_MULTICAST_ALL: + if (optlen < sizeof(int)) + goto e_inval; + np->mc_all = valbool; + retv = 0; + break; + case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: { @@ -1266,6 +1273,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->mcast_oif; break; + case IPV6_MULTICAST_ALL: + val = np->mc_all; + break; + case IPV6_UNICAST_IF: val = (__force int)htonl((__u32) np->ucast_oif); break; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 4ae54aaca373..21f6deb2aec9 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -636,7 +636,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, } if (!mc) { rcu_read_unlock(); - return true; + return np->mc_all; } read_lock(&mc->sflock); psl = mc->sflist; @@ -2436,17 +2436,17 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, { int err; - /* callers have the socket lock and rtnl lock - * so no other readers or writers of iml or its sflist - */ + write_lock_bh(&iml->sflock); if (!iml->sflist) { /* any-source empty exclude case */ - return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); + err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); + } else { + err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, + iml->sflist->sl_count, iml->sflist->sl_addr, 0); + sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max)); + iml->sflist = NULL; } - err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, - iml->sflist->sl_count, iml->sflist->sl_addr, 0); - sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max)); - iml->sflist = NULL; + write_unlock_bh(&iml->sflock); return err; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 0ec273997d1d..659ecf4e4b3c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1533,7 +1533,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) if (!ndopts.nd_opts_rh) { ip6_redirect_no_header(skb, dev_net(skb->dev), - skb->dev->ifindex, 0); + skb->dev->ifindex); return; } @@ -1732,10 +1732,9 @@ int ndisc_rcv(struct sk_buff *skb) return 0; } - memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); - switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: + memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); ndisc_recv_ns(skb); break; @@ -1784,6 +1783,8 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&nd_tbl, dev); + if (!netif_carrier_ok(dev)) + neigh_carrier_down(&nd_tbl, dev); break; case NETDEV_DOWN: neigh_ifdown(&nd_tbl, dev); diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c index 8b147440fbdc..af737b47b9b5 100644 --- a/net/ipv6/netfilter/ip6t_ipv6header.c +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -65,7 +65,10 @@ ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par) } hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); - BUG_ON(hp == NULL); + if (!hp) { + par->hotdrop = true; + return false; + } /* Calculate the header length */ if (nexthdr == NEXTHDR_FRAGMENT) diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c index 2c99b94eeca3..21bf6bf04323 100644 --- a/net/ipv6/netfilter/ip6t_rt.c +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -137,7 +137,10 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) sizeof(_addr), &_addr); - BUG_ON(ap == NULL); + if (ap == NULL) { + par->hotdrop = true; + return false; + } if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) { pr_debug("i=%d temp=%d;\n", i, temp); @@ -166,7 +169,10 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) + temp * sizeof(_addr), sizeof(_addr), &_addr); - BUG_ON(ap == NULL); + if (ap == NULL) { + par->hotdrop = true; + return false; + } if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp])) break; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 8f68a518d9db..d219979c3e52 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -450,7 +450,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic sub_frag_mem_limit(fq->q.net, head->truesize); head->ignore_df = 1; - head->next = NULL; + skb_mark_not_on_list(head); head->dev = dev; head->tstamp = fq->q.stamp; ipv6_hdr(head)->payload_len = htons(payload_len); @@ -587,11 +587,16 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) */ ret = -EINPROGRESS; if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - fq->q.meat == fq->q.len && - nf_ct_frag6_reasm(fq, skb, dev)) - ret = 0; - else + fq->q.meat == fq->q.len) { + unsigned long orefdst = skb->_skb_refdst; + + skb->_skb_refdst = 0UL; + if (nf_ct_frag6_reasm(fq, skb, dev)) + ret = 0; + skb->_skb_refdst = orefdst; + } else { skb_dst_drop(skb); + } out_unlock: spin_unlock_bh(&fq->q.lock); diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index e6eb7cf9b54f..3e4bf2286abe 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -87,18 +87,30 @@ static struct notifier_block masq_dev_notifier = { struct masq_dev_work { struct work_struct work; struct net *net; + struct in6_addr addr; int ifindex; }; +static int inet_cmp(struct nf_conn *ct, void *work) +{ + struct masq_dev_work *w = (struct masq_dev_work *)work; + struct nf_conntrack_tuple *tuple; + + if (!device_cmp(ct, (void *)(long)w->ifindex)) + return 0; + + tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + + return ipv6_addr_equal(&w->addr, &tuple->dst.u3.in6); +} + static void iterate_cleanup_work(struct work_struct *work) { struct masq_dev_work *w; - long index; w = container_of(work, struct masq_dev_work, work); - index = w->ifindex; - nf_ct_iterate_cleanup_net(w->net, device_cmp, (void *)index, 0, 0); + nf_ct_iterate_cleanup_net(w->net, inet_cmp, (void *)w, 0, 0); put_net(w->net); kfree(w); @@ -147,6 +159,7 @@ static int masq_inet_event(struct notifier_block *this, INIT_WORK(&w->work, iterate_cleanup_work); w->ifindex = dev->ifindex; w->net = net; + w->addr = ifa->addr; schedule_work(&w->work); return NOTIFY_DONE; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 413d98bf24f4..5e0efd3954e9 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -651,8 +651,6 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; skb->tstamp = sockc->transmit_time; - skb_dst_set(skb, &rt->dst); - *dstp = NULL; skb_put(skb, length); skb_reset_network_header(skb); @@ -665,8 +663,14 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->transport_header = skb->network_header; err = memcpy_from_msg(iph, msg, length); - if (err) - goto error_fault; + if (err) { + err = -EFAULT; + kfree_skb(skb); + goto error; + } + + skb_dst_set(skb, &rt->dst); + *dstp = NULL; /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -675,21 +679,28 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, if (unlikely(!skb)) return 0; + /* Acquire rcu_read_lock() in case we need to use rt->rt6i_idev + * in the error path. Since skb has been freed, the dst could + * have been queued for deletion. + */ + rcu_read_lock(); IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); - if (err) - goto error; + if (err) { + IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); + goto error_check; + } + rcu_read_unlock(); out: return 0; -error_fault: - err = -EFAULT; - kfree_skb(skb); error: IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); +error_check: if (err == -ENOBUFS && !np->recverr) err = 0; return err; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 5c5b4f79296e..5c3c92713096 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -145,7 +145,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, */ if (end < fq->q.len || ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) - goto err; + goto discard_fq; fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { @@ -162,20 +162,20 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ if (fq->q.flags & INET_FRAG_LAST_IN) - goto err; + goto discard_fq; fq->q.len = end; } } if (end == offset) - goto err; + goto discard_fq; /* Point into the IP datagram 'data' part. */ if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) - goto err; + goto discard_fq; if (pskb_trim_rcsum(skb, end - offset)) - goto err; + goto discard_fq; /* Find out which fragments are in front and at the back of us * in the chain of fragments so far. We must know where to put @@ -388,7 +388,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, } sub_frag_mem_limit(fq->q.net, sum_truesize); - head->next = NULL; + skb_mark_not_on_list(head); head->dev = dev; head->tstamp = fq->q.stamp; ipv6_hdr(head)->payload_len = htons(payload_len); @@ -418,6 +418,7 @@ out_fail: rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); rcu_read_unlock(); + inet_frag_kill(&fq->q); return -1; } @@ -553,7 +554,6 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) table[0].data = &net->ipv6.frags.high_thresh; table[0].extra1 = &net->ipv6.frags.low_thresh; - table[0].extra2 = &init_net.ipv6.frags.high_thresh; table[1].data = &net->ipv6.frags.low_thresh; table[1].extra2 = &net->ipv6.frags.high_thresh; table[2].data = &net->ipv6.frags.timeout; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 826b14de7dbb..2a7423c39456 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -364,14 +364,11 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { - struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rt6_info *rt = (struct rt6_info *)dst; struct fib6_info *from; struct inet6_dev *idev; - if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) - kfree(p); - + ip_dst_metrics_put(dst); rt6_uncached_list_del(rt); idev = rt->rt6i_idev; @@ -520,10 +517,11 @@ static void rt6_probe_deferred(struct work_struct *w) static void rt6_probe(struct fib6_info *rt) { - struct __rt6_probe_work *work; + struct __rt6_probe_work *work = NULL; const struct in6_addr *nh_gw; struct neighbour *neigh; struct net_device *dev; + struct inet6_dev *idev; /* * Okay, this does not seem to be appropriate @@ -539,15 +537,12 @@ static void rt6_probe(struct fib6_info *rt) nh_gw = &rt->fib6_nh.nh_gw; dev = rt->fib6_nh.nh_dev; rcu_read_lock_bh(); + idev = __in6_dev_get(dev); neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { - struct inet6_dev *idev; - if (neigh->nud_state & NUD_VALID) goto out; - idev = __in6_dev_get(dev); - work = NULL; write_lock(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, @@ -557,11 +552,13 @@ static void rt6_probe(struct fib6_info *rt) __neigh_set_probe_once(neigh); } write_unlock(&neigh->lock); - } else { + } else if (time_after(jiffies, rt->last_probe + + idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); } if (work) { + rt->last_probe = jiffies; INIT_WORK(&work->work, rt6_probe_deferred); work->target = *nh_gw; dev_hold(dev); @@ -978,11 +975,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) { rt->rt6i_flags &= ~RTF_EXPIRES; rcu_assign_pointer(rt->from, from); - dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); - if (from->fib6_metrics != &dst_default_metrics) { - rt->dst._metrics |= DST_METRICS_REFCOUNTED; - refcount_inc(&from->fib6_metrics->refcnt); - } + ip_dst_init_metrics(&rt->dst, from->fib6_metrics); } /* Caller must already hold reference to @ort */ @@ -1000,7 +993,6 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) #ifdef CONFIG_IPV6_SUBTREES rt->rt6i_src = ort->fib6_src; #endif - rt->rt6i_prefsrc = ort->fib6_prefsrc; } static struct fib6_node* fib6_backtrack(struct fib6_node *fn, @@ -1454,11 +1446,6 @@ static int rt6_insert_exception(struct rt6_info *nrt, if (ort->fib6_src.plen) src_key = &nrt->rt6i_src.addr; #endif - - /* Update rt6i_prefsrc as it could be changed - * in rt6_remove_prefsrc() - */ - nrt->rt6i_prefsrc = ort->fib6_prefsrc; /* rt6_mtu_change() might lower mtu on ort. * Only insert this exception route if its mtu * is less than ort's mtu value. @@ -1640,25 +1627,6 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) rcu_read_unlock(); } -static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt) -{ - struct rt6_exception_bucket *bucket; - struct rt6_exception *rt6_ex; - int i; - - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - - if (bucket) { - for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { - hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { - rt6_ex->rt6i->rt6i_prefsrc.plen = 0; - } - bucket++; - } - } -} - static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, struct rt6_info *rt, int mtu) { @@ -2103,7 +2071,8 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, { bool any_src; - if (rt6_need_strict(&fl6->daddr)) { + if (ipv6_addr_type(&fl6->daddr) & + (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { struct dst_entry *dst; dst = l3mdev_link_scope_lookup(net, fl6); @@ -2373,15 +2342,14 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = oif; - fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); - fl6.daddr = iph->daddr; - fl6.saddr = iph->saddr; - fl6.flowlabel = ip6_flowinfo(iph); - fl6.flowi6_uid = uid; + struct flowi6 fl6 = { + .flowi6_oif = oif, + .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowlabel = ip6_flowinfo(iph), + .flowi6_uid = uid, + }; dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) @@ -2532,16 +2500,15 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_iif = LOOPBACK_IFINDEX; - fl6.flowi6_oif = oif; - fl6.flowi6_mark = mark; - fl6.daddr = iph->daddr; - fl6.saddr = iph->saddr; - fl6.flowlabel = ip6_flowinfo(iph); - fl6.flowi6_uid = uid; + struct flowi6 fl6 = { + .flowi6_iif = LOOPBACK_IFINDEX, + .flowi6_oif = oif, + .flowi6_mark = mark, + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowlabel = ip6_flowinfo(iph), + .flowi6_uid = uid, + }; dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); rt6_do_redirect(dst, NULL, skb); @@ -2549,21 +2516,18 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, } EXPORT_SYMBOL_GPL(ip6_redirect); -void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, - u32 mark) +void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); struct dst_entry *dst; - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_iif = LOOPBACK_IFINDEX; - fl6.flowi6_oif = oif; - fl6.flowi6_mark = mark; - fl6.daddr = msg->dest; - fl6.saddr = iph->daddr; - fl6.flowi6_uid = sock_net_uid(net, NULL); + struct flowi6 fl6 = { + .flowi6_iif = LOOPBACK_IFINDEX, + .flowi6_oif = oif, + .daddr = msg->dest, + .saddr = iph->daddr, + .flowi6_uid = sock_net_uid(net, NULL), + }; dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); rt6_do_redirect(dst, NULL, skb); @@ -2734,24 +2698,6 @@ out: return entries > rt_max_size; } -static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, - struct fib6_config *cfg) -{ - struct dst_metrics *p; - - if (!cfg->fc_mx) - return 0; - - p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL); - if (unlikely(!p)) - return -ENOMEM; - - refcount_set(&p->refcnt, 1); - rt->fib6_metrics = p; - - return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics); -} - static struct rt6_info *ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, const struct in6_addr *gw_addr, @@ -2799,6 +2745,8 @@ static int ip6_route_check_nh_onlink(struct net *net, grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); if (grt) { if (!grt->dst.error && + /* ignore match if it is the default route */ + grt->from && !ipv6_addr_any(&grt->from->fib6_dst.addr) && (grt->rt6i_flags & flags || dev != grt->dst.dev)) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway or device mismatch"); @@ -3027,13 +2975,17 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (!rt) goto out; + rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len); + if (IS_ERR(rt->fib6_metrics)) { + err = PTR_ERR(rt->fib6_metrics); + /* Do not leave garbage there. */ + rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; + goto out; + } + if (cfg->fc_flags & RTF_ADDRCONF) rt->dst_nocount = true; - err = ip6_convert_metrics(net, rt, cfg); - if (err < 0) - goto out; - if (cfg->fc_flags & RTF_EXPIRES) fib6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); @@ -3142,8 +3094,6 @@ install_route: rt->fib6_nh.nh_dev = dev; rt->fib6_table = table; - cfg->fc_nlinfo.nl_net = dev_net(dev); - if (idev) in6_dev_put(idev); @@ -3635,23 +3585,23 @@ static void rtmsg_to_fib6_config(struct net *net, struct in6_rtmsg *rtmsg, struct fib6_config *cfg) { - memset(cfg, 0, sizeof(*cfg)); + *cfg = (struct fib6_config){ + .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? + : RT6_TABLE_MAIN, + .fc_ifindex = rtmsg->rtmsg_ifindex, + .fc_metric = rtmsg->rtmsg_metric, + .fc_expires = rtmsg->rtmsg_info, + .fc_dst_len = rtmsg->rtmsg_dst_len, + .fc_src_len = rtmsg->rtmsg_src_len, + .fc_flags = rtmsg->rtmsg_flags, + .fc_type = rtmsg->rtmsg_type, - cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? - : RT6_TABLE_MAIN; - cfg->fc_ifindex = rtmsg->rtmsg_ifindex; - cfg->fc_metric = rtmsg->rtmsg_metric; - cfg->fc_expires = rtmsg->rtmsg_info; - cfg->fc_dst_len = rtmsg->rtmsg_dst_len; - cfg->fc_src_len = rtmsg->rtmsg_src_len; - cfg->fc_flags = rtmsg->rtmsg_flags; - cfg->fc_type = rtmsg->rtmsg_type; - - cfg->fc_nlinfo.nl_net = net; + .fc_nlinfo.nl_net = net, - cfg->fc_dst = rtmsg->rtmsg_dst; - cfg->fc_src = rtmsg->rtmsg_src; - cfg->fc_gateway = rtmsg->rtmsg_gateway; + .fc_dst = rtmsg->rtmsg_dst, + .fc_src = rtmsg->rtmsg_src, + .fc_gateway = rtmsg->rtmsg_gateway, + }; } int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) @@ -3758,6 +3708,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, if (!f6i) return ERR_PTR(-ENOMEM); + f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0); f6i->dst_nocount = true; f6i->dst_host = true; f6i->fib6_protocol = RTPROT_KERNEL; @@ -3800,8 +3751,6 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ rt->fib6_prefsrc.plen = 0; - /* need to update cache as well */ - rt6_exceptions_remove_prefsrc(rt); spin_unlock_bh(&rt6_exception_lock); } return 0; @@ -4079,8 +4028,12 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event) .event = event, }, }; + struct net *net = dev_net(dev); - fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); + if (net->ipv6.sysctl.skip_notify_on_dev_down) + fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); + else + fib6_clean_all(net, fib6_ifdown, &arg); } void rt6_disable_ip(struct net_device *dev, unsigned long event) @@ -4170,20 +4123,25 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, int err; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, - NULL); + extack); if (err < 0) goto errout; err = -EINVAL; rtm = nlmsg_data(nlh); - memset(cfg, 0, sizeof(*cfg)); - cfg->fc_table = rtm->rtm_table; - cfg->fc_dst_len = rtm->rtm_dst_len; - cfg->fc_src_len = rtm->rtm_src_len; - cfg->fc_flags = RTF_UP; - cfg->fc_protocol = rtm->rtm_protocol; - cfg->fc_type = rtm->rtm_type; + *cfg = (struct fib6_config){ + .fc_table = rtm->rtm_table, + .fc_dst_len = rtm->rtm_dst_len, + .fc_src_len = rtm->rtm_src_len, + .fc_flags = RTF_UP, + .fc_protocol = rtm->rtm_protocol, + .fc_type = rtm->rtm_type, + + .fc_nlinfo.portid = NETLINK_CB(skb).portid, + .fc_nlinfo.nlh = nlh, + .fc_nlinfo.nl_net = sock_net(skb->sk), + }; if (rtm->rtm_type == RTN_UNREACHABLE || rtm->rtm_type == RTN_BLACKHOLE || @@ -4199,10 +4157,6 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); - cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; - cfg->fc_nlinfo.nlh = nlh; - cfg->fc_nlinfo.nl_net = sock_net(skb->sk); - if (tb[RTA_GATEWAY]) { cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; @@ -4321,11 +4275,6 @@ static int ip6_route_info_append(struct net *net, if (!nh) return -ENOMEM; nh->fib6_info = rt; - err = ip6_convert_metrics(net, rt, r_cfg); - if (err) { - kfree(nh); - return err; - } memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); list_add_tail(&nh->next, rt6_nh_list); @@ -4820,28 +4769,52 @@ nla_put_failure: return -EMSGSIZE; } +static bool fib6_info_uses_dev(const struct fib6_info *f6i, + const struct net_device *dev) +{ + if (f6i->fib6_nh.nh_dev == dev) + return true; + + if (f6i->fib6_nsiblings) { + struct fib6_info *sibling, *next_sibling; + + list_for_each_entry_safe(sibling, next_sibling, + &f6i->fib6_siblings, fib6_siblings) { + if (sibling->fib6_nh.nh_dev == dev) + return true; + } + } + + return false; +} + int rt6_dump_route(struct fib6_info *rt, void *p_arg) { struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; + struct fib_dump_filter *filter = &arg->filter; + unsigned int flags = NLM_F_MULTI; struct net *net = arg->net; if (rt == net->ipv6.fib6_null_entry) return 0; - if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { - struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); - - /* user wants prefix routes only */ - if (rtm->rtm_flags & RTM_F_PREFIX && - !(rt->fib6_flags & RTF_PREFIX_RT)) { - /* success since this is not a prefix route */ + if ((filter->flags & RTM_F_PREFIX) && + !(rt->fib6_flags & RTF_PREFIX_RT)) { + /* success since this is not a prefix route */ + return 1; + } + if (filter->filter_set) { + if ((filter->rt_type && rt->fib6_type != filter->rt_type) || + (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || + (filter->protocol && rt->fib6_protocol != filter->protocol)) { return 1; } + flags |= NLM_F_DUMP_FILTERED; } return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, - arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); + arg->cb->nlh->nlmsg_seq, flags); } static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, @@ -4855,7 +4828,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct rt6_info *rt; struct sk_buff *skb; struct rtmsg *rtm; - struct flowi6 fl6; + struct flowi6 fl6 = {}; bool fibmatch; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, @@ -4864,7 +4837,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; err = -EINVAL; - memset(&fl6, 0, sizeof(fl6)); rtm = nlmsg_data(nlh); fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); @@ -5089,7 +5061,10 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, return 0; } -struct ctl_table ipv6_route_table_template[] = { +static int zero; +static int one = 1; + +static struct ctl_table ipv6_route_table_template[] = { { .procname = "flush", .data = &init_net.ipv6.sysctl.flush_delay, @@ -5160,6 +5135,15 @@ struct ctl_table ipv6_route_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, + { + .procname = "skip_notify_on_dev_down", + .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &one, + }, { } }; @@ -5183,6 +5167,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; + table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) @@ -5247,6 +5232,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; + net->ipv6.sysctl.skip_notify_on_dev_down = 0; net->ipv6.ip6_rt_gc_expire = 30*HZ; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index e9400ffa7875..51c9f75f34b9 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -534,13 +534,13 @@ static int ipip6_err(struct sk_buff *skb, u32 info) if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, - t->parms.link, 0, iph->protocol, 0); + t->parms.link, iph->protocol); err = 0; goto out; } if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, - iph->protocol, 0); + ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, + iph->protocol); err = 0; goto out; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 28c4aa5078fc..d2d97d07ef27 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -478,7 +478,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct net *net = dev_net(skb->dev); sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, - inet6_iif(skb), 0, udptable, skb); + inet6_iif(skb), inet6_sdif(skb), udptable, skb); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); @@ -548,7 +548,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb, __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); } -static DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void) { static_branch_enable(&udpv6_encap_needed_key); @@ -766,11 +766,9 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, ret = udpv6_queue_rcv_skb(sk, skb); - /* a return value > 0 means to resubmit the input, but - * it wants the return to be -protocol, or 0 - */ + /* a return value > 0 means to resubmit the input */ if (ret > 0) - return -ret; + return ret; return 0; } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 95dee9ca8d22..1b8e161ac527 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -119,7 +119,7 @@ static struct sk_buff *udp6_gro_receive(struct list_head *head, { struct udphdr *uh = udp_gro_udphdr(skb); - if (unlikely(!uh)) + if (unlikely(!uh) || !static_branch_unlikely(&udpv6_encap_needed_key)) goto flush; /* Don't bother verifying checksum if we're going to flush anyway. */ diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 841f4a07438e..9ef490dddcea 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -59,6 +59,7 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); + skb_reset_transport_header(skb); return -1; } diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c index 9ad07a91708e..3c29da5defe6 100644 --- a/net/ipv6/xfrm6_mode_transport.c +++ b/net/ipv6/xfrm6_mode_transport.c @@ -51,7 +51,6 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) { int ihl = skb->data - skb_transport_header(skb); - struct xfrm_offload *xo = xfrm_offload(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), @@ -60,8 +59,7 @@ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) } ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - sizeof(struct ipv6hdr)); - if (!xo || !(xo->flags & XFRM_GRO)) - skb_reset_transport_header(skb); + skb_reset_transport_header(skb); return 0; } diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 5959ce9620eb..6a74080005cf 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -170,9 +170,11 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (toobig && xfrm6_local_dontfrag(skb)) { xfrm6_local_rxpmtu(skb, mtu); + kfree_skb(skb); return -EMSGSIZE; } else if (!skb->ignore_df && toobig && skb->sk) { xfrm_local_error(skb, mtu); + kfree_skb(skb); return -EMSGSIZE; } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index ef3defaf43b9..d35bcf92969c 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -146,8 +146,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) fl6->daddr = reverse ? hdr->saddr : hdr->daddr; fl6->saddr = reverse ? hdr->daddr : hdr->saddr; - while (nh + offset + 1 < skb->data || - pskb_may_pull(skb, nh + offset + 1 - skb->data)) { + while (nh + offset + sizeof(*exthdr) < skb->data || + pskb_may_pull(skb, nh + offset + sizeof(*exthdr) - skb->data)) { nh = skb_network_header(skb); exthdr = (struct ipv6_opt_hdr *)(nh + offset); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index e2f16a0173a9..0bed4cc20603 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -48,7 +48,7 @@ static struct iucv_interface *pr_iucv; static const u8 iprm_shutdown[8] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}; -#define TRGCLS_SIZE (sizeof(((struct iucv_message *)0)->class)) +#define TRGCLS_SIZE FIELD_SIZEOF(struct iucv_message, class) #define __iucv_sock_wait(sk, condition, timeo, ret) \ do { \ @@ -320,13 +320,9 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, struct sk_buff *nskb; int err, confirm_recv = 0; - memset(skb->head, 0, ETH_HLEN); - phs_hdr = skb_push(skb, sizeof(struct af_iucv_trans_hdr)); - skb_reset_mac_header(skb); + phs_hdr = skb_push(skb, sizeof(*phs_hdr)); + memset(phs_hdr, 0, sizeof(*phs_hdr)); skb_reset_network_header(skb); - skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - memset(phs_hdr, 0, sizeof(struct af_iucv_trans_hdr)); phs_hdr->magic = ETH_P_AF_IUCV; phs_hdr->version = 1; @@ -350,6 +346,9 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, if (imsg) memcpy(&phs_hdr->iucv_hdr, imsg, sizeof(struct iucv_message)); + skb_push(skb, ETH_HLEN); + memset(skb->data, 0, ETH_HLEN); + skb->dev = iucv->hs_dev; if (!skb->dev) { err = -ENODEV; @@ -1505,7 +1504,7 @@ __poll_t iucv_sock_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; __poll_t mask = 0; - sock_poll_wait(file, wait); + sock_poll_wait(file, sock, wait); if (sk->sk_state == IUCV_LISTEN) return iucv_accept_poll(sk); @@ -1943,8 +1942,7 @@ static void iucv_callback_shutdown(struct iucv_path *path, u8 ipuser[16]) /***************** HiperSockets transport callbacks ********************/ static void afiucv_swap_src_dest(struct sk_buff *skb) { - struct af_iucv_trans_hdr *trans_hdr = - (struct af_iucv_trans_hdr *)skb->data; + struct af_iucv_trans_hdr *trans_hdr = iucv_trans_hdr(skb); char tmpID[8]; char tmpName[8]; @@ -1967,13 +1965,12 @@ static void afiucv_swap_src_dest(struct sk_buff *skb) **/ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb) { + struct af_iucv_trans_hdr *trans_hdr = iucv_trans_hdr(skb); struct sock *nsk; struct iucv_sock *iucv, *niucv; - struct af_iucv_trans_hdr *trans_hdr; int err; iucv = iucv_sk(sk); - trans_hdr = (struct af_iucv_trans_hdr *)skb->data; if (!iucv) { /* no sock - connection refused */ afiucv_swap_src_dest(skb); @@ -2034,15 +2031,13 @@ out: static int afiucv_hs_callback_synack(struct sock *sk, struct sk_buff *skb) { struct iucv_sock *iucv = iucv_sk(sk); - struct af_iucv_trans_hdr *trans_hdr = - (struct af_iucv_trans_hdr *)skb->data; if (!iucv) goto out; if (sk->sk_state != IUCV_BOUND) goto out; bh_lock_sock(sk); - iucv->msglimit_peer = trans_hdr->window; + iucv->msglimit_peer = iucv_trans_hdr(skb)->window; sk->sk_state = IUCV_CONNECTED; sk->sk_state_change(sk); bh_unlock_sock(sk); @@ -2098,8 +2093,6 @@ out: static int afiucv_hs_callback_win(struct sock *sk, struct sk_buff *skb) { struct iucv_sock *iucv = iucv_sk(sk); - struct af_iucv_trans_hdr *trans_hdr = - (struct af_iucv_trans_hdr *)skb->data; if (!iucv) return NET_RX_SUCCESS; @@ -2107,7 +2100,7 @@ static int afiucv_hs_callback_win(struct sock *sk, struct sk_buff *skb) if (sk->sk_state != IUCV_CONNECTED) return NET_RX_SUCCESS; - atomic_sub(trans_hdr->window, &iucv->msg_sent); + atomic_sub(iucv_trans_hdr(skb)->window, &iucv->msg_sent); iucv_sock_wake_msglim(sk); return NET_RX_SUCCESS; } @@ -2170,22 +2163,13 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, int err = NET_RX_SUCCESS; char nullstring[8]; - if (skb->len < (ETH_HLEN + sizeof(struct af_iucv_trans_hdr))) { - WARN_ONCE(1, "AF_IUCV too short skb, len=%d, min=%d", - (int)skb->len, - (int)(ETH_HLEN + sizeof(struct af_iucv_trans_hdr))); + if (!pskb_may_pull(skb, sizeof(*trans_hdr))) { + WARN_ONCE(1, "AF_IUCV failed to receive skb, len=%u", skb->len); kfree_skb(skb); return NET_RX_SUCCESS; } - if (skb_headlen(skb) < (ETH_HLEN + sizeof(struct af_iucv_trans_hdr))) - if (skb_linearize(skb)) { - WARN_ONCE(1, "AF_IUCV skb_linearize failed, len=%d", - (int)skb->len); - kfree_skb(skb); - return NET_RX_SUCCESS; - } - skb_pull(skb, ETH_HLEN); - trans_hdr = (struct af_iucv_trans_hdr *)skb->data; + + trans_hdr = iucv_trans_hdr(skb); EBCASC(trans_hdr->destAppName, sizeof(trans_hdr->destAppName)); EBCASC(trans_hdr->destUserID, sizeof(trans_hdr->destUserID)); EBCASC(trans_hdr->srcAppName, sizeof(trans_hdr->srcAppName)); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 1beeea9549fa..b99e73a7e7e0 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -730,7 +730,6 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, struct sk_buff *skb = NULL; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); - unsigned long cpu_flags; size_t copied = 0; u32 peek_seq = 0; u32 *seq, skb_len; @@ -855,9 +854,8 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, goto copy_uaddr; if (!(flags & MSG_PEEK)) { - spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); - sk_eat_skb(sk, skb); - spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); + skb_unlink(skb, &sk->sk_receive_queue); + kfree_skb(skb); *seq = 0; } @@ -878,9 +876,8 @@ copy_uaddr: llc_cmsg_rcv(msg, skb); if (!(flags & MSG_PEEK)) { - spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); - sk_eat_skb(sk, skb); - spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); + skb_unlink(skb, &sk->sk_receive_queue); + kfree_skb(skb); *seq = 0; } diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index c0ac522b48a1..4ff89cb7c86f 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -734,6 +734,7 @@ void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk) llc_sk(sk)->sap = sap; spin_lock_bh(&sap->sk_lock); + sock_set_flag(sk, SOCK_RCU_FREE); sap->sk_count++; sk_nulls_add_node_rcu(sk, laddr_hb); hlist_add_head(&llc->dev_hash_node, dev_hb); diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c index 260b3dc1b4a2..64d4bef04e73 100644 --- a/net/llc/llc_core.c +++ b/net/llc/llc_core.c @@ -127,9 +127,7 @@ void llc_sap_close(struct llc_sap *sap) list_del_rcu(&sap->node); spin_unlock_bh(&llc_sap_list_lock); - synchronize_rcu(); - - kfree(sap); + kfree_rcu(sap, rcu); } static struct packet_type llc_packet_type __read_mostly = { diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index 76e30f4797fb..f869e35d0974 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -27,20 +27,6 @@ config MAC80211_RC_MINSTREL ---help--- This option enables the 'minstrel' TX rate control algorithm -config MAC80211_RC_MINSTREL_HT - bool "Minstrel 802.11n support" if EXPERT - depends on MAC80211_RC_MINSTREL - default y - ---help--- - This option enables the 'minstrel_ht' TX rate control algorithm - -config MAC80211_RC_MINSTREL_VHT - bool "Minstrel 802.11ac support" if EXPERT - depends on MAC80211_RC_MINSTREL_HT - default n - ---help--- - This option enables VHT in the 'minstrel_ht' TX rate control algorithm - choice prompt "Default rate control algorithm" depends on MAC80211_HAS_RC @@ -62,8 +48,7 @@ endchoice config MAC80211_RC_DEFAULT string - default "minstrel_ht" if MAC80211_RC_DEFAULT_MINSTREL && MAC80211_RC_MINSTREL_HT - default "minstrel" if MAC80211_RC_DEFAULT_MINSTREL + default "minstrel_ht" if MAC80211_RC_DEFAULT_MINSTREL default "" endif diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index bb707789ef2b..4f03ebe732fa 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -53,13 +53,14 @@ mac80211-$(CONFIG_PM) += pm.o CFLAGS_trace.o := -I$(src) -rc80211_minstrel-y := rc80211_minstrel.o -rc80211_minstrel-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_debugfs.o +rc80211_minstrel-y := \ + rc80211_minstrel.o \ + rc80211_minstrel_ht.o -rc80211_minstrel_ht-y := rc80211_minstrel_ht.o -rc80211_minstrel_ht-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_ht_debugfs.o +rc80211_minstrel-$(CONFIG_MAC80211_DEBUGFS) += \ + rc80211_minstrel_debugfs.o \ + rc80211_minstrel_ht_debugfs.o mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y) -mac80211-$(CONFIG_MAC80211_RC_MINSTREL_HT) += $(rc80211_minstrel_ht-y) ccflags-y += -DDEBUG diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index d25da0e66da1..51622333d460 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -158,12 +158,10 @@ static int ieee80211_change_iface(struct wiphy *wiphy, if (ret) return ret; - if (type == NL80211_IFTYPE_AP_VLAN && - params && params->use_4addr == 0) { + if (type == NL80211_IFTYPE_AP_VLAN && params->use_4addr == 0) { RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); ieee80211_check_fast_rx_iface(sdata); - } else if (type == NL80211_IFTYPE_STATION && - params && params->use_4addr >= 0) { + } else if (type == NL80211_IFTYPE_STATION && params->use_4addr >= 0) { sdata->u.mgd.use_4addr = params->use_4addr; } @@ -427,7 +425,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: /* Keys without a station are used for TX only */ - if (key->sta && test_sta_flag(key->sta, WLAN_STA_MFP)) + if (sta && test_sta_flag(sta, WLAN_STA_MFP)) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; break; case NL80211_IFTYPE_ADHOC: @@ -792,6 +790,48 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, return 0; } +static int ieee80211_set_ftm_responder_params( + struct ieee80211_sub_if_data *sdata, + const u8 *lci, size_t lci_len, + const u8 *civicloc, size_t civicloc_len) +{ + struct ieee80211_ftm_responder_params *new, *old; + struct ieee80211_bss_conf *bss_conf; + u8 *pos; + int len; + + if ((!lci || !lci_len) && (!civicloc || !civicloc_len)) + return 1; + + bss_conf = &sdata->vif.bss_conf; + old = bss_conf->ftmr_params; + len = lci_len + civicloc_len; + + new = kzalloc(sizeof(*new) + len, GFP_KERNEL); + if (!new) + return -ENOMEM; + + pos = (u8 *)(new + 1); + if (lci_len) { + new->lci_len = lci_len; + new->lci = pos; + memcpy(pos, lci, lci_len); + pos += lci_len; + } + + if (civicloc_len) { + new->civicloc_len = civicloc_len; + new->civicloc = pos; + memcpy(pos, civicloc, civicloc_len); + pos += civicloc_len; + } + + bss_conf->ftmr_params = new; + kfree(old); + + return 0; +} + static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_beacon_data *params, const struct ieee80211_csa_settings *csa) @@ -865,6 +905,20 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, if (err == 0) changed |= BSS_CHANGED_AP_PROBE_RESP; + if (params->ftm_responder != -1) { + sdata->vif.bss_conf.ftm_responder = params->ftm_responder; + err = ieee80211_set_ftm_responder_params(sdata, + params->lci, + params->lci_len, + params->civicloc, + params->civicloc_len); + + if (err < 0) + return err; + + changed |= BSS_CHANGED_FTM_RESPONDER; + } + rcu_assign_pointer(sdata->u.ap.beacon, new); if (old) @@ -911,6 +965,9 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.beacon_int = params->beacon_interval; + if (params->he_cap) + sdata->vif.bss_conf.he_support = true; + mutex_lock(&local->mtx); err = ieee80211_vif_use_channel(sdata, ¶ms->chandef, IEEE80211_CHANCTX_SHARED); @@ -1062,6 +1119,9 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) kfree_rcu(old_probe_resp, rcu_head); sdata->u.ap.driver_smps_mode = IEEE80211_SMPS_OFF; + kfree(sdata->vif.bss_conf.ftmr_params); + sdata->vif.bss_conf.ftmr_params = NULL; + __sta_info_flush(sdata, true); ieee80211_free_keys(sdata, true); @@ -1092,50 +1152,6 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) return 0; } -/* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */ -struct iapp_layer2_update { - u8 da[ETH_ALEN]; /* broadcast */ - u8 sa[ETH_ALEN]; /* STA addr */ - __be16 len; /* 6 */ - u8 dsap; /* 0 */ - u8 ssap; /* 0 */ - u8 control; - u8 xid_info[3]; -} __packed; - -static void ieee80211_send_layer2_update(struct sta_info *sta) -{ - struct iapp_layer2_update *msg; - struct sk_buff *skb; - - /* Send Level 2 Update Frame to update forwarding tables in layer 2 - * bridge devices */ - - skb = dev_alloc_skb(sizeof(*msg)); - if (!skb) - return; - msg = skb_put(skb, sizeof(*msg)); - - /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID) - * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */ - - eth_broadcast_addr(msg->da); - memcpy(msg->sa, sta->sta.addr, ETH_ALEN); - msg->len = htons(6); - msg->dsap = 0; - msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */ - msg->control = 0xaf; /* XID response lsb.1111F101. - * F=0 (no poll command; unsolicited frame) */ - msg->xid_info[0] = 0x81; /* XID format identifier */ - msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */ - msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */ - - skb->dev = sta->sdata->dev; - skb->protocol = eth_type_trans(skb, sta->sdata->dev); - memset(skb->cb, 0, sizeof(skb->cb)); - netif_rx_ni(skb); -} - static int sta_apply_auth_flags(struct ieee80211_local *local, struct sta_info *sta, u32 mask, u32 set) @@ -1499,7 +1515,7 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, } if (layer2_update) - ieee80211_send_layer2_update(sta); + cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr); rcu_read_unlock(); @@ -1601,7 +1617,7 @@ static int ieee80211_change_station(struct wiphy *wiphy, if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) ieee80211_vif_inc_num_mcast(sta->sdata); - ieee80211_send_layer2_update(sta); + cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr); } err = sta_apply_parameters(local, sta, params); @@ -2918,6 +2934,20 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) memcpy(pos, beacon->probe_resp, beacon->probe_resp_len); pos += beacon->probe_resp_len; } + if (beacon->ftm_responder) + new_beacon->ftm_responder = beacon->ftm_responder; + if (beacon->lci) { + new_beacon->lci_len = beacon->lci_len; + new_beacon->lci = pos; + memcpy(pos, beacon->lci, beacon->lci_len); + pos += beacon->lci_len; + } + if (beacon->civicloc) { + new_beacon->civicloc_len = beacon->civicloc_len; + new_beacon->civicloc = pos; + memcpy(pos, beacon->civicloc, beacon->civicloc_len); + pos += beacon->civicloc_len; + } return new_beacon; } @@ -3808,6 +3838,17 @@ out: return ret; } +static int +ieee80211_get_ftm_responder_stats(struct wiphy *wiphy, + struct net_device *dev, + struct cfg80211_ftm_responder_stats *ftm_stats) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + return drv_get_ftm_responder_stats(local, sdata, ftm_stats); +} + const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -3902,4 +3943,5 @@ const struct cfg80211_ops mac80211_config_ops = { .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast, .tx_control_port = ieee80211_tx_control_port, .get_txq_stats = ieee80211_get_txq_stats, + .get_ftm_responder_stats = ieee80211_get_ftm_responder_stats, }; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index b5adf3625d16..3fe541e358f3 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -3,6 +3,7 @@ * * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright (C) 2018 Intel Corporation * * GPLv2 * @@ -214,6 +215,9 @@ static const char *hw_flag_names[] = { FLAG(SUPPORTS_TDLS_BUFFER_STA), FLAG(DEAUTH_NEED_MGD_TX_PREP), FLAG(DOESNT_SUPPORT_QOS_NDP), + FLAG(BUFF_MMPDU_TXQ), + FLAG(SUPPORTS_VHT_EXT_NSS_BW), + FLAG(STA_MMPDU_TXQ), #undef FLAG }; diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 4105081dc1df..af5185a836e5 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -4,6 +4,7 @@ * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -140,7 +141,7 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, { struct sta_info *sta = file->private_data; struct ieee80211_local *local = sta->local; - size_t bufsz = AQM_TXQ_ENTRY_LEN*(IEEE80211_NUM_TIDS+1); + size_t bufsz = AQM_TXQ_ENTRY_LEN * (IEEE80211_NUM_TIDS + 2); char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; struct txq_info *txqi; ssize_t rv; @@ -162,7 +163,9 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, bufsz+buf-p, "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n"); - for (i = 0; i < IEEE80211_NUM_TIDS; i++) { + for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { + if (!sta->sta.txq[i]) + continue; txqi = to_txq_info(sta->sta.txq[i]); p += scnprintf(p, bufsz+buf-p, "%d %d %u %u %u %u %u %u %u %u %u 0x%lx(%s%s%s)\n", @@ -487,12 +490,368 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, p += scnprintf(p, sizeof(buf)+buf-p, "MCS TX highest: %d Mbps\n", le16_to_cpu(vhtc->vht_mcs.tx_highest)); +#undef PFLAG } return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); } STA_OPS(vht_capa); +static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + char *buf, *p; + size_t buf_sz = PAGE_SIZE; + struct sta_info *sta = file->private_data; + struct ieee80211_sta_he_cap *hec = &sta->sta.he_cap; + struct ieee80211_he_mcs_nss_supp *nss = &hec->he_mcs_nss_supp; + u8 ppe_size; + u8 *cap; + int i; + ssize_t ret; + + buf = kmalloc(buf_sz, GFP_KERNEL); + if (!buf) + return -ENOMEM; + p = buf; + + p += scnprintf(p, buf_sz + buf - p, "HE %ssupported\n", + hec->has_he ? "" : "not "); + if (!hec->has_he) + goto out; + + cap = hec->he_cap_elem.mac_cap_info; + p += scnprintf(p, buf_sz + buf - p, + "MAC-CAP: %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x\n", + cap[0], cap[1], cap[2], cap[3], cap[4], cap[5]); + +#define PRINT(fmt, ...) \ + p += scnprintf(p, buf_sz + buf - p, "\t\t" fmt "\n", \ + ##__VA_ARGS__) + +#define PFLAG(t, n, a, b) \ + do { \ + if (cap[n] & IEEE80211_HE_##t##_CAP##n##_##a) \ + PRINT("%s", b); \ + } while (0) + +#define PFLAG_RANGE(t, i, n, s, m, off, fmt) \ + do { \ + u8 msk = IEEE80211_HE_##t##_CAP##i##_##n##_MASK; \ + u8 idx = ((cap[i] & msk) >> (ffs(msk) - 1)) + off; \ + PRINT(fmt, (s << idx) + (m * idx)); \ + } while (0) + +#define PFLAG_RANGE_DEFAULT(t, i, n, s, m, off, fmt, a, b) \ + do { \ + if (cap[i] == IEEE80211_HE_##t ##_CAP##i##_##n##_##a) { \ + PRINT("%s", b); \ + break; \ + } \ + PFLAG_RANGE(t, i, n, s, m, off, fmt); \ + } while (0) + + PFLAG(MAC, 0, HTC_HE, "HTC-HE"); + PFLAG(MAC, 0, TWT_REQ, "TWT-REQ"); + PFLAG(MAC, 0, TWT_RES, "TWT-RES"); + PFLAG_RANGE_DEFAULT(MAC, 0, DYNAMIC_FRAG, 0, 1, 0, + "DYNAMIC-FRAG-LEVEL-%d", NOT_SUPP, "NOT-SUPP"); + PFLAG_RANGE_DEFAULT(MAC, 0, MAX_NUM_FRAG_MSDU, 1, 0, 0, + "MAX-NUM-FRAG-MSDU-%d", UNLIMITED, "UNLIMITED"); + + PFLAG_RANGE_DEFAULT(MAC, 1, MIN_FRAG_SIZE, 128, 0, -1, + "MIN-FRAG-SIZE-%d", UNLIMITED, "UNLIMITED"); + PFLAG_RANGE_DEFAULT(MAC, 1, TF_MAC_PAD_DUR, 0, 8, 0, + "TF-MAC-PAD-DUR-%dUS", MASK, "UNKNOWN"); + PFLAG_RANGE(MAC, 1, MULTI_TID_AGG_RX_QOS, 0, 1, 1, + "MULTI-TID-AGG-RX-QOS-%d"); + + if (cap[0] & IEEE80211_HE_MAC_CAP0_HTC_HE) { + switch (((cap[2] << 1) | (cap[1] >> 7)) & 0x3) { + case 0: + PRINT("LINK-ADAPTATION-NO-FEEDBACK"); + break; + case 1: + PRINT("LINK-ADAPTATION-RESERVED"); + break; + case 2: + PRINT("LINK-ADAPTATION-UNSOLICITED-FEEDBACK"); + break; + case 3: + PRINT("LINK-ADAPTATION-BOTH"); + break; + } + } + + PFLAG(MAC, 2, ALL_ACK, "ALL-ACK"); + PFLAG(MAC, 2, TRS, "TRS"); + PFLAG(MAC, 2, BSR, "BSR"); + PFLAG(MAC, 2, BCAST_TWT, "BCAST-TWT"); + PFLAG(MAC, 2, 32BIT_BA_BITMAP, "32BIT-BA-BITMAP"); + PFLAG(MAC, 2, MU_CASCADING, "MU-CASCADING"); + PFLAG(MAC, 2, ACK_EN, "ACK-EN"); + + PFLAG(MAC, 3, OMI_CONTROL, "OMI-CONTROL"); + PFLAG(MAC, 3, OFDMA_RA, "OFDMA-RA"); + + switch (cap[3] & IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK) { + case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_USE_VHT: + PRINT("MAX-AMPDU-LEN-EXP-USE-VHT"); + break; + case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_1: + PRINT("MAX-AMPDU-LEN-EXP-VHT-1"); + break; + case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2: + PRINT("MAX-AMPDU-LEN-EXP-VHT-2"); + break; + case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_RESERVED: + PRINT("MAX-AMPDU-LEN-EXP-RESERVED"); + break; + } + + PFLAG(MAC, 3, AMSDU_FRAG, "AMSDU-FRAG"); + PFLAG(MAC, 3, FLEX_TWT_SCHED, "FLEX-TWT-SCHED"); + PFLAG(MAC, 3, RX_CTRL_FRAME_TO_MULTIBSS, "RX-CTRL-FRAME-TO-MULTIBSS"); + + PFLAG(MAC, 4, BSRP_BQRP_A_MPDU_AGG, "BSRP-BQRP-A-MPDU-AGG"); + PFLAG(MAC, 4, QTP, "QTP"); + PFLAG(MAC, 4, BQR, "BQR"); + PFLAG(MAC, 4, SRP_RESP, "SRP-RESP"); + PFLAG(MAC, 4, NDP_FB_REP, "NDP-FB-REP"); + PFLAG(MAC, 4, OPS, "OPS"); + PFLAG(MAC, 4, AMDSU_IN_AMPDU, "AMSDU-IN-AMPDU"); + + PRINT("MULTI-TID-AGG-TX-QOS-%d", ((cap[5] << 1) | (cap[4] >> 7)) & 0x7); + + PFLAG(MAC, 5, SUBCHAN_SELECVITE_TRANSMISSION, + "SUBCHAN-SELECVITE-TRANSMISSION"); + PFLAG(MAC, 5, UL_2x996_TONE_RU, "UL-2x996-TONE-RU"); + PFLAG(MAC, 5, OM_CTRL_UL_MU_DATA_DIS_RX, "OM-CTRL-UL-MU-DATA-DIS-RX"); + + cap = hec->he_cap_elem.phy_cap_info; + p += scnprintf(p, buf_sz + buf - p, + "PHY CAP: %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x\n", + cap[0], cap[1], cap[2], cap[3], cap[4], cap[5], cap[6], + cap[7], cap[8], cap[9], cap[10]); + + PFLAG(PHY, 0, CHANNEL_WIDTH_SET_40MHZ_IN_2G, + "CHANNEL-WIDTH-SET-40MHZ-IN-2G"); + PFLAG(PHY, 0, CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G, + "CHANNEL-WIDTH-SET-40MHZ-80MHZ-IN-5G"); + PFLAG(PHY, 0, CHANNEL_WIDTH_SET_160MHZ_IN_5G, + "CHANNEL-WIDTH-SET-160MHZ-IN-5G"); + PFLAG(PHY, 0, CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, + "CHANNEL-WIDTH-SET-80PLUS80-MHZ-IN-5G"); + PFLAG(PHY, 0, CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G, + "CHANNEL-WIDTH-SET-RU-MAPPING-IN-2G"); + PFLAG(PHY, 0, CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G, + "CHANNEL-WIDTH-SET-RU-MAPPING-IN-5G"); + + switch (cap[1] & IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK) { + case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ: + PRINT("PREAMBLE-PUNC-RX-80MHZ-ONLY-SECOND-20MHZ"); + break; + case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ: + PRINT("PREAMBLE-PUNC-RX-80MHZ-ONLY-SECOND-40MHZ"); + break; + case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ: + PRINT("PREAMBLE-PUNC-RX-160MHZ-ONLY-SECOND-20MHZ"); + break; + case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ: + PRINT("PREAMBLE-PUNC-RX-160MHZ-ONLY-SECOND-40MHZ"); + break; + } + + PFLAG(PHY, 1, DEVICE_CLASS_A, + "IEEE80211-HE-PHY-CAP1-DEVICE-CLASS-A"); + PFLAG(PHY, 1, LDPC_CODING_IN_PAYLOAD, + "LDPC-CODING-IN-PAYLOAD"); + PFLAG(PHY, 1, HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US, + "HY-CAP1-HE-LTF-AND-GI-FOR-HE-PPDUS-0-8US"); + PRINT("MIDAMBLE-RX-MAX-NSTS-%d", ((cap[2] << 1) | (cap[1] >> 7)) & 0x3); + + PFLAG(PHY, 2, NDP_4x_LTF_AND_3_2US, "NDP-4X-LTF-AND-3-2US"); + PFLAG(PHY, 2, STBC_TX_UNDER_80MHZ, "STBC-TX-UNDER-80MHZ"); + PFLAG(PHY, 2, STBC_RX_UNDER_80MHZ, "STBC-RX-UNDER-80MHZ"); + PFLAG(PHY, 2, DOPPLER_TX, "DOPPLER-TX"); + PFLAG(PHY, 2, DOPPLER_RX, "DOPPLER-RX"); + PFLAG(PHY, 2, UL_MU_FULL_MU_MIMO, "UL-MU-FULL-MU-MIMO"); + PFLAG(PHY, 2, UL_MU_PARTIAL_MU_MIMO, "UL-MU-PARTIAL-MU-MIMO"); + + switch (cap[3] & IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK) { + case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM: + PRINT("DCM-MAX-CONST-TX-NO-DCM"); + break; + case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK: + PRINT("DCM-MAX-CONST-TX-BPSK"); + break; + case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK: + PRINT("DCM-MAX-CONST-TX-QPSK"); + break; + case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM: + PRINT("DCM-MAX-CONST-TX-16-QAM"); + break; + } + + PFLAG(PHY, 3, DCM_MAX_TX_NSS_1, "DCM-MAX-TX-NSS-1"); + PFLAG(PHY, 3, DCM_MAX_TX_NSS_2, "DCM-MAX-TX-NSS-2"); + + switch (cap[3] & IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK) { + case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM: + PRINT("DCM-MAX-CONST-RX-NO-DCM"); + break; + case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK: + PRINT("DCM-MAX-CONST-RX-BPSK"); + break; + case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK: + PRINT("DCM-MAX-CONST-RX-QPSK"); + break; + case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM: + PRINT("DCM-MAX-CONST-RX-16-QAM"); + break; + } + + PFLAG(PHY, 3, DCM_MAX_RX_NSS_1, "DCM-MAX-RX-NSS-1"); + PFLAG(PHY, 3, DCM_MAX_RX_NSS_2, "DCM-MAX-RX-NSS-2"); + PFLAG(PHY, 3, RX_HE_MU_PPDU_FROM_NON_AP_STA, + "RX-HE-MU-PPDU-FROM-NON-AP-STA"); + PFLAG(PHY, 3, SU_BEAMFORMER, "SU-BEAMFORMER"); + + PFLAG(PHY, 4, SU_BEAMFORMEE, "SU-BEAMFORMEE"); + PFLAG(PHY, 4, MU_BEAMFORMER, "MU-BEAMFORMER"); + + PFLAG_RANGE(PHY, 4, BEAMFORMEE_MAX_STS_UNDER_80MHZ, 0, 1, 4, + "BEAMFORMEE-MAX-STS-UNDER-%d"); + PFLAG_RANGE(PHY, 4, BEAMFORMEE_MAX_STS_ABOVE_80MHZ, 0, 1, 4, + "BEAMFORMEE-MAX-STS-ABOVE-%d"); + + PFLAG_RANGE(PHY, 5, BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ, 0, 1, 1, + "NUM-SND-DIM-UNDER-80MHZ-%d"); + PFLAG_RANGE(PHY, 5, BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ, 0, 1, 1, + "NUM-SND-DIM-ABOVE-80MHZ-%d"); + PFLAG(PHY, 5, NG16_SU_FEEDBACK, "NG16-SU-FEEDBACK"); + PFLAG(PHY, 5, NG16_MU_FEEDBACK, "NG16-MU-FEEDBACK"); + + PFLAG(PHY, 6, CODEBOOK_SIZE_42_SU, "CODEBOOK-SIZE-42-SU"); + PFLAG(PHY, 6, CODEBOOK_SIZE_75_MU, "CODEBOOK-SIZE-75-MU"); + PFLAG(PHY, 6, TRIG_SU_BEAMFORMER_FB, "TRIG-SU-BEAMFORMER-FB"); + PFLAG(PHY, 6, TRIG_MU_BEAMFORMER_FB, "TRIG-MU-BEAMFORMER-FB"); + PFLAG(PHY, 6, TRIG_CQI_FB, "TRIG-CQI-FB"); + PFLAG(PHY, 6, PARTIAL_BW_EXT_RANGE, "PARTIAL-BW-EXT-RANGE"); + PFLAG(PHY, 6, PARTIAL_BANDWIDTH_DL_MUMIMO, + "PARTIAL-BANDWIDTH-DL-MUMIMO"); + PFLAG(PHY, 6, PPE_THRESHOLD_PRESENT, "PPE-THRESHOLD-PRESENT"); + + PFLAG(PHY, 7, SRP_BASED_SR, "SRP-BASED-SR"); + PFLAG(PHY, 7, POWER_BOOST_FACTOR_AR, "POWER-BOOST-FACTOR-AR"); + PFLAG(PHY, 7, HE_SU_MU_PPDU_4XLTF_AND_08_US_GI, + "HE-SU-MU-PPDU-4XLTF-AND-08-US-GI"); + PFLAG_RANGE(PHY, 7, MAX_NC, 0, 1, 1, "MAX-NC-%d"); + PFLAG(PHY, 7, STBC_TX_ABOVE_80MHZ, "STBC-TX-ABOVE-80MHZ"); + PFLAG(PHY, 7, STBC_RX_ABOVE_80MHZ, "STBC-RX-ABOVE-80MHZ"); + + PFLAG(PHY, 8, HE_ER_SU_PPDU_4XLTF_AND_08_US_GI, + "HE-ER-SU-PPDU-4XLTF-AND-08-US-GI"); + PFLAG(PHY, 8, 20MHZ_IN_40MHZ_HE_PPDU_IN_2G, + "20MHZ-IN-40MHZ-HE-PPDU-IN-2G"); + PFLAG(PHY, 8, 20MHZ_IN_160MHZ_HE_PPDU, "20MHZ-IN-160MHZ-HE-PPDU"); + PFLAG(PHY, 8, 80MHZ_IN_160MHZ_HE_PPDU, "80MHZ-IN-160MHZ-HE-PPDU"); + PFLAG(PHY, 8, HE_ER_SU_1XLTF_AND_08_US_GI, + "HE-ER-SU-1XLTF-AND-08-US-GI"); + PFLAG(PHY, 8, MIDAMBLE_RX_TX_2X_AND_1XLTF, + "MIDAMBLE-RX-TX-2X-AND-1XLTF"); + + switch (cap[8] & IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_MASK) { + case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_20MHZ: + PRINT("DDCM-MAX-BW-20MHZ"); + break; + case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_40MHZ: + PRINT("DCM-MAX-BW-40MHZ"); + break; + case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_80MHZ: + PRINT("DCM-MAX-BW-80MHZ"); + break; + case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_160_OR_80P80_MHZ: + PRINT("DCM-MAX-BW-160-OR-80P80-MHZ"); + break; + } + + PFLAG(PHY, 9, LONGER_THAN_16_SIGB_OFDM_SYM, + "LONGER-THAN-16-SIGB-OFDM-SYM"); + PFLAG(PHY, 9, NON_TRIGGERED_CQI_FEEDBACK, + "NON-TRIGGERED-CQI-FEEDBACK"); + PFLAG(PHY, 9, TX_1024_QAM_LESS_THAN_242_TONE_RU, + "TX-1024-QAM-LESS-THAN-242-TONE-RU"); + PFLAG(PHY, 9, RX_1024_QAM_LESS_THAN_242_TONE_RU, + "RX-1024-QAM-LESS-THAN-242-TONE-RU"); + PFLAG(PHY, 9, RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB, + "RX-FULL-BW-SU-USING-MU-WITH-COMP-SIGB"); + PFLAG(PHY, 9, RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB, + "RX-FULL-BW-SU-USING-MU-WITH-NON-COMP-SIGB"); + +#undef PFLAG_RANGE_DEFAULT +#undef PFLAG_RANGE +#undef PFLAG + +#define PRINT_NSS_SUPP(f, n) \ + do { \ + int i; \ + u16 v = le16_to_cpu(nss->f); \ + p += scnprintf(p, buf_sz + buf - p, n ": %#.4x\n", v); \ + for (i = 0; i < 8; i += 2) { \ + switch ((v >> i) & 0x3) { \ + case 0: \ + PRINT(n "-%d-SUPPORT-0-7", i / 2); \ + break; \ + case 1: \ + PRINT(n "-%d-SUPPORT-0-9", i / 2); \ + break; \ + case 2: \ + PRINT(n "-%d-SUPPORT-0-11", i / 2); \ + break; \ + case 3: \ + PRINT(n "-%d-NOT-SUPPORTED", i / 2); \ + break; \ + } \ + } \ + } while (0) + + PRINT_NSS_SUPP(rx_mcs_80, "RX-MCS-80"); + PRINT_NSS_SUPP(tx_mcs_80, "TX-MCS-80"); + + if (cap[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) { + PRINT_NSS_SUPP(rx_mcs_160, "RX-MCS-160"); + PRINT_NSS_SUPP(tx_mcs_160, "TX-MCS-160"); + } + + if (cap[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) { + PRINT_NSS_SUPP(rx_mcs_80p80, "RX-MCS-80P80"); + PRINT_NSS_SUPP(tx_mcs_80p80, "TX-MCS-80P80"); + } + +#undef PRINT_NSS_SUPP +#undef PRINT + + if (!(cap[6] & IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT)) + goto out; + + p += scnprintf(p, buf_sz + buf - p, "PPE-THRESHOLDS: %#.2x", + hec->ppe_thres[0]); + + ppe_size = ieee80211_he_ppe_size(hec->ppe_thres[0], cap); + for (i = 1; i < ppe_size; i++) { + p += scnprintf(p, buf_sz + buf - p, " %#.2x", + hec->ppe_thres[i]); + } + p += scnprintf(p, buf_sz + buf - p, "\n"); + +out: + ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); + kfree(buf); + return ret; +} +STA_OPS(he_capa); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, \ @@ -538,6 +897,7 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) DEBUGFS_ADD(agg_status); DEBUGFS_ADD(ht_capa); DEBUGFS_ADD(vht_capa); + DEBUGFS_ADD(he_capa); DEBUGFS_ADD_COUNTER(rx_duplicates, rx_stats.num_duplicates); DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 8f6998091d26..0b1747a2313d 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1173,6 +1173,32 @@ static inline void drv_wake_tx_queue(struct ieee80211_local *local, local->ops->wake_tx_queue(&local->hw, &txq->txq); } +static inline int drv_can_aggregate_in_amsdu(struct ieee80211_local *local, + struct sk_buff *head, + struct sk_buff *skb) +{ + if (!local->ops->can_aggregate_in_amsdu) + return true; + + return local->ops->can_aggregate_in_amsdu(&local->hw, head, skb); +} + +static inline int +drv_get_ftm_responder_stats(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct cfg80211_ftm_responder_stats *ftm_stats) +{ + u32 ret = -EOPNOTSUPP; + + if (local->ops->get_ftm_responder_stats) + ret = local->ops->get_ftm_responder_stats(&local->hw, + &sdata->vif, + ftm_stats); + trace_drv_get_ftm_responder_stats(local, sdata, ftm_stats); + + return ret; +} + static inline int drv_start_nan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_nan_conf *conf) diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index f0f5fedb8caa..0d704e8d7078 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1070,7 +1070,9 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_vht_cap cap_ie; struct ieee80211_sta_vht_cap cap = sta->sta.vht_cap; - ieee80211_chandef_vht_oper(elems->vht_operation, + ieee80211_chandef_vht_oper(&local->hw, + elems->vht_operation, + elems->ht_operation, &chandef); memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie)); ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 172aeae21ae9..10a05062e4a0 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -377,6 +377,7 @@ struct ieee80211_mgd_auth_data { u8 key[WLAN_KEY_LEN_WEP104]; u8 key_len, key_idx; bool done; + bool peer_confirmed; bool timeout_started; u16 sae_trans, sae_status; @@ -818,6 +819,7 @@ enum txq_info_flags { IEEE80211_TXQ_STOP, IEEE80211_TXQ_AMPDU, IEEE80211_TXQ_NO_AMSDU, + IEEE80211_TXQ_STOP_NETIF_TX, }; /** @@ -1198,6 +1200,9 @@ struct ieee80211_local { /* number of RX chains the hardware has */ u8 rx_chains; + /* bitmap of which sbands were copied */ + u8 sband_allocated; + int tx_headroom; /* required headroom for hardware/radiotap */ /* Tasklet and skb queue to process calls from IRQ mode. All frames @@ -1226,6 +1231,7 @@ struct ieee80211_local { struct sk_buff_head pending[IEEE80211_MAX_QUEUES]; struct tasklet_struct tx_pending_tasklet; + struct tasklet_struct wake_txqs_tasklet; atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES]; @@ -2038,6 +2044,7 @@ void ieee80211_txq_remove_vlan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats, struct txq_info *txqi); +void ieee80211_wake_txqs(unsigned long data); void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *bssid, @@ -2106,7 +2113,9 @@ u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); /* channel management */ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef); -bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper, +bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, + const struct ieee80211_vht_operation *oper, + const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef); u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 5e6cf2cee965..5836ddeac9e3 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1756,7 +1756,8 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, if (local->ops->wake_tx_queue && type != NL80211_IFTYPE_AP_VLAN && - type != NL80211_IFTYPE_MONITOR) + (type != NL80211_IFTYPE_MONITOR || + (params->flags & MONITOR_FLAG_ACTIVE))) txq_size += sizeof(struct txq_info) + local->hw.txq_data_size; diff --git a/net/mac80211/key.c b/net/mac80211/key.c index c054ac85793c..4700718e010f 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -248,6 +248,7 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) increment_tailroom_need_count(sdata); + key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; ret = drv_set_key(key->local, DISABLE_KEY, sdata, sta ? &sta->sta : NULL, &key->conf); @@ -256,8 +257,65 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) "failed to remove key (%d, %pM) from hardware (%d)\n", key->conf.keyidx, sta ? sta->sta.addr : bcast_addr, ret); +} - key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; +static int ieee80211_hw_key_replace(struct ieee80211_key *old_key, + struct ieee80211_key *new_key, + bool ptk0rekey) +{ + struct ieee80211_sub_if_data *sdata; + struct ieee80211_local *local; + struct sta_info *sta; + int ret; + + /* Aggregation sessions are OK when running on SW crypto. + * A broken remote STA may cause issues not observed with HW + * crypto, though. + */ + if (!(old_key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) + return 0; + + assert_key_lock(old_key->local); + sta = old_key->sta; + + /* PTK only using key ID 0 needs special handling on rekey */ + if (new_key && sta && ptk0rekey) { + local = old_key->local; + sdata = old_key->sdata; + + /* Stop TX till we are on the new key */ + old_key->flags |= KEY_FLAG_TAINTED; + ieee80211_clear_fast_xmit(sta); + + /* Aggregation sessions during rekey are complicated due to the + * reorder buffer and retransmits. Side step that by blocking + * aggregation during rekey and tear down running sessions. + */ + if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) { + set_sta_flag(sta, WLAN_STA_BLOCK_BA); + ieee80211_sta_tear_down_BA_sessions(sta, + AGG_STOP_LOCAL_REQUEST); + } + + if (!wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_CAN_REPLACE_PTK0)) { + pr_warn_ratelimited("Rekeying PTK for STA %pM but driver can't safely do that.", + sta->sta.addr); + /* Flushing the driver queues *may* help prevent + * the clear text leaks and freezes. + */ + ieee80211_flush_queues(local, sdata, false); + } + } + + ieee80211_key_disable_hw_accel(old_key); + + if (new_key) + ret = ieee80211_key_enable_hw_accel(new_key); + else + ret = 0; + + return ret; } static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, @@ -316,38 +374,57 @@ void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, } -static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, +static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, bool pairwise, struct ieee80211_key *old, struct ieee80211_key *new) { int idx; + int ret; bool defunikey, defmultikey, defmgmtkey; /* caller must provide at least one old/new */ if (WARN_ON(!new && !old)) - return; + return 0; if (new) list_add_tail_rcu(&new->list, &sdata->key_list); WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx); - if (old) + if (old) { idx = old->conf.keyidx; - else + /* TODO: proper implement and test "Extended Key ID for + * Individually Addressed Frames" from IEEE 802.11-2016. + * Till then always assume only key ID 0 is used for + * pairwise keys.*/ + ret = ieee80211_hw_key_replace(old, new, pairwise); + } else { + /* new must be provided in case old is not */ idx = new->conf.keyidx; + if (!new->local->wowlan) + ret = ieee80211_key_enable_hw_accel(new); + else + ret = 0; + } + + if (ret) + return ret; if (sta) { if (pairwise) { rcu_assign_pointer(sta->ptk[idx], new); sta->ptk_idx = idx; - ieee80211_check_fast_xmit(sta); + if (new) { + clear_sta_flag(sta, WLAN_STA_BLOCK_BA); + ieee80211_check_fast_xmit(sta); + } } else { rcu_assign_pointer(sta->gtk[idx], new); } - ieee80211_check_fast_rx(sta); + if (new) + ieee80211_check_fast_rx(sta); } else { defunikey = old && old == key_mtx_dereference(sdata->local, @@ -380,6 +457,8 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, if (old) list_del_rcu(&old->list); + + return 0; } struct ieee80211_key * @@ -575,9 +654,6 @@ static void ieee80211_key_free_common(struct ieee80211_key *key) static void __ieee80211_key_destroy(struct ieee80211_key *key, bool delay_tailroom) { - if (key->local) - ieee80211_key_disable_hw_accel(key); - if (key->local) { struct ieee80211_sub_if_data *sdata = key->sdata; @@ -654,7 +730,6 @@ int ieee80211_key_link(struct ieee80211_key *key, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { - struct ieee80211_local *local = sdata->local; struct ieee80211_key *old_key; int idx = key->conf.keyidx; bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; @@ -691,17 +766,13 @@ int ieee80211_key_link(struct ieee80211_key *key, increment_tailroom_need_count(sdata); - ieee80211_key_replace(sdata, sta, pairwise, old_key, key); - ieee80211_key_destroy(old_key, delay_tailroom); - - ieee80211_debugfs_key_add(key); + ret = ieee80211_key_replace(sdata, sta, pairwise, old_key, key); - if (!local->wowlan) { - ret = ieee80211_key_enable_hw_accel(key); - if (ret) - ieee80211_key_free(key, delay_tailroom); + if (!ret) { + ieee80211_debugfs_key_add(key); + ieee80211_key_destroy(old_key, delay_tailroom); } else { - ret = 0; + ieee80211_key_free(key, delay_tailroom); } out: diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 513627896204..83e71e6b2ebe 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -4,6 +4,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -610,6 +611,18 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, local->ops = ops; local->use_chanctx = use_chanctx; + /* + * We need a bit of data queued to build aggregates properly, so + * instruct the TCP stack to allow more than a single ms of data + * to be queued in the stack. The value is a bit-shift of 1 + * second, so 8 is ~4ms of queued data. Only affects local TCP + * sockets. + * This is the default, anyhow - drivers may need to override it + * for local reasons (longer buffers, longer completion time, or + * similar). + */ + local->hw.tx_sk_pacing_shift = 8; + /* set up some defaults */ local->hw.queues = 1; local->hw.max_rates = 1; @@ -684,6 +697,10 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending, (unsigned long)local); + if (ops->wake_tx_queue) + tasklet_init(&local->wake_txqs_tasklet, ieee80211_wake_txqs, + (unsigned long)local); + tasklet_init(&local->tasklet, ieee80211_tasklet_handler, (unsigned long) local); @@ -1154,6 +1171,53 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) goto fail_rate; } + if (local->rate_ctrl) { + clear_bit(IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW, hw->flags); + if (local->rate_ctrl->ops->capa & RATE_CTRL_CAPA_VHT_EXT_NSS_BW) + ieee80211_hw_set(hw, SUPPORTS_VHT_EXT_NSS_BW); + } + + /* + * If the VHT capabilities don't have IEEE80211_VHT_EXT_NSS_BW_CAPABLE, + * or have it when we don't, copy the sband structure and set/clear it. + * This is necessary because rate scaling algorithms could be switched + * and have different support values. + * Print a message so that in the common case the reallocation can be + * avoided. + */ + BUILD_BUG_ON(NUM_NL80211_BANDS > 8 * sizeof(local->sband_allocated)); + for (band = 0; band < NUM_NL80211_BANDS; band++) { + struct ieee80211_supported_band *sband; + bool local_cap, ie_cap; + + local_cap = ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW); + + sband = local->hw.wiphy->bands[band]; + if (!sband || !sband->vht_cap.vht_supported) + continue; + + ie_cap = !!(sband->vht_cap.vht_mcs.tx_highest & + cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE)); + + if (local_cap == ie_cap) + continue; + + sband = kmemdup(sband, sizeof(*sband), GFP_KERNEL); + if (!sband) { + result = -ENOMEM; + goto fail_rate; + } + + wiphy_dbg(hw->wiphy, "copying sband (band %d) due to VHT EXT NSS BW flag\n", + band); + + sband->vht_cap.vht_mcs.tx_highest ^= + cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE); + + local->hw.wiphy->bands[band] = sband; + local->sband_allocated |= BIT(band); + } + /* add one default STA interface if supported */ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION) && !ieee80211_hw_check(hw, NO_AUTO_VIF)) { @@ -1272,6 +1336,7 @@ static int ieee80211_free_ack_frame(int id, void *p, void *data) void ieee80211_free_hw(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); + enum nl80211_band band; mutex_destroy(&local->iflist_mtx); mutex_destroy(&local->mtx); @@ -1287,6 +1352,12 @@ void ieee80211_free_hw(struct ieee80211_hw *hw) ieee80211_free_led_names(local); + for (band = 0; band < NUM_NL80211_BANDS; band++) { + if (!(local->sband_allocated & BIT(band))) + continue; + kfree(local->hw.wiphy->bands[band]); + } + wiphy_free(local->hw.wiphy); } EXPORT_SYMBOL(ieee80211_free_hw); @@ -1304,18 +1375,12 @@ static int __init ieee80211_init(void) if (ret) return ret; - ret = rc80211_minstrel_ht_init(); - if (ret) - goto err_minstrel; - ret = ieee80211_iface_init(); if (ret) goto err_netdev; return 0; err_netdev: - rc80211_minstrel_ht_exit(); - err_minstrel: rc80211_minstrel_exit(); return ret; @@ -1323,7 +1388,6 @@ static int __init ieee80211_init(void) static void __exit ieee80211_exit(void) { - rc80211_minstrel_ht_exit(); rc80211_minstrel_exit(); ieee80211s_stop(); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index d51da26e9c18..8bad414c52ad 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2008, 2009 open80211s Ltd. + * Copyright (C) 2018 Intel Corporation * Authors: Luis Carlos Cobo * Javier Cardona * @@ -98,7 +99,9 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chandef.chan, NL80211_CHAN_NO_HT); ieee80211_chandef_ht_oper(ie->ht_operation, &sta_chan_def); - ieee80211_chandef_vht_oper(ie->vht_operation, &sta_chan_def); + ieee80211_chandef_vht_oper(&sdata->local->hw, + ie->vht_operation, ie->ht_operation, + &sta_chan_def); if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef, &sta_chan_def)) diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index ee56f18cad3f..21526630bf65 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -217,7 +217,8 @@ void mesh_rmc_free(struct ieee80211_sub_if_data *sdata); int mesh_rmc_init(struct ieee80211_sub_if_data *sdata); void ieee80211s_init(void); void ieee80211s_update_metric(struct ieee80211_local *local, - struct sta_info *sta, struct sk_buff *skb); + struct sta_info *sta, + struct ieee80211_tx_status *st); void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata); void ieee80211_mesh_teardown_sdata(struct ieee80211_sub_if_data *sdata); int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index daf9db3c8f24..6950cd0bf594 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -295,15 +295,12 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, } void ieee80211s_update_metric(struct ieee80211_local *local, - struct sta_info *sta, struct sk_buff *skb) + struct sta_info *sta, + struct ieee80211_tx_status *st) { - struct ieee80211_tx_info *txinfo = IEEE80211_SKB_CB(skb); - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_tx_info *txinfo = st->info; int failed; - if (!ieee80211_is_data(hdr->frame_control)) - return; - failed = !(txinfo->flags & IEEE80211_TX_STAT_ACK); /* moving average, scaled to 100. diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 3dbecae4be73..d2bc8d57c87e 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -220,7 +220,8 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, memcpy(&he_oper_vht_cap, he_oper->optional, 3); he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0); - if (!ieee80211_chandef_vht_oper(&he_oper_vht_cap, + if (!ieee80211_chandef_vht_oper(&sdata->local->hw, + &he_oper_vht_cap, ht_oper, &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) sdata_info(sdata, @@ -228,7 +229,8 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, ret = IEEE80211_STA_DISABLE_HE; goto out; } - } else if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) { + } else if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_oper, + ht_oper, &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) sdata_info(sdata, "AP VHT information is invalid, disable VHT\n"); @@ -2759,13 +2761,40 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, auth_data->key_idx, tx_flags); } +static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata, + const u8 *bssid) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct sta_info *sta; + + sdata_info(sdata, "authenticated\n"); + ifmgd->auth_data->done = true; + ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_WAIT_ASSOC; + ifmgd->auth_data->timeout_started = true; + run_again(sdata, ifmgd->auth_data->timeout); + + /* move station state to auth */ + mutex_lock(&sdata->local->sta_mtx); + sta = sta_info_get(sdata, bssid); + if (!sta) { + WARN_ONCE(1, "%s: STA %pM not found", sdata->name, bssid); + return false; + } + if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) { + sdata_info(sdata, "failed moving %pM to auth\n", bssid); + return false; + } + mutex_unlock(&sdata->local->sta_mtx); + + return true; +} + static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 bssid[ETH_ALEN]; u16 auth_alg, auth_transaction, status_code; - struct sta_info *sta; struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = AUTH_EVENT, @@ -2789,7 +2818,11 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, status_code = le16_to_cpu(mgmt->u.auth.status_code); if (auth_alg != ifmgd->auth_data->algorithm || - auth_transaction != ifmgd->auth_data->expected_transaction) { + (auth_alg != WLAN_AUTH_SAE && + auth_transaction != ifmgd->auth_data->expected_transaction) || + (auth_alg == WLAN_AUTH_SAE && + (auth_transaction < ifmgd->auth_data->expected_transaction || + auth_transaction > 2))) { sdata_info(sdata, "%pM unexpected authentication state: alg %d (expected %d) transact %d (expected %d)\n", mgmt->sa, auth_alg, ifmgd->auth_data->algorithm, auth_transaction, @@ -2832,35 +2865,17 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, event.u.mlme.status = MLME_SUCCESS; drv_event_callback(sdata->local, sdata, &event); - sdata_info(sdata, "authenticated\n"); - ifmgd->auth_data->done = true; - ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_WAIT_ASSOC; - ifmgd->auth_data->timeout_started = true; - run_again(sdata, ifmgd->auth_data->timeout); - - if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && - ifmgd->auth_data->expected_transaction != 2) { - /* - * Report auth frame to user space for processing since another - * round of Authentication frames is still needed. - */ - cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); - return; + if (ifmgd->auth_data->algorithm != WLAN_AUTH_SAE || + (auth_transaction == 2 && + ifmgd->auth_data->expected_transaction == 2)) { + if (!ieee80211_mark_sta_auth(sdata, bssid)) + goto out_err; + } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && + auth_transaction == 2) { + sdata_info(sdata, "SAE peer confirmed\n"); + ifmgd->auth_data->peer_confirmed = true; } - /* move station state to auth */ - mutex_lock(&sdata->local->sta_mtx); - sta = sta_info_get(sdata, bssid); - if (!sta) { - WARN_ONCE(1, "%s: STA %pM not found", sdata->name, bssid); - goto out_err; - } - if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) { - sdata_info(sdata, "failed moving %pM to auth\n", bssid); - goto out_err; - } - mutex_unlock(&sdata->local->sta_mtx); - cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); return; out_err: @@ -3237,19 +3252,16 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } if (bss_conf->he_support) { - u32 he_oper_params = - le32_to_cpu(elems.he_operation->he_oper_params); + bss_conf->bss_color = + le32_get_bits(elems.he_operation->he_oper_params, + IEEE80211_HE_OPERATION_BSS_COLOR_MASK); - bss_conf->bss_color = he_oper_params & - IEEE80211_HE_OPERATION_BSS_COLOR_MASK; bss_conf->htc_trig_based_pkt_ext = - (he_oper_params & - IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK) << - IEEE80211_HE_OPERATION_DFLT_PE_DURATION_OFFSET; + le32_get_bits(elems.he_operation->he_oper_params, + IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK); bss_conf->frame_time_rts_th = - (he_oper_params & - IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK) << - IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET; + le32_get_bits(elems.he_operation->he_oper_params, + IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); bss_conf->multi_sta_back_32bit = sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & @@ -4879,6 +4891,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_auth_data *auth_data; u16 auth_alg; int err; + bool cont_auth; /* prepare auth data structure */ @@ -4913,6 +4926,9 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, return -EOPNOTSUPP; } + if (ifmgd->assoc_data) + return -EBUSY; + auth_data = kzalloc(sizeof(*auth_data) + req->auth_data_len + req->ie_len, GFP_KERNEL); if (!auth_data) @@ -4932,6 +4948,13 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, auth_data->data_len += req->auth_data_len - 4; } + /* Check if continuing authentication or trying to authenticate with the + * same BSS that we were in the process of authenticating with and avoid + * removal and re-addition of the STA entry in + * ieee80211_prep_connection(). + */ + cont_auth = ifmgd->auth_data && req->bss == ifmgd->auth_data->bss; + if (req->ie && req->ie_len) { memcpy(&auth_data->data[auth_data->data_len], req->ie, req->ie_len); @@ -4948,18 +4971,26 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, /* try to authenticate/probe */ - if ((ifmgd->auth_data && !ifmgd->auth_data->done) || - ifmgd->assoc_data) { - err = -EBUSY; - goto err_free; + if (ifmgd->auth_data) { + if (cont_auth && req->auth_type == NL80211_AUTHTYPE_SAE) { + auth_data->peer_confirmed = + ifmgd->auth_data->peer_confirmed; + } + ieee80211_destroy_auth_data(sdata, cont_auth); } - if (ifmgd->auth_data) - ieee80211_destroy_auth_data(sdata, false); - /* prep auth_data so we don't go into idle on disassoc */ ifmgd->auth_data = auth_data; + /* If this is continuation of an ongoing SAE authentication exchange + * (i.e., request to send SAE Confirm) and the peer has already + * confirmed, mark authentication completed since we are about to send + * out SAE Confirm. + */ + if (cont_auth && req->auth_type == NL80211_AUTHTYPE_SAE && + auth_data->peer_confirmed && auth_data->sae_trans == 2) + ieee80211_mark_sta_auth(sdata, req->bss->bssid); + if (ifmgd->associated) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; @@ -4977,7 +5008,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, sdata_info(sdata, "authenticate with %pM\n", req->bss->bssid); - err = ieee80211_prep_connection(sdata, req->bss, false, false); + err = ieee80211_prep_connection(sdata, req->bss, cont_auth, false); if (err) goto err_clear; @@ -4998,7 +5029,6 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, mutex_lock(&sdata->local->mtx); ieee80211_vif_release_channel(sdata); mutex_unlock(&sdata->local->mtx); - err_free: kfree(auth_data); return err; } diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 8212bfeb71d6..d59198191a79 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -95,18 +95,5 @@ static inline void rc80211_minstrel_exit(void) } #endif -#ifdef CONFIG_MAC80211_RC_MINSTREL_HT -int rc80211_minstrel_ht_init(void); -void rc80211_minstrel_ht_exit(void); -#else -static inline int rc80211_minstrel_ht_init(void) -{ - return 0; -} -static inline void rc80211_minstrel_ht_exit(void) -{ -} -#endif - #endif /* IEEE80211_RATE_H */ diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 07fb219327d6..a34e9c2ca626 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -167,12 +167,6 @@ minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs) if (unlikely(!mrs->att_hist)) { mrs->prob_ewma = cur_prob; } else { - /* update exponential weighted moving variance */ - mrs->prob_ewmv = minstrel_ewmv(mrs->prob_ewmv, - cur_prob, - mrs->prob_ewma, - EWMA_LEVEL); - /*update exponential weighted moving avarage */ mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma, cur_prob, @@ -572,141 +566,6 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, minstrel_update_rates(mp, mi); } -static void * -minstrel_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) -{ - struct ieee80211_supported_band *sband; - struct minstrel_sta_info *mi; - struct minstrel_priv *mp = priv; - struct ieee80211_hw *hw = mp->hw; - int max_rates = 0; - int i; - - mi = kzalloc(sizeof(struct minstrel_sta_info), gfp); - if (!mi) - return NULL; - - for (i = 0; i < NUM_NL80211_BANDS; i++) { - sband = hw->wiphy->bands[i]; - if (sband && sband->n_bitrates > max_rates) - max_rates = sband->n_bitrates; - } - - mi->r = kcalloc(max_rates, sizeof(struct minstrel_rate), gfp); - if (!mi->r) - goto error; - - mi->sample_table = kmalloc_array(max_rates, SAMPLE_COLUMNS, gfp); - if (!mi->sample_table) - goto error1; - - mi->last_stats_update = jiffies; - return mi; - -error1: - kfree(mi->r); -error: - kfree(mi); - return NULL; -} - -static void -minstrel_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta) -{ - struct minstrel_sta_info *mi = priv_sta; - - kfree(mi->sample_table); - kfree(mi->r); - kfree(mi); -} - -static void -minstrel_init_cck_rates(struct minstrel_priv *mp) -{ - static const int bitrates[4] = { 10, 20, 55, 110 }; - struct ieee80211_supported_band *sband; - u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); - int i, j; - - sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ]; - if (!sband) - return; - - for (i = 0, j = 0; i < sband->n_bitrates; i++) { - struct ieee80211_rate *rate = &sband->bitrates[i]; - - if (rate->flags & IEEE80211_RATE_ERP_G) - continue; - - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; - - for (j = 0; j < ARRAY_SIZE(bitrates); j++) { - if (rate->bitrate != bitrates[j]) - continue; - - mp->cck_rates[j] = i; - break; - } - } -} - -static void * -minstrel_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) -{ - struct minstrel_priv *mp; - - mp = kzalloc(sizeof(struct minstrel_priv), GFP_ATOMIC); - if (!mp) - return NULL; - - /* contention window settings - * Just an approximation. Using the per-queue values would complicate - * the calculations and is probably unnecessary */ - mp->cw_min = 15; - mp->cw_max = 1023; - - /* number of packets (in %) to use for sampling other rates - * sample less often for non-mrr packets, because the overhead - * is much higher than with mrr */ - mp->lookaround_rate = 5; - mp->lookaround_rate_mrr = 10; - - /* maximum time that the hw is allowed to stay in one MRR segment */ - mp->segment_size = 6000; - - if (hw->max_rate_tries > 0) - mp->max_retry = hw->max_rate_tries; - else - /* safe default, does not necessarily have to match hw properties */ - mp->max_retry = 7; - - if (hw->max_rates >= 4) - mp->has_mrr = true; - - mp->hw = hw; - mp->update_interval = 100; - -#ifdef CONFIG_MAC80211_DEBUGFS - mp->fixed_rate_idx = (u32) -1; - mp->dbg_fixed_rate = debugfs_create_u32("fixed_rate_idx", - 0666, debugfsdir, &mp->fixed_rate_idx); -#endif - - minstrel_init_cck_rates(mp); - - return mp; -} - -static void -minstrel_free(void *priv) -{ -#ifdef CONFIG_MAC80211_DEBUGFS - debugfs_remove(((struct minstrel_priv *)priv)->dbg_fixed_rate); -#endif - kfree(priv); -} - static u32 minstrel_get_expected_throughput(void *priv_sta) { struct minstrel_sta_info *mi = priv_sta; @@ -725,29 +584,8 @@ static u32 minstrel_get_expected_throughput(void *priv_sta) } const struct rate_control_ops mac80211_minstrel = { - .name = "minstrel", .tx_status_ext = minstrel_tx_status, .get_rate = minstrel_get_rate, .rate_init = minstrel_rate_init, - .alloc = minstrel_alloc, - .free = minstrel_free, - .alloc_sta = minstrel_alloc_sta, - .free_sta = minstrel_free_sta, -#ifdef CONFIG_MAC80211_DEBUGFS - .add_sta_debugfs = minstrel_add_sta_debugfs, - .remove_sta_debugfs = minstrel_remove_sta_debugfs, -#endif .get_expected_throughput = minstrel_get_expected_throughput, }; - -int __init -rc80211_minstrel_init(void) -{ - return ieee80211_rate_control_register(&mac80211_minstrel); -} - -void -rc80211_minstrel_exit(void) -{ - ieee80211_rate_control_unregister(&mac80211_minstrel); -} diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h index be6c3f35f48b..23ec953e3a24 100644 --- a/net/mac80211/rc80211_minstrel.h +++ b/net/mac80211/rc80211_minstrel.h @@ -35,19 +35,6 @@ minstrel_ewma(int old, int new, int weight) return old + incr; } -/* - * Perform EWMV (Exponentially Weighted Moving Variance) calculation - */ -static inline int -minstrel_ewmv(int old_ewmv, int cur_prob, int prob_ewma, int weight) -{ - int diff, incr; - - diff = cur_prob - prob_ewma; - incr = (EWMA_DIV - weight) * diff / EWMA_DIV; - return weight * (old_ewmv + MINSTREL_TRUNC(diff * incr)) / EWMA_DIV; -} - struct minstrel_rate_stats { /* current / last sampling period attempts/success counters */ u16 attempts, last_attempts; @@ -56,11 +43,8 @@ struct minstrel_rate_stats { /* total attempts/success counters */ u32 att_hist, succ_hist; - /* statistis of packet delivery probability - * prob_ewma - exponential weighted moving average of prob - * prob_ewmsd - exp. weighted moving standard deviation of prob */ + /* prob_ewma - exponential weighted moving average of prob */ u16 prob_ewma; - u16 prob_ewmv; /* maximum retry counts */ u8 retry_count; @@ -109,11 +93,6 @@ struct minstrel_sta_info { /* sampling table */ u8 *sample_table; - -#ifdef CONFIG_MAC80211_DEBUGFS - struct dentry *dbg_stats; - struct dentry *dbg_stats_csv; -#endif }; struct minstrel_priv { @@ -137,7 +116,6 @@ struct minstrel_priv { * - setting will be applied on next update */ u32 fixed_rate_idx; - struct dentry *dbg_fixed_rate; #endif }; @@ -146,17 +124,8 @@ struct minstrel_debugfs_info { char buf[]; }; -/* Get EWMSD (Exponentially Weighted Moving Standard Deviation) * 10 */ -static inline int -minstrel_get_ewmsd10(struct minstrel_rate_stats *mrs) -{ - unsigned int ewmv = mrs->prob_ewmv; - return int_sqrt(MINSTREL_TRUNC(ewmv * 1000 * 1000)); -} - extern const struct rate_control_ops mac80211_minstrel; void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); -void minstrel_remove_sta_debugfs(void *priv, void *priv_sta); /* Recalculate success probabilities and counters for a given rate using EWMA */ void minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs); @@ -165,7 +134,5 @@ int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma); /* debugfs */ int minstrel_stats_open(struct inode *inode, struct file *file); int minstrel_stats_csv_open(struct inode *inode, struct file *file); -ssize_t minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); -int minstrel_stats_release(struct inode *inode, struct file *file); #endif diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c index 9ad7d63d3e5b..c8afd85b51a0 100644 --- a/net/mac80211/rc80211_minstrel_debugfs.c +++ b/net/mac80211/rc80211_minstrel_debugfs.c @@ -54,22 +54,6 @@ #include #include "rc80211_minstrel.h" -ssize_t -minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) -{ - struct minstrel_debugfs_info *ms; - - ms = file->private_data; - return simple_read_from_buffer(buf, len, ppos, ms->buf, ms->len); -} - -int -minstrel_stats_release(struct inode *inode, struct file *file) -{ - kfree(file->private_data); - return 0; -} - int minstrel_stats_open(struct inode *inode, struct file *file) { @@ -86,14 +70,13 @@ minstrel_stats_open(struct inode *inode, struct file *file) p = ms->buf; p += sprintf(p, "\n"); p += sprintf(p, - "best __________rate_________ ________statistics________ ____last_____ ______sum-of________\n"); + "best __________rate_________ ____statistics___ ____last_____ ______sum-of________\n"); p += sprintf(p, - "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n"); + "rate [name idx airtime max_tp] [avg(tp) avg(prob)] [retry|suc|att] [#success | #attempts]\n"); for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; struct minstrel_rate_stats *mrs = &mi->r[i].stats; - unsigned int prob_ewmsd; *(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' '; *(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' '; @@ -109,15 +92,13 @@ minstrel_stats_open(struct inode *inode, struct file *file) tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); - prob_ewmsd = minstrel_get_ewmsd10(mrs); - p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" + p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u" " %3u %3u %-3u " "%9llu %-9llu\n", tp_max / 10, tp_max % 10, tp_avg / 10, tp_avg % 10, eprob / 10, eprob % 10, - prob_ewmsd / 10, prob_ewmsd % 10, mrs->retry_count, mrs->last_success, mrs->last_attempts, @@ -135,14 +116,6 @@ minstrel_stats_open(struct inode *inode, struct file *file) return 0; } -static const struct file_operations minstrel_stat_fops = { - .owner = THIS_MODULE, - .open = minstrel_stats_open, - .read = minstrel_stats_read, - .release = minstrel_stats_release, - .llseek = default_llseek, -}; - int minstrel_stats_csv_open(struct inode *inode, struct file *file) { @@ -161,7 +134,6 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file) for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; struct minstrel_rate_stats *mrs = &mi->r[i].stats; - unsigned int prob_ewmsd; p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? "A" : "")); p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? "B" : "")); @@ -177,14 +149,12 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file) tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); - prob_ewmsd = minstrel_get_ewmsd10(mrs); - p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u," + p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u,%u," "%llu,%llu,%d,%d\n", tp_max / 10, tp_max % 10, tp_avg / 10, tp_avg % 10, eprob / 10, eprob % 10, - prob_ewmsd / 10, prob_ewmsd % 10, mrs->retry_count, mrs->last_success, mrs->last_attempts, @@ -200,33 +170,3 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file) return 0; } - -static const struct file_operations minstrel_stat_csv_fops = { - .owner = THIS_MODULE, - .open = minstrel_stats_csv_open, - .read = minstrel_stats_read, - .release = minstrel_stats_release, - .llseek = default_llseek, -}; - -void -minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir) -{ - struct minstrel_sta_info *mi = priv_sta; - - mi->dbg_stats = debugfs_create_file("rc_stats", 0444, dir, mi, - &minstrel_stat_fops); - - mi->dbg_stats_csv = debugfs_create_file("rc_stats_csv", 0444, dir, mi, - &minstrel_stat_csv_fops); -} - -void -minstrel_remove_sta_debugfs(void *priv, void *priv_sta) -{ - struct minstrel_sta_info *mi = priv_sta; - - debugfs_remove(mi->dbg_stats); - - debugfs_remove(mi->dbg_stats_csv); -} diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 67ebdeaffbbc..f466ec37d161 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -52,22 +52,23 @@ _streams - 1 /* MCS rate information for an MCS group */ -#define MCS_GROUP(_streams, _sgi, _ht40) \ +#define MCS_GROUP(_streams, _sgi, _ht40, _s) \ [GROUP_IDX(_streams, _sgi, _ht40)] = { \ .streams = _streams, \ + .shift = _s, \ .flags = \ IEEE80211_TX_RC_MCS | \ (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \ (_ht40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \ .duration = { \ - MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26), \ - MCS_DURATION(_streams, _sgi, _ht40 ? 108 : 52), \ - MCS_DURATION(_streams, _sgi, _ht40 ? 162 : 78), \ - MCS_DURATION(_streams, _sgi, _ht40 ? 216 : 104), \ - MCS_DURATION(_streams, _sgi, _ht40 ? 324 : 156), \ - MCS_DURATION(_streams, _sgi, _ht40 ? 432 : 208), \ - MCS_DURATION(_streams, _sgi, _ht40 ? 486 : 234), \ - MCS_DURATION(_streams, _sgi, _ht40 ? 540 : 260) \ + MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26) >> _s, \ + MCS_DURATION(_streams, _sgi, _ht40 ? 108 : 52) >> _s, \ + MCS_DURATION(_streams, _sgi, _ht40 ? 162 : 78) >> _s, \ + MCS_DURATION(_streams, _sgi, _ht40 ? 216 : 104) >> _s, \ + MCS_DURATION(_streams, _sgi, _ht40 ? 324 : 156) >> _s, \ + MCS_DURATION(_streams, _sgi, _ht40 ? 432 : 208) >> _s, \ + MCS_DURATION(_streams, _sgi, _ht40 ? 486 : 234) >> _s, \ + MCS_DURATION(_streams, _sgi, _ht40 ? 540 : 260) >> _s \ } \ } @@ -80,9 +81,10 @@ #define BW2VBPS(_bw, r3, r2, r1) \ (_bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1) -#define VHT_GROUP(_streams, _sgi, _bw) \ +#define VHT_GROUP(_streams, _sgi, _bw, _s) \ [VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \ .streams = _streams, \ + .shift = _s, \ .flags = \ IEEE80211_TX_RC_VHT_MCS | \ (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \ @@ -90,25 +92,25 @@ _bw == BW_40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \ .duration = { \ MCS_DURATION(_streams, _sgi, \ - BW2VBPS(_bw, 117, 54, 26)), \ + BW2VBPS(_bw, 117, 54, 26)) >> _s, \ MCS_DURATION(_streams, _sgi, \ - BW2VBPS(_bw, 234, 108, 52)), \ + BW2VBPS(_bw, 234, 108, 52)) >> _s, \ MCS_DURATION(_streams, _sgi, \ - BW2VBPS(_bw, 351, 162, 78)), \ + BW2VBPS(_bw, 351, 162, 78)) >> _s, \ MCS_DURATION(_streams, _sgi, \ - BW2VBPS(_bw, 468, 216, 104)), \ + BW2VBPS(_bw, 468, 216, 104)) >> _s, \ MCS_DURATION(_streams, _sgi, \ - BW2VBPS(_bw, 702, 324, 156)), \ + BW2VBPS(_bw, 702, 324, 156)) >> _s, \ MCS_DURATION(_streams, _sgi, \ - BW2VBPS(_bw, 936, 432, 208)), \ + BW2VBPS(_bw, 936, 432, 208)) >> _s, \ MCS_DURATION(_streams, _sgi, \ - BW2VBPS(_bw, 1053, 486, 234)), \ + BW2VBPS(_bw, 1053, 486, 234)) >> _s, \ MCS_DURATION(_streams, _sgi, \ - BW2VBPS(_bw, 1170, 540, 260)), \ + BW2VBPS(_bw, 1170, 540, 260)) >> _s, \ MCS_DURATION(_streams, _sgi, \ - BW2VBPS(_bw, 1404, 648, 312)), \ + BW2VBPS(_bw, 1404, 648, 312)) >> _s, \ MCS_DURATION(_streams, _sgi, \ - BW2VBPS(_bw, 1560, 720, 346)) \ + BW2VBPS(_bw, 1560, 720, 346)) >> _s \ } \ } @@ -121,28 +123,27 @@ (CCK_DURATION((_bitrate > 10 ? 20 : 10), false, 60) + \ CCK_DURATION(_bitrate, _short, AVG_PKT_SIZE)) -#define CCK_DURATION_LIST(_short) \ - CCK_ACK_DURATION(10, _short), \ - CCK_ACK_DURATION(20, _short), \ - CCK_ACK_DURATION(55, _short), \ - CCK_ACK_DURATION(110, _short) +#define CCK_DURATION_LIST(_short, _s) \ + CCK_ACK_DURATION(10, _short) >> _s, \ + CCK_ACK_DURATION(20, _short) >> _s, \ + CCK_ACK_DURATION(55, _short) >> _s, \ + CCK_ACK_DURATION(110, _short) >> _s -#define CCK_GROUP \ +#define CCK_GROUP(_s) \ [MINSTREL_CCK_GROUP] = { \ - .streams = 0, \ + .streams = 1, \ .flags = 0, \ + .shift = _s, \ .duration = { \ - CCK_DURATION_LIST(false), \ - CCK_DURATION_LIST(true) \ + CCK_DURATION_LIST(false, _s), \ + CCK_DURATION_LIST(true, _s) \ } \ } -#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT static bool minstrel_vht_only = true; module_param(minstrel_vht_only, bool, 0644); MODULE_PARM_DESC(minstrel_vht_only, "Use only VHT rates when VHT is supported by sta."); -#endif /* * To enable sufficiently targeted rate sampling, MCS rates are divided into @@ -153,49 +154,47 @@ MODULE_PARM_DESC(minstrel_vht_only, * BW -> SGI -> #streams */ const struct mcs_group minstrel_mcs_groups[] = { - MCS_GROUP(1, 0, BW_20), - MCS_GROUP(2, 0, BW_20), - MCS_GROUP(3, 0, BW_20), + MCS_GROUP(1, 0, BW_20, 5), + MCS_GROUP(2, 0, BW_20, 4), + MCS_GROUP(3, 0, BW_20, 4), - MCS_GROUP(1, 1, BW_20), - MCS_GROUP(2, 1, BW_20), - MCS_GROUP(3, 1, BW_20), + MCS_GROUP(1, 1, BW_20, 5), + MCS_GROUP(2, 1, BW_20, 4), + MCS_GROUP(3, 1, BW_20, 4), - MCS_GROUP(1, 0, BW_40), - MCS_GROUP(2, 0, BW_40), - MCS_GROUP(3, 0, BW_40), + MCS_GROUP(1, 0, BW_40, 4), + MCS_GROUP(2, 0, BW_40, 4), + MCS_GROUP(3, 0, BW_40, 4), - MCS_GROUP(1, 1, BW_40), - MCS_GROUP(2, 1, BW_40), - MCS_GROUP(3, 1, BW_40), + MCS_GROUP(1, 1, BW_40, 4), + MCS_GROUP(2, 1, BW_40, 4), + MCS_GROUP(3, 1, BW_40, 4), - CCK_GROUP, + CCK_GROUP(8), -#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT - VHT_GROUP(1, 0, BW_20), - VHT_GROUP(2, 0, BW_20), - VHT_GROUP(3, 0, BW_20), + VHT_GROUP(1, 0, BW_20, 5), + VHT_GROUP(2, 0, BW_20, 4), + VHT_GROUP(3, 0, BW_20, 4), - VHT_GROUP(1, 1, BW_20), - VHT_GROUP(2, 1, BW_20), - VHT_GROUP(3, 1, BW_20), + VHT_GROUP(1, 1, BW_20, 5), + VHT_GROUP(2, 1, BW_20, 4), + VHT_GROUP(3, 1, BW_20, 4), - VHT_GROUP(1, 0, BW_40), - VHT_GROUP(2, 0, BW_40), - VHT_GROUP(3, 0, BW_40), + VHT_GROUP(1, 0, BW_40, 4), + VHT_GROUP(2, 0, BW_40, 4), + VHT_GROUP(3, 0, BW_40, 4), - VHT_GROUP(1, 1, BW_40), - VHT_GROUP(2, 1, BW_40), - VHT_GROUP(3, 1, BW_40), + VHT_GROUP(1, 1, BW_40, 4), + VHT_GROUP(2, 1, BW_40, 4), + VHT_GROUP(3, 1, BW_40, 4), - VHT_GROUP(1, 0, BW_80), - VHT_GROUP(2, 0, BW_80), - VHT_GROUP(3, 0, BW_80), + VHT_GROUP(1, 0, BW_80, 4), + VHT_GROUP(2, 0, BW_80, 4), + VHT_GROUP(3, 0, BW_80, 4), - VHT_GROUP(1, 1, BW_80), - VHT_GROUP(2, 1, BW_80), - VHT_GROUP(3, 1, BW_80), -#endif + VHT_GROUP(1, 1, BW_80, 4), + VHT_GROUP(2, 1, BW_80, 4), + VHT_GROUP(3, 1, BW_80, 4), }; static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly; @@ -282,7 +281,8 @@ minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, break; /* short preamble */ - if (!(mi->supported[group] & BIT(idx))) + if ((mi->supported[group] & BIT(idx + 4)) && + (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE)) idx += 4; } return &mi->groups[group].rates[idx]; @@ -311,7 +311,8 @@ minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, if (group != MINSTREL_CCK_GROUP) nsecs = 1000 * mi->overhead / MINSTREL_TRUNC(mi->avg_ampdu_len); - nsecs += minstrel_mcs_groups[group].duration[rate]; + nsecs += minstrel_mcs_groups[group].duration[rate] << + minstrel_mcs_groups[group].shift; /* * For the throughput calculation, limit the probability value to 90% to @@ -759,12 +760,19 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, minstrel_ht_update_rates(mp, mi); } +static inline int +minstrel_get_duration(int index) +{ + const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES]; + unsigned int duration = group->duration[index % MCS_GROUP_RATES]; + return duration << group->shift; +} + static void minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, int index) { struct minstrel_rate_stats *mrs; - const struct mcs_group *group; unsigned int tx_time, tx_time_rtscts, tx_time_data; unsigned int cw = mp->cw_min; unsigned int ctime = 0; @@ -783,8 +791,7 @@ minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, mrs->retry_count_rtscts = 2; mrs->retry_updated = true; - group = &minstrel_mcs_groups[index / MCS_GROUP_RATES]; - tx_time_data = group->duration[index % MCS_GROUP_RATES] * ampdu_len / 1000; + tx_time_data = minstrel_get_duration(index) * ampdu_len / 1000; /* Contention time for first 2 tries */ ctime = (t_slot * cw) >> 1; @@ -878,20 +885,24 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) int group = mi->max_prob_rate / MCS_GROUP_RATES; const struct mcs_group *g = &minstrel_mcs_groups[group]; int rate = mi->max_prob_rate % MCS_GROUP_RATES; + unsigned int duration; /* Disable A-MSDU if max_prob_rate is bad */ if (mi->groups[group].rates[rate].prob_ewma < MINSTREL_FRAC(50, 100)) return 1; + duration = g->duration[rate]; + duration <<= g->shift; + /* If the rate is slower than single-stream MCS1, make A-MSDU limit small */ - if (g->duration[rate] > MCS_DURATION(1, 0, 52)) + if (duration > MCS_DURATION(1, 0, 52)) return 500; /* * If the rate is slower than single-stream MCS4, limit A-MSDU to usual * data packet size */ - if (g->duration[rate] > MCS_DURATION(1, 0, 104)) + if (duration > MCS_DURATION(1, 0, 104)) return 1600; /* @@ -899,7 +910,7 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) * rate success probability is less than 75%, limit A-MSDU to twice the usual * data packet size */ - if (g->duration[rate] > MCS_DURATION(1, 0, 260) || + if (duration > MCS_DURATION(1, 0, 260) || (minstrel_ht_get_prob_ewma(mi, mi->max_tp_rate[0]) < MINSTREL_FRAC(75, 100))) return 3200; @@ -946,13 +957,6 @@ minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) rate_control_set_rates(mp->hw, mi->sta, rates); } -static inline int -minstrel_get_duration(int index) -{ - const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES]; - return group->duration[index % MCS_GROUP_RATES]; -} - static int minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { @@ -1000,10 +1004,13 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) return -1; /* - * Do not sample if the probability is already higher than 95% - * to avoid wasting airtime. + * Do not sample if the probability is already higher than 95%, + * or if the rate is 3 times slower than the current max probability + * rate, to avoid wasting airtime. */ - if (mrs->prob_ewma > MINSTREL_FRAC(95, 100)) + sample_dur = minstrel_get_duration(sample_idx); + if (mrs->prob_ewma > MINSTREL_FRAC(95, 100) || + minstrel_get_duration(mi->max_prob_rate) * 3 < sample_dur) return -1; /* @@ -1013,7 +1020,6 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) cur_max_tp_streams = minstrel_mcs_groups[tp_rate1 / MCS_GROUP_RATES].streams; - sample_dur = minstrel_get_duration(sample_idx); if (sample_dur >= minstrel_get_duration(tp_rate2) && (cur_max_tp_streams - 1 < minstrel_mcs_groups[sample_group].streams || @@ -1077,18 +1083,23 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, return; sample_group = &minstrel_mcs_groups[sample_idx / MCS_GROUP_RATES]; + sample_idx %= MCS_GROUP_RATES; + + if (sample_group == &minstrel_mcs_groups[MINSTREL_CCK_GROUP] && + (sample_idx >= 4) != txrc->short_preamble) + return; + info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; rate->count = 1; - if (sample_idx / MCS_GROUP_RATES == MINSTREL_CCK_GROUP) { + if (sample_group == &minstrel_mcs_groups[MINSTREL_CCK_GROUP]) { int idx = sample_idx % ARRAY_SIZE(mp->cck_rates); rate->idx = mp->cck_rates[idx]; } else if (sample_group->flags & IEEE80211_TX_RC_VHT_MCS) { ieee80211_rate_set_vht(rate, sample_idx % MCS_GROUP_RATES, sample_group->streams); } else { - rate->idx = sample_idx % MCS_GROUP_RATES + - (sample_group->streams - 1) * 8; + rate->idx = sample_idx + (sample_group->streams - 1) * 8; } rate->flags = sample_group->flags; @@ -1130,14 +1141,14 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, struct minstrel_ht_sta_priv *msp = priv_sta; struct minstrel_ht_sta *mi = &msp->ht; struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs; - u16 sta_cap = sta->ht_cap.cap; + u16 ht_cap = sta->ht_cap.cap; struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; - struct sta_info *sinfo = container_of(sta, struct sta_info, sta); int use_vht; int n_supported = 0; int ack_dur; int stbc; int i; + bool ldpc; /* fall back to the old minstrel for legacy stations */ if (!sta->ht_cap.ht_supported) @@ -1145,12 +1156,10 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, BUILD_BUG_ON(ARRAY_SIZE(minstrel_mcs_groups) != MINSTREL_GROUPS_NB); -#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT if (vht_cap->vht_supported) use_vht = vht_cap->vht_mcs.tx_mcs_map != cpu_to_le16(~0); else -#endif - use_vht = 0; + use_vht = 0; msp->is_ht = true; memset(mi, 0, sizeof(*mi)); @@ -1175,16 +1184,22 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, } mi->sample_tries = 4; - /* TODO tx_flags for vht - ATM the RC API is not fine-grained enough */ if (!use_vht) { - stbc = (sta_cap & IEEE80211_HT_CAP_RX_STBC) >> + stbc = (ht_cap & IEEE80211_HT_CAP_RX_STBC) >> IEEE80211_HT_CAP_RX_STBC_SHIFT; - mi->tx_flags |= stbc << IEEE80211_TX_CTL_STBC_SHIFT; - if (sta_cap & IEEE80211_HT_CAP_LDPC_CODING) - mi->tx_flags |= IEEE80211_TX_CTL_LDPC; + ldpc = ht_cap & IEEE80211_HT_CAP_LDPC_CODING; + } else { + stbc = (vht_cap->cap & IEEE80211_VHT_CAP_RXSTBC_MASK) >> + IEEE80211_VHT_CAP_RXSTBC_SHIFT; + + ldpc = vht_cap->cap & IEEE80211_VHT_CAP_RXLDPC; } + mi->tx_flags |= stbc << IEEE80211_TX_CTL_STBC_SHIFT; + if (ldpc) + mi->tx_flags |= IEEE80211_TX_CTL_LDPC; + for (i = 0; i < ARRAY_SIZE(mi->groups); i++) { u32 gflags = minstrel_mcs_groups[i].flags; int bw, nss; @@ -1197,10 +1212,10 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, if (gflags & IEEE80211_TX_RC_SHORT_GI) { if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH) { - if (!(sta_cap & IEEE80211_HT_CAP_SGI_40)) + if (!(ht_cap & IEEE80211_HT_CAP_SGI_40)) continue; } else { - if (!(sta_cap & IEEE80211_HT_CAP_SGI_20)) + if (!(ht_cap & IEEE80211_HT_CAP_SGI_20)) continue; } } @@ -1217,10 +1232,9 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, /* HT rate */ if (gflags & IEEE80211_TX_RC_MCS) { -#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT if (use_vht && minstrel_vht_only) continue; -#endif + mi->supported[i] = mcs->rx_mask[nss - 1]; if (mi->supported[i]) n_supported++; @@ -1258,8 +1272,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, if (!n_supported) goto use_legacy; - if (test_sta_flag(sinfo, WLAN_STA_SHORT_PREAMBLE)) - mi->cck_supported_short |= mi->cck_supported_short << 4; + mi->supported[MINSTREL_CCK_GROUP] |= mi->cck_supported_short << 4; /* create an initial rate table with the lowest supported rates */ minstrel_ht_update_stats(mp, mi); @@ -1340,16 +1353,88 @@ minstrel_ht_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta) kfree(msp); } +static void +minstrel_ht_init_cck_rates(struct minstrel_priv *mp) +{ + static const int bitrates[4] = { 10, 20, 55, 110 }; + struct ieee80211_supported_band *sband; + u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); + int i, j; + + sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ]; + if (!sband) + return; + + for (i = 0; i < sband->n_bitrates; i++) { + struct ieee80211_rate *rate = &sband->bitrates[i]; + + if (rate->flags & IEEE80211_RATE_ERP_G) + continue; + + if ((rate_flags & sband->bitrates[i].flags) != rate_flags) + continue; + + for (j = 0; j < ARRAY_SIZE(bitrates); j++) { + if (rate->bitrate != bitrates[j]) + continue; + + mp->cck_rates[j] = i; + break; + } + } +} + static void * minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) { - return mac80211_minstrel.alloc(hw, debugfsdir); + struct minstrel_priv *mp; + + mp = kzalloc(sizeof(struct minstrel_priv), GFP_ATOMIC); + if (!mp) + return NULL; + + /* contention window settings + * Just an approximation. Using the per-queue values would complicate + * the calculations and is probably unnecessary */ + mp->cw_min = 15; + mp->cw_max = 1023; + + /* number of packets (in %) to use for sampling other rates + * sample less often for non-mrr packets, because the overhead + * is much higher than with mrr */ + mp->lookaround_rate = 5; + mp->lookaround_rate_mrr = 10; + + /* maximum time that the hw is allowed to stay in one MRR segment */ + mp->segment_size = 6000; + + if (hw->max_rate_tries > 0) + mp->max_retry = hw->max_rate_tries; + else + /* safe default, does not necessarily have to match hw properties */ + mp->max_retry = 7; + + if (hw->max_rates >= 4) + mp->has_mrr = true; + + mp->hw = hw; + mp->update_interval = 100; + +#ifdef CONFIG_MAC80211_DEBUGFS + mp->fixed_rate_idx = (u32) -1; + debugfs_create_u32("fixed_rate_idx", S_IRUGO | S_IWUGO, debugfsdir, + &mp->fixed_rate_idx); +#endif + + minstrel_ht_init_cck_rates(mp); + + return mp; } static void minstrel_ht_free(void *priv) { - mac80211_minstrel.free(priv); + kfree(priv); } static u32 minstrel_ht_get_expected_throughput(void *priv_sta) @@ -1384,7 +1469,6 @@ static const struct rate_control_ops mac80211_minstrel_ht = { .free = minstrel_ht_free, #ifdef CONFIG_MAC80211_DEBUGFS .add_sta_debugfs = minstrel_ht_add_sta_debugfs, - .remove_sta_debugfs = minstrel_ht_remove_sta_debugfs, #endif .get_expected_throughput = minstrel_ht_get_expected_throughput, }; @@ -1409,14 +1493,14 @@ static void __init init_sample_table(void) } int __init -rc80211_minstrel_ht_init(void) +rc80211_minstrel_init(void) { init_sample_table(); return ieee80211_rate_control_register(&mac80211_minstrel_ht); } void -rc80211_minstrel_ht_exit(void) +rc80211_minstrel_exit(void) { ieee80211_rate_control_unregister(&mac80211_minstrel_ht); } diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index de1646c42e82..26b7a3244b47 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -15,11 +15,7 @@ */ #define MINSTREL_MAX_STREAMS 3 #define MINSTREL_HT_STREAM_GROUPS 4 /* BW(=2) * SGI(=2) */ -#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT #define MINSTREL_VHT_STREAM_GROUPS 6 /* BW(=3) * SGI(=2) */ -#else -#define MINSTREL_VHT_STREAM_GROUPS 0 -#endif #define MINSTREL_HT_GROUPS_NB (MINSTREL_MAX_STREAMS * \ MINSTREL_HT_STREAM_GROUPS) @@ -34,16 +30,13 @@ #define MINSTREL_CCK_GROUP (MINSTREL_HT_GROUP_0 + MINSTREL_HT_GROUPS_NB) #define MINSTREL_VHT_GROUP_0 (MINSTREL_CCK_GROUP + 1) -#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT #define MCS_GROUP_RATES 10 -#else -#define MCS_GROUP_RATES 8 -#endif struct mcs_group { - u32 flags; - unsigned int streams; - unsigned int duration[MCS_GROUP_RATES]; + u16 flags; + u8 streams; + u8 shift; + u16 duration[MCS_GROUP_RATES]; }; extern const struct mcs_group minstrel_mcs_groups[]; @@ -110,17 +103,12 @@ struct minstrel_ht_sta_priv { struct minstrel_ht_sta ht; struct minstrel_sta_info legacy; }; -#ifdef CONFIG_MAC80211_DEBUGFS - struct dentry *dbg_stats; - struct dentry *dbg_stats_csv; -#endif void *ratelist; void *sample_table; bool is_ht; }; void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); -void minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta); int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, int prob_ewma); diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c index bfcc03152dc6..57820a5f2c16 100644 --- a/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -15,6 +15,22 @@ #include "rc80211_minstrel.h" #include "rc80211_minstrel_ht.h" +static ssize_t +minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) +{ + struct minstrel_debugfs_info *ms; + + ms = file->private_data; + return simple_read_from_buffer(buf, len, ppos, ms->buf, ms->len); +} + +static int +minstrel_stats_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + static char * minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) { @@ -41,7 +57,7 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; static const int bitrates[4] = { 10, 20, 55, 110 }; int idx = i * MCS_GROUP_RATES + j; - unsigned int prob_ewmsd; + unsigned int duration; if (!(mi->supported[i] & BIT(j))) continue; @@ -79,21 +95,21 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) p += sprintf(p, " %3u ", idx); /* tx_time[rate(i)] in usec */ - tx_time = DIV_ROUND_CLOSEST(mg->duration[j], 1000); + duration = mg->duration[j]; + duration <<= mg->shift; + tx_time = DIV_ROUND_CLOSEST(duration, 1000); p += sprintf(p, "%6u ", tx_time); tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); - prob_ewmsd = minstrel_get_ewmsd10(mrs); - p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" + p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u" " %3u %3u %-3u " "%9llu %-9llu\n", tp_max / 10, tp_max % 10, tp_avg / 10, tp_avg % 10, eprob / 10, eprob % 10, - prob_ewmsd / 10, prob_ewmsd % 10, mrs->retry_count, mrs->last_success, mrs->last_attempts, @@ -130,9 +146,9 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file) p += sprintf(p, "\n"); p += sprintf(p, - " best ____________rate__________ ________statistics________ _____last____ ______sum-of________\n"); + " best ____________rate__________ ____statistics___ _____last____ ______sum-of________\n"); p += sprintf(p, - "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n"); + "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob)] [retry|suc|att] [#success | #attempts]\n"); p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p); for (i = 0; i < MINSTREL_CCK_GROUP; i++) @@ -187,7 +203,7 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; static const int bitrates[4] = { 10, 20, 55, 110 }; int idx = i * MCS_GROUP_RATES + j; - unsigned int prob_ewmsd; + unsigned int duration; if (!(mi->supported[i] & BIT(j))) continue; @@ -222,20 +238,21 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) } p += sprintf(p, "%u,", idx); - tx_time = DIV_ROUND_CLOSEST(mg->duration[j], 1000); + + duration = mg->duration[j]; + duration <<= mg->shift; + tx_time = DIV_ROUND_CLOSEST(duration, 1000); p += sprintf(p, "%u,", tx_time); tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); - prob_ewmsd = minstrel_get_ewmsd10(mrs); - p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u," + p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u," "%u,%llu,%llu,", tp_max / 10, tp_max % 10, tp_avg / 10, tp_avg % 10, eprob / 10, eprob % 10, - prob_ewmsd / 10, prob_ewmsd % 10, mrs->retry_count, mrs->last_success, mrs->last_attempts, @@ -303,17 +320,8 @@ minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir) { struct minstrel_ht_sta_priv *msp = priv_sta; - msp->dbg_stats = debugfs_create_file("rc_stats", 0444, dir, msp, - &minstrel_ht_stat_fops); - msp->dbg_stats_csv = debugfs_create_file("rc_stats_csv", 0444, dir, msp, - &minstrel_ht_stat_csv_fops); -} - -void -minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta) -{ - struct minstrel_ht_sta_priv *msp = priv_sta; - - debugfs_remove(msp->dbg_stats); - debugfs_remove(msp->dbg_stats_csv); + debugfs_create_file("rc_stats", 0444, dir, msp, + &minstrel_ht_stat_fops); + debugfs_create_file("rc_stats_csv", 0444, dir, msp, + &minstrel_ht_stat_csv_fops); } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 96611d5dfadb..3bd3b5769797 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -115,7 +115,8 @@ static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len, if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC | - RX_FLAG_ONLY_MONITOR)) + RX_FLAG_ONLY_MONITOR | + RX_FLAG_NO_PSDU)) return true; if (unlikely(skb->len < 16 + present_fcs_len + rtap_space)) @@ -189,6 +190,15 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he_mu) != 12); } + if (status->flag & RX_FLAG_NO_PSDU) + len += 1; + + if (status->flag & RX_FLAG_RADIOTAP_LSIG) { + len = ALIGN(len, 2); + len += 4; + BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_lsig) != 4); + } + if (status->chains) { /* antenna and antenna signal fields */ len += 2 * hweight8(status->chains); @@ -279,6 +289,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, struct ieee80211_vendor_radiotap rtap = {}; struct ieee80211_radiotap_he he = {}; struct ieee80211_radiotap_he_mu he_mu = {}; + struct ieee80211_radiotap_lsig lsig = {}; if (status->flag & RX_FLAG_RADIOTAP_HE) { he = *(struct ieee80211_radiotap_he *)skb->data; @@ -291,6 +302,11 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, skb_pull(skb, sizeof(he_mu)); } + if (status->flag & RX_FLAG_RADIOTAP_LSIG) { + lsig = *(struct ieee80211_radiotap_lsig *)skb->data; + skb_pull(skb, sizeof(lsig)); + } + if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) { rtap = *(struct ieee80211_vendor_radiotap *)skb->data; /* rtap.len and rtap.pad are undone immediately */ @@ -549,7 +565,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, if (status->encoding == RX_ENC_HE && status->flag & RX_FLAG_RADIOTAP_HE) { -#define HE_PREP(f, val) cpu_to_le16(FIELD_PREP(IEEE80211_RADIOTAP_HE_##f, val)) +#define HE_PREP(f, val) le16_encode_bits(val, IEEE80211_RADIOTAP_HE_##f) if (status->enc_flags & RX_ENC_FLAG_STBC_MASK) { he.data6 |= HE_PREP(DATA6_NSTS, @@ -630,6 +646,21 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, pos += sizeof(he_mu); } + if (status->flag & RX_FLAG_NO_PSDU) { + rthdr->it_present |= + cpu_to_le32(1 << IEEE80211_RADIOTAP_ZERO_LEN_PSDU); + *pos++ = status->zero_length_psdu_type; + } + + if (status->flag & RX_FLAG_RADIOTAP_LSIG) { + /* ensure 2 byte alignment */ + while ((pos - (u8 *)rthdr) & 1) + pos++; + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_LSIG); + memcpy(pos, &lsig, sizeof(lsig)); + pos += sizeof(lsig); + } + for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) { *pos++ = status->chain_signal[chain]; *pos++ = chain; @@ -1505,7 +1536,7 @@ static void sta_ps_start(struct sta_info *sta) if (!sta->sta.txq[0]) return; - for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { + for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) { if (txq_has_queue(sta->sta.txq[tid])) set_bit(tid, &sta->txq_buffered_tids); else @@ -2046,6 +2077,7 @@ ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata, idx = sdata->fragment_next; for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) { struct ieee80211_hdr *f_hdr; + struct sk_buff *f_skb; idx--; if (idx < 0) @@ -2057,7 +2089,8 @@ ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata, entry->last_frag + 1 != frag) continue; - f_hdr = (struct ieee80211_hdr *)entry->skb_list.next->data; + f_skb = __skb_peek(&entry->skb_list); + f_hdr = (struct ieee80211_hdr *) f_skb->data; /* * Check ftype and addresses are equal, else check next fragment @@ -2314,7 +2347,7 @@ __ieee80211_data_to_8023(struct ieee80211_rx_data *rx, bool *port_control) if (!sdata->u.mgd.use_4addr) return -1; - else + else if (!ether_addr_equal(hdr->addr1, sdata->vif.addr)) check_port_control = true; } @@ -2425,8 +2458,9 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) if (!xmit_skb) net_info_ratelimited("%s: failed to clone multicast frame\n", dev->name); - } else if (!is_multicast_ether_addr(ehdr->h_dest)) { - dsta = sta_info_get(sdata, skb->data); + } else if (!is_multicast_ether_addr(ehdr->h_dest) && + !ether_addr_equal(ehdr->h_dest, ehdr->h_source)) { + dsta = sta_info_get(sdata, ehdr->h_dest); if (dsta) { /* * The destination station is associated to @@ -4207,11 +4241,10 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, if (fast_rx->internal_forward) { struct sk_buff *xmit_skb = NULL; - bool multicast = is_multicast_ether_addr(skb->data); - - if (multicast) { + if (is_multicast_ether_addr(addrs.da)) { xmit_skb = skb_copy(skb, GFP_ATOMIC); - } else if (sta_info_get(rx->sdata, skb->data)) { + } else if (!ether_addr_equal(addrs.da, addrs.sa) && + sta_info_get(rx->sdata, addrs.da)) { xmit_skb = skb; skb = NULL; } diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 029334835747..4e4902bdbef8 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -144,6 +144,7 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, wide_bw_chansw_ie->new_center_freq_seg1, /* .basic_mcs_set doesn't matter */ }; + struct ieee80211_ht_operation ht_oper = {}; /* default, for the case of IEEE80211_VHT_CHANWIDTH_USE_HT, * to the previously parsed chandef @@ -151,7 +152,9 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, new_vht_chandef = csa_ie->chandef; /* ignore if parsing fails */ - if (!ieee80211_chandef_vht_oper(&vht_oper, &new_vht_chandef)) + if (!ieee80211_chandef_vht_oper(&sdata->local->hw, + &vht_oper, &ht_oper, + &new_vht_chandef)) new_vht_chandef.chan = NULL; if (sta_flags & IEEE80211_STA_DISABLE_80P80MHZ && diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index f34202242d24..fb8c2252ac0e 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -113,7 +113,12 @@ static void __cleanup_single_sta(struct sta_info *sta) if (sta->sta.txq[0]) { for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { - struct txq_info *txqi = to_txq_info(sta->sta.txq[i]); + struct txq_info *txqi; + + if (!sta->sta.txq[i]) + continue; + + txqi = to_txq_info(sta->sta.txq[i]); spin_lock_bh(&fq->lock); ieee80211_txq_purge(local, txqi); @@ -374,6 +379,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txq = txq_data + i * size; + /* might not do anything for the bufferable MMPDU TXQ */ ieee80211_txq_init(sdata, sta, txq, i); } } @@ -1239,13 +1245,11 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) if (!ieee80211_hw_check(&local->hw, AP_LINK_PS)) drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta); - if (sta->sta.txq[0]) { - for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { - if (!txq_has_queue(sta->sta.txq[i])) - continue; + for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { + if (!sta->sta.txq[i] || !txq_has_queue(sta->sta.txq[i])) + continue; - drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i])); - } + drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i])); } skb_queue_head_init(&pending); @@ -1683,7 +1687,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, return; for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { - if (!(driver_release_tids & BIT(tid)) || + if (!sta->sta.txq[tid] || + !(driver_release_tids & BIT(tid)) || txq_has_queue(sta->sta.txq[tid])) continue; @@ -2323,13 +2328,13 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); } - if (ieee80211_hw_check(&sta->local->hw, REPORTS_TX_ACK_STATUS) && - !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) && + sta->status_stats.ack_signal_filled) { sinfo->avg_ack_signal = -(s8)ewma_avg_signal_read( &sta->status_stats.avg_ack_signal); sinfo->filled |= - BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG); + BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } } diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 9a6d7208bf4f..aa4afbf0abaf 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -479,11 +479,6 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, if (!skb) return; - if (dropped) { - dev_kfree_skb_any(skb); - return; - } - if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) { u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie; struct ieee80211_sub_if_data *sdata; @@ -506,6 +501,8 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, } rcu_read_unlock(); + dev_kfree_skb_any(skb); + } else if (dropped) { dev_kfree_skb_any(skb); } else { /* consumes skb */ @@ -811,7 +808,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, rate_control_tx_status(local, sband, status); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) - ieee80211s_update_metric(local, sta, skb); + ieee80211s_update_metric(local, sta, status); if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked) ieee80211_frame_acked(sta, skb); @@ -972,6 +969,8 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, } rate_control_tx_status(local, sband, status); + if (ieee80211_vif_is_mesh(&sta->sdata->vif)) + ieee80211s_update_metric(local, sta, status); } if (acked || noack_success) { @@ -988,6 +987,25 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_tx_status_ext); +void ieee80211_tx_rate_update(struct ieee80211_hw *hw, + struct ieee80211_sta *pubsta, + struct ieee80211_tx_info *info) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_supported_band *sband = hw->wiphy->bands[info->band]; + struct sta_info *sta = container_of(pubsta, struct sta_info, sta); + struct ieee80211_tx_status status = { + .info = info, + .sta = pubsta, + }; + + rate_control_tx_status(local, sband, &status); + + if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) + sta->tx_stats.last_rate = info->status.rates[0]; +} +EXPORT_SYMBOL(ieee80211_tx_rate_update); + void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 5cd5e6e5834e..6c647f425e05 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -16,6 +16,7 @@ #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" +#include "wme.h" /* give usermode some time for retries in setting up the TDLS session */ #define TDLS_PEER_SETUP_TIMEOUT (15 * HZ) @@ -1010,14 +1011,13 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, switch (action_code) { case WLAN_TDLS_SETUP_REQUEST: case WLAN_TDLS_SETUP_RESPONSE: - skb_set_queue_mapping(skb, IEEE80211_AC_BK); - skb->priority = 2; + skb->priority = 256 + 2; break; default: - skb_set_queue_mapping(skb, IEEE80211_AC_VI); - skb->priority = 5; + skb->priority = 256 + 5; break; } + skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, skb)); /* * Set the WLAN_TDLS_TEARDOWN flag to indicate a teardown in progress. diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 0ab69a1964f8..588c51a67c89 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -2600,6 +2600,29 @@ TRACE_EVENT(drv_wake_tx_queue, ) ); +TRACE_EVENT(drv_get_ftm_responder_stats, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct cfg80211_ftm_responder_stats *ftm_stats), + + TP_ARGS(local, sdata, ftm_stats), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT, + LOCAL_PR_ARG, VIF_PR_ARG + ) +); + #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index f353d9db54bc..e0ccee23fbcd 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -214,6 +214,7 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) { struct ieee80211_local *local = tx->local; struct ieee80211_if_managed *ifmgd; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); /* driver doesn't support power save */ if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) @@ -242,6 +243,9 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) if (tx->sdata->vif.type != NL80211_IFTYPE_STATION) return TX_CONTINUE; + if (unlikely(info->flags & IEEE80211_TX_INTFL_OFFCHAN_TX_OK)) + return TX_CONTINUE; + ifmgd = &tx->sdata->u.mgd; /* @@ -1249,10 +1253,18 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) return NULL; - if (!ieee80211_is_data_present(hdr->frame_control)) - return NULL; - - if (sta) { + if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) { + if ((!ieee80211_is_mgmt(hdr->frame_control) || + ieee80211_is_bufferable_mmpdu(hdr->frame_control) || + vif->type == NL80211_IFTYPE_STATION) && + sta && sta->uploaded) { + /* + * This will be NULL if the driver didn't set the + * opt-in hardware flag. + */ + txq = sta->sta.txq[IEEE80211_NUM_TIDS]; + } + } else if (sta) { u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; if (!sta->uploaded) @@ -1440,16 +1452,33 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, txqi->txq.vif = &sdata->vif; - if (sta) { - txqi->txq.sta = &sta->sta; - sta->sta.txq[tid] = &txqi->txq; - txqi->txq.tid = tid; - txqi->txq.ac = ieee80211_ac_from_tid(tid); - } else { + if (!sta) { sdata->vif.txq = &txqi->txq; txqi->txq.tid = 0; txqi->txq.ac = IEEE80211_AC_BE; + + return; } + + if (tid == IEEE80211_NUM_TIDS) { + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + /* Drivers need to opt in to the management MPDU TXQ */ + if (!ieee80211_hw_check(&sdata->local->hw, + STA_MMPDU_TXQ)) + return; + } else if (!ieee80211_hw_check(&sdata->local->hw, + BUFF_MMPDU_TXQ)) { + /* Drivers need to opt in to the bufferable MMPDU TXQ */ + return; + } + txqi->txq.ac = IEEE80211_AC_VO; + } else { + txqi->txq.ac = ieee80211_ac_from_tid(tid); + } + + txqi->txq.sta = &sta->sta; + txqi->txq.tid = tid; + sta->sta.txq[tid] = &txqi->txq; } void ieee80211_txq_purge(struct ieee80211_local *local, @@ -1890,7 +1919,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; if (invoke_tx_handlers_early(&tx)) - return false; + return true; if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb)) return true; @@ -2951,6 +2980,10 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) goto out; + /* Key is being removed */ + if (build.key->flags & KEY_FLAG_TAINTED) + goto out; + switch (build.key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: @@ -3196,6 +3229,10 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, max_amsdu_len = min_t(int, max_amsdu_len, sta->sta.max_rc_amsdu_len); + if (sta->sta.max_tid_amsdu_len[tid]) + max_amsdu_len = min_t(int, max_amsdu_len, + sta->sta.max_tid_amsdu_len[tid]); + spin_lock_bh(&fq->lock); /* TODO: Ideally aggregation should be done on dequeue to remain @@ -3228,6 +3265,9 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, if (max_frags && nfrags > max_frags) goto out; + if (!drv_can_aggregate_in_amsdu(local, head, skb)) + goto out; + if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head)) goto out; @@ -3472,12 +3512,18 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_tx_info *info; struct ieee80211_tx_data tx; ieee80211_tx_result r; - struct ieee80211_vif *vif; + struct ieee80211_vif *vif = txq->vif; spin_lock_bh(&fq->lock); - if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags)) + if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags) || + test_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags)) + goto out; + + if (vif->txqs_stopped[ieee80211_ac_from_tid(txq->tid)]) { + set_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags); goto out; + } /* Make sure fragments stay together. */ skb = __skb_dequeue(&txqi->frags); @@ -3573,6 +3619,7 @@ begin: } IEEE80211_SKB_CB(skb)->control.vif = vif; + out: spin_unlock_bh(&fq->lock); @@ -3601,13 +3648,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, if (!IS_ERR_OR_NULL(sta)) { struct ieee80211_fast_tx *fast_tx; - /* We need a bit of data queued to build aggregates properly, so - * instruct the TCP stack to allow more than a single ms of data - * to be queued in the stack. The value is a bit-shift of 1 - * second, so 8 is ~4ms of queued data. Only affects local TCP - * sockets. - */ - sk_pacing_shift_update(skb->sk, 8); + sk_pacing_shift_update(skb->sk, sdata->local->hw.tx_sk_pacing_shift); fast_tx = rcu_dereference(sta->fast_tx); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 716cd6442d86..bec424316ea4 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -240,6 +240,102 @@ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_ctstoself_duration); +static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_vif *vif = &sdata->vif; + struct fq *fq = &local->fq; + struct ps_data *ps = NULL; + struct txq_info *txqi; + struct sta_info *sta; + int i; + + spin_lock_bh(&fq->lock); + + if (sdata->vif.type == NL80211_IFTYPE_AP) + ps = &sdata->bss->ps; + + sdata->vif.txqs_stopped[ac] = false; + + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (sdata != sta->sdata) + continue; + + for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { + struct ieee80211_txq *txq = sta->sta.txq[i]; + + if (!txq) + continue; + + txqi = to_txq_info(txq); + + if (ac != txq->ac) + continue; + + if (!test_and_clear_bit(IEEE80211_TXQ_STOP_NETIF_TX, + &txqi->flags)) + continue; + + spin_unlock_bh(&fq->lock); + drv_wake_tx_queue(local, txqi); + spin_lock_bh(&fq->lock); + } + } + + if (!vif->txq) + goto out; + + txqi = to_txq_info(vif->txq); + + if (!test_and_clear_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags) || + (ps && atomic_read(&ps->num_sta_ps)) || ac != vif->txq->ac) + goto out; + + spin_unlock_bh(&fq->lock); + + drv_wake_tx_queue(local, txqi); + return; +out: + spin_unlock_bh(&fq->lock); +} + +void ieee80211_wake_txqs(unsigned long data) +{ + struct ieee80211_local *local = (struct ieee80211_local *)data; + struct ieee80211_sub_if_data *sdata; + int n_acs = IEEE80211_NUM_ACS; + unsigned long flags; + int i; + + rcu_read_lock(); + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + + if (local->hw.queues < IEEE80211_NUM_ACS) + n_acs = 1; + + for (i = 0; i < local->hw.queues; i++) { + if (local->queue_stop_reasons[i]) + continue; + + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + int ac; + + for (ac = 0; ac < n_acs; ac++) { + int ac_queue = sdata->vif.hw_queue[ac]; + + if (ac_queue == i || + sdata->vif.cab_queue == i) + __ieee80211_wake_txqs(sdata, ac); + } + } + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + } + + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + rcu_read_unlock(); +} + void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue) { struct ieee80211_sub_if_data *sdata; @@ -308,6 +404,9 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, rcu_read_unlock(); } else tasklet_schedule(&local->tx_pending_tasklet); + + if (local->ops->wake_tx_queue) + tasklet_schedule(&local->wake_txqs_tasklet); } void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, @@ -351,9 +450,6 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, if (__test_and_set_bit(reason, &local->queue_stop_reasons[queue])) return; - if (local->ops->wake_tx_queue) - return; - if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; @@ -366,8 +462,15 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, for (ac = 0; ac < n_acs; ac++) { if (sdata->vif.hw_queue[ac] == queue || - sdata->vif.cab_queue == queue) - netif_stop_subqueue(sdata->dev, ac); + sdata->vif.cab_queue == queue) { + if (!local->ops->wake_tx_queue) { + netif_stop_subqueue(sdata->dev, ac); + continue; + } + spin_lock(&local->fq.lock); + sdata->vif.txqs_stopped[ac] = true; + spin_unlock(&local->fq.lock); + } } } rcu_read_unlock(); @@ -2075,6 +2178,11 @@ int ieee80211_reconfig(struct ieee80211_local *local) case NL80211_IFTYPE_AP: changed |= BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS; + if (sdata->vif.bss_conf.ftm_responder == 1 && + wiphy_ext_feature_isset(sdata->local->hw.wiphy, + NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER)) + changed |= BSS_CHANGED_FTM_RESPONDER; + if (sdata->vif.type == NL80211_IFTYPE_AP) { changed |= BSS_CHANGED_AP_PROBE_RESP; @@ -2657,49 +2765,65 @@ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, return true; } -bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper, +bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, + const struct ieee80211_vht_operation *oper, + const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef) { struct cfg80211_chan_def new = *chandef; - int cf1, cf2; + int cf0, cf1; + int ccfs0, ccfs1, ccfs2; + int ccf0, ccf1; - if (!oper) + if (!oper || !htop) return false; - cf1 = ieee80211_channel_to_frequency(oper->center_freq_seg0_idx, - chandef->chan->band); - cf2 = ieee80211_channel_to_frequency(oper->center_freq_seg1_idx, - chandef->chan->band); + ccfs0 = oper->center_freq_seg0_idx; + ccfs1 = oper->center_freq_seg1_idx; + ccfs2 = (le16_to_cpu(htop->operation_mode) & + IEEE80211_HT_OP_MODE_CCFS2_MASK) + >> IEEE80211_HT_OP_MODE_CCFS2_SHIFT; + + /* when parsing (and we know how to) CCFS1 and CCFS2 are equivalent */ + ccf0 = ccfs0; + ccf1 = ccfs1; + if (!ccfs1 && ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW)) + ccf1 = ccfs2; + + cf0 = ieee80211_channel_to_frequency(ccf0, chandef->chan->band); + cf1 = ieee80211_channel_to_frequency(ccf1, chandef->chan->band); switch (oper->chan_width) { case IEEE80211_VHT_CHANWIDTH_USE_HT: + /* just use HT information directly */ break; case IEEE80211_VHT_CHANWIDTH_80MHZ: new.width = NL80211_CHAN_WIDTH_80; - new.center_freq1 = cf1; + new.center_freq1 = cf0; /* If needed, adjust based on the newer interop workaround. */ - if (oper->center_freq_seg1_idx) { + if (ccf1) { unsigned int diff; - diff = abs(oper->center_freq_seg1_idx - - oper->center_freq_seg0_idx); + diff = abs(ccf1 - ccf0); if (diff == 8) { new.width = NL80211_CHAN_WIDTH_160; - new.center_freq1 = cf2; + new.center_freq1 = cf1; } else if (diff > 8) { new.width = NL80211_CHAN_WIDTH_80P80; - new.center_freq2 = cf2; + new.center_freq2 = cf1; } } break; case IEEE80211_VHT_CHANWIDTH_160MHZ: + /* deprecated encoding */ new.width = NL80211_CHAN_WIDTH_160; - new.center_freq1 = cf1; + new.center_freq1 = cf0; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: + /* deprecated encoding */ new.width = NL80211_CHAN_WIDTH_80P80; - new.center_freq1 = cf1; - new.center_freq2 = cf2; + new.center_freq1 = cf0; + new.center_freq2 = cf1; break; default: return false; diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 259325cbcc31..006d82e4a397 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -3,6 +3,7 @@ * * Portions of this file * Copyright(c) 2015 - 2016 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -231,6 +232,13 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, memcpy(&vht_cap->vht_mcs, &vht_cap_ie->supp_mcs, sizeof(struct ieee80211_vht_mcs_info)); + /* copy EXT_NSS_BW Support value or remove the capability */ + if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_VHT_EXT_NSS_BW)) + vht_cap->cap |= (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); + else + vht_cap->vht_mcs.tx_highest &= + ~cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE); + /* but also restrict MCSes */ for (i = 0; i < 8; i++) { u16 own_rx, own_tx, peer_rx, peer_tx; @@ -294,6 +302,18 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, break; default: sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; + + if (!(vht_cap->vht_mcs.tx_highest & + cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE))) + break; + + /* + * If this is non-zero, then it does support 160 MHz after all, + * in one form or the other. We don't distinguish here (or even + * above) between 160 and 80+80 yet. + */ + if (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; } sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta); diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index 2fb703d70803..7e29f88dbf6a 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -146,18 +146,18 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template) goto err_tfm; } - key->tfm0 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC); + key->tfm0 = crypto_alloc_sync_skcipher("ctr(aes)", 0, 0); if (IS_ERR(key->tfm0)) goto err_tfm; - if (crypto_skcipher_setkey(key->tfm0, template->key, + if (crypto_sync_skcipher_setkey(key->tfm0, template->key, IEEE802154_LLSEC_KEY_SIZE)) goto err_tfm0; return key; err_tfm0: - crypto_free_skcipher(key->tfm0); + crypto_free_sync_skcipher(key->tfm0); err_tfm: for (i = 0; i < ARRAY_SIZE(key->tfm); i++) if (key->tfm[i]) @@ -177,7 +177,7 @@ static void llsec_key_release(struct kref *ref) for (i = 0; i < ARRAY_SIZE(key->tfm); i++) crypto_free_aead(key->tfm[i]); - crypto_free_skcipher(key->tfm0); + crypto_free_sync_skcipher(key->tfm0); kzfree(key); } @@ -622,7 +622,7 @@ llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, { u8 iv[16]; struct scatterlist src; - SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); int err, datalen; unsigned char *data; @@ -632,7 +632,7 @@ llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, datalen = skb_tail_pointer(skb) - data; sg_init_one(&src, data, datalen); - skcipher_request_set_tfm(req, key->tfm0); + skcipher_request_set_sync_tfm(req, key->tfm0); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &src, &src, datalen, iv); err = crypto_skcipher_encrypt(req); @@ -840,7 +840,7 @@ llsec_do_decrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, unsigned char *data; int datalen; struct scatterlist src; - SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); int err; llsec_geniv(iv, dev_addr, &hdr->sec); @@ -849,7 +849,7 @@ llsec_do_decrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, sg_init_one(&src, data, datalen); - skcipher_request_set_tfm(req, key->tfm0); + skcipher_request_set_sync_tfm(req, key->tfm0); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &src, &src, datalen, iv); diff --git a/net/mac802154/llsec.h b/net/mac802154/llsec.h index 6f3b658e3279..8be46d74dc39 100644 --- a/net/mac802154/llsec.h +++ b/net/mac802154/llsec.h @@ -29,7 +29,7 @@ struct mac802154_llsec_key { /* one tfm for each authsize (4/8/16) */ struct crypto_aead *tfm[3]; - struct crypto_skcipher *tfm0; + struct crypto_sync_skcipher *tfm0; struct kref ref; }; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 8fbe6cdbe255..7d55d4c04088 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1223,7 +1223,7 @@ static int mpls_netconf_get_devconf(struct sk_buff *in_skb, int err; err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, - devconf_mpls_policy, NULL); + devconf_mpls_policy, extack); if (err < 0) goto errout; @@ -1263,6 +1263,7 @@ errout: static int mpls_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct hlist_head *head; struct net_device *dev; @@ -1270,6 +1271,21 @@ static int mpls_netconf_dump_devconf(struct sk_buff *skb, int idx, s_idx; int h, s_h; + if (cb->strict_check) { + struct netlink_ext_ack *extack = cb->extack; + struct netconfmsg *ncm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ncm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in netconf dump request"); + return -EINVAL; + } + } + s_h = cb->args[0]; s_idx = idx = cb->args[1]; @@ -1286,7 +1302,7 @@ static int mpls_netconf_dump_devconf(struct sk_buff *skb, goto cont; if (mpls_netconf_fill_devconf(skb, mdev, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) { @@ -2015,30 +2031,140 @@ nla_put_failure: return -EMSGSIZE; } +#if IS_ENABLED(CONFIG_INET) +static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, + struct fib_dump_filter *filter, + struct netlink_callback *cb) +{ + return ip_valid_fib_dump_req(net, nlh, filter, cb); +} +#else +static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, + struct fib_dump_filter *filter, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr *tb[RTA_MAX + 1]; + struct rtmsg *rtm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request"); + return -EINVAL; + } + + rtm = nlmsg_data(nlh); + if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || + rtm->rtm_table || rtm->rtm_scope || rtm->rtm_type || + rtm->rtm_flags) { + NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for FIB dump request"); + return -EINVAL; + } + + if (rtm->rtm_protocol) { + filter->protocol = rtm->rtm_protocol; + filter->filter_set = 1; + cb->answer_flags = NLM_F_DUMP_FILTERED; + } + + err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_mpls_policy, extack); + if (err < 0) + return err; + + for (i = 0; i <= RTA_MAX; ++i) { + int ifindex; + + if (i == RTA_OIF) { + ifindex = nla_get_u32(tb[i]); + filter->dev = __dev_get_by_index(net, ifindex); + if (!filter->dev) + return -ENODEV; + filter->filter_set = 1; + } else if (tb[i]) { + NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + return 0; +} +#endif + +static bool mpls_rt_uses_dev(struct mpls_route *rt, + const struct net_device *dev) +{ + struct net_device *nh_dev; + + if (rt->rt_nhn == 1) { + struct mpls_nh *nh = rt->rt_nh; + + nh_dev = rtnl_dereference(nh->nh_dev); + if (dev == nh_dev) + return true; + } else { + for_nexthops(rt) { + nh_dev = rtnl_dereference(nh->nh_dev); + if (nh_dev == dev) + return true; + } endfor_nexthops(rt); + } + + return false; +} + static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct mpls_route __rcu **platform_label; + struct fib_dump_filter filter = {}; + unsigned int flags = NLM_F_MULTI; size_t platform_labels; unsigned int index; ASSERT_RTNL(); + if (cb->strict_check) { + int err; + + err = mpls_valid_fib_dump_req(net, nlh, &filter, cb); + if (err < 0) + return err; + + /* for MPLS, there is only 1 table with fixed type and flags. + * If either are set in the filter then return nothing. + */ + if ((filter.table_id && filter.table_id != RT_TABLE_MAIN) || + (filter.rt_type && filter.rt_type != RTN_UNICAST) || + filter.flags) + return skb->len; + } + index = cb->args[0]; if (index < MPLS_LABEL_FIRST_UNRESERVED) index = MPLS_LABEL_FIRST_UNRESERVED; platform_label = rtnl_dereference(net->mpls.platform_label); platform_labels = net->mpls.platform_labels; + + if (filter.filter_set) + flags |= NLM_F_DUMP_FILTERED; + for (; index < platform_labels; index++) { struct mpls_route *rt; + rt = rtnl_dereference(platform_label[index]); if (!rt) continue; + if ((filter.dev && !mpls_rt_uses_dev(rt, filter.dev)) || + (filter.protocol && rt->rt_protocol != filter.protocol)) + continue; + if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, - index, rt, NLM_F_MULTI) < 0) + index, rt, flags) < 0) break; } cb->args[0] = index; diff --git a/net/ncsi/Kconfig b/net/ncsi/Kconfig index 08a8a6031fd7..7f2b46108a24 100644 --- a/net/ncsi/Kconfig +++ b/net/ncsi/Kconfig @@ -10,3 +10,9 @@ config NET_NCSI support. Enable this only if your system connects to a network device via NCSI and the ethernet driver you're using supports the protocol explicitly. +config NCSI_OEM_CMD_GET_MAC + bool "Get NCSI OEM MAC Address" + depends on NET_NCSI + ---help--- + This allows to get MAC address from NCSI firmware and set them back to + controller. diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 8055e3965cef..1dae77c54009 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -68,6 +68,17 @@ enum { NCSI_MODE_MAX }; +/* OEM Vendor Manufacture ID */ +#define NCSI_OEM_MFR_MLX_ID 0x8119 +#define NCSI_OEM_MFR_BCM_ID 0x113d +/* Broadcom specific OEM Command */ +#define NCSI_OEM_BCM_CMD_GMA 0x01 /* CMD ID for Get MAC */ +/* OEM Command payload lengths*/ +#define NCSI_OEM_BCM_CMD_GMA_LEN 12 +/* Mac address offset in OEM response */ +#define BCM_MAC_ADDR_OFFSET 28 + + struct ncsi_channel_version { u32 version; /* Supported BCD encoded NCSI version */ u32 alpha2; /* Supported BCD encoded NCSI version */ @@ -171,6 +182,8 @@ struct ncsi_package; #define NCSI_RESERVED_CHANNEL 0x1f #define NCSI_CHANNEL_INDEX(c) ((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1)) #define NCSI_TO_CHANNEL(p, c) (((p) << NCSI_PACKAGE_SHIFT) | (c)) +#define NCSI_MAX_PACKAGE 8 +#define NCSI_MAX_CHANNEL 32 struct ncsi_channel { unsigned char id; @@ -216,11 +229,15 @@ struct ncsi_request { bool used; /* Request that has been assigned */ unsigned int flags; /* NCSI request property */ #define NCSI_REQ_FLAG_EVENT_DRIVEN 1 +#define NCSI_REQ_FLAG_NETLINK_DRIVEN 2 struct ncsi_dev_priv *ndp; /* Associated NCSI device */ struct sk_buff *cmd; /* Associated NCSI command packet */ struct sk_buff *rsp; /* Associated NCSI response packet */ struct timer_list timer; /* Timer on waiting for response */ bool enabled; /* Time has been enabled or not */ + u32 snd_seq; /* netlink sending sequence number */ + u32 snd_portid; /* netlink portid of sender */ + struct nlmsghdr nlhdr; /* netlink message header */ }; enum { @@ -236,6 +253,7 @@ enum { ncsi_dev_state_probe_dp, ncsi_dev_state_config_sp = 0x0301, ncsi_dev_state_config_cis, + ncsi_dev_state_config_oem_gma, ncsi_dev_state_config_clear_vids, ncsi_dev_state_config_svf, ncsi_dev_state_config_ev, @@ -269,6 +287,7 @@ struct ncsi_dev_priv { #define NCSI_DEV_PROBED 1 /* Finalized NCSI topology */ #define NCSI_DEV_HWA 2 /* Enabled HW arbitration */ #define NCSI_DEV_RESHUFFLE 4 + unsigned int gma_flag; /* OEM GMA flag */ spinlock_t lock; /* Protect the NCSI device */ #if IS_ENABLED(CONFIG_IPV6) unsigned int inet6_addr_num; /* Number of IPv6 addresses */ @@ -305,6 +324,8 @@ struct ncsi_cmd_arg { unsigned short words[8]; unsigned int dwords[4]; }; + unsigned char *data; /* NCSI OEM data */ + struct genl_info *info; /* Netlink information */ }; extern struct list_head ncsi_dev_list; diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c index 7567ca63aae2..356af474e43c 100644 --- a/net/ncsi/ncsi-cmd.c +++ b/net/ncsi/ncsi-cmd.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "internal.h" #include "ncsi-pkt.h" @@ -211,6 +212,25 @@ static int ncsi_cmd_handler_snfc(struct sk_buff *skb, return 0; } +static int ncsi_cmd_handler_oem(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_oem_pkt *cmd; + unsigned int len; + + len = sizeof(struct ncsi_cmd_pkt_hdr) + 4; + if (nca->payload < 26) + len += 26; + else + len += nca->payload; + + cmd = skb_put_zero(skb, len); + memcpy(&cmd->mfr_id, nca->data, nca->payload); + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + static struct ncsi_cmd_handler { unsigned char type; int payload; @@ -244,7 +264,7 @@ static struct ncsi_cmd_handler { { NCSI_PKT_CMD_GNS, 0, ncsi_cmd_handler_default }, { NCSI_PKT_CMD_GNPTS, 0, ncsi_cmd_handler_default }, { NCSI_PKT_CMD_GPS, 0, ncsi_cmd_handler_default }, - { NCSI_PKT_CMD_OEM, 0, NULL }, + { NCSI_PKT_CMD_OEM, -1, ncsi_cmd_handler_oem }, { NCSI_PKT_CMD_PLDM, 0, NULL }, { NCSI_PKT_CMD_GPUUID, 0, ncsi_cmd_handler_default } }; @@ -316,12 +336,24 @@ int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca) return -ENOENT; } - /* Get packet payload length and allocate the request */ - nca->payload = nch->payload; + /* Get packet payload length and allocate the request + * It is expected that if length set as negative in + * handler structure means caller is initializing it + * and setting length in nca before calling xmit function + */ + if (nch->payload >= 0) + nca->payload = nch->payload; nr = ncsi_alloc_command(nca); if (!nr) return -ENOMEM; + /* track netlink information */ + if (nca->req_flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) { + nr->snd_seq = nca->info->snd_seq; + nr->snd_portid = nca->info->snd_portid; + nr->nlhdr = *nca->info->nlhdr; + } + /* Prepare the packet */ nca->id = nr->id; ret = nch->handler(nr->cmd, nca); diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 091284760d21..bfc43b28c7a6 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "internal.h" #include "ncsi-pkt.h" @@ -406,6 +407,9 @@ static void ncsi_request_timeout(struct timer_list *t) { struct ncsi_request *nr = from_timer(nr, t, timer); struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_cmd_pkt *cmd; + struct ncsi_package *np; + struct ncsi_channel *nc; unsigned long flags; /* If the request already had associated response, @@ -419,6 +423,18 @@ static void ncsi_request_timeout(struct timer_list *t) } spin_unlock_irqrestore(&ndp->lock, flags); + if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) { + if (nr->cmd) { + /* Find the package */ + cmd = (struct ncsi_cmd_pkt *) + skb_network_header(nr->cmd); + ncsi_find_package_and_channel(ndp, + cmd->cmd.common.channel, + &np, &nc); + ncsi_send_netlink_timeout(nr, np, nc); + } + } + /* Release the request */ ncsi_free_request(nr); } @@ -635,6 +651,72 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, return 0; } +#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) + +/* NCSI OEM Command APIs */ +static int ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca) +{ + unsigned char data[NCSI_OEM_BCM_CMD_GMA_LEN]; + int ret = 0; + + nca->payload = NCSI_OEM_BCM_CMD_GMA_LEN; + + memset(data, 0, NCSI_OEM_BCM_CMD_GMA_LEN); + *(unsigned int *)data = ntohl(NCSI_OEM_MFR_BCM_ID); + data[5] = NCSI_OEM_BCM_CMD_GMA; + + nca->data = data; + + ret = ncsi_xmit_cmd(nca); + if (ret) + netdev_err(nca->ndp->ndev.dev, + "NCSI: Failed to transmit cmd 0x%x during configure\n", + nca->type); + return ret; +} + +/* OEM Command handlers initialization */ +static struct ncsi_oem_gma_handler { + unsigned int mfr_id; + int (*handler)(struct ncsi_cmd_arg *nca); +} ncsi_oem_gma_handlers[] = { + { NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm } +}; + +static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) +{ + struct ncsi_oem_gma_handler *nch = NULL; + int i; + + /* This function should only be called once, return if flag set */ + if (nca->ndp->gma_flag == 1) + return -1; + + /* Find gma handler for given manufacturer id */ + for (i = 0; i < ARRAY_SIZE(ncsi_oem_gma_handlers); i++) { + if (ncsi_oem_gma_handlers[i].mfr_id == mf_id) { + if (ncsi_oem_gma_handlers[i].handler) + nch = &ncsi_oem_gma_handlers[i]; + break; + } + } + + if (!nch) { + netdev_err(nca->ndp->ndev.dev, + "NCSI: No GMA handler available for MFR-ID (0x%x)\n", + mf_id); + return -1; + } + + /* Set the flag for GMA command which should only be called once */ + nca->ndp->gma_flag = 1; + + /* Get Mac address from NCSI device */ + return nch->handler(nca); +} + +#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */ + static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; @@ -685,7 +767,23 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) goto error; } + nd->state = ncsi_dev_state_config_oem_gma; + break; + case ncsi_dev_state_config_oem_gma: nd->state = ncsi_dev_state_config_clear_vids; + ret = -1; + +#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) + nca.type = NCSI_PKT_CMD_OEM; + nca.package = np->id; + nca.channel = nc->id; + ndp->pending_req_num = 1; + ret = ncsi_gma_handler(&nca, nc->version.mf_id); +#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */ + + if (ret < 0) + schedule_work(&ndp->work); + break; case ncsi_dev_state_config_clear_vids: case ncsi_dev_state_config_svf: diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index 45f33d6dedf7..33314381b4f5 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -20,6 +19,7 @@ #include #include "internal.h" +#include "ncsi-pkt.h" #include "ncsi-netlink.h" static struct genl_family ncsi_genl_family; @@ -29,6 +29,7 @@ static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = { [NCSI_ATTR_PACKAGE_LIST] = { .type = NLA_NESTED }, [NCSI_ATTR_PACKAGE_ID] = { .type = NLA_U32 }, [NCSI_ATTR_CHANNEL_ID] = { .type = NLA_U32 }, + [NCSI_ATTR_DATA] = { .type = NLA_BINARY, .len = 2048 }, }; static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex) @@ -366,6 +367,202 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info) return 0; } +static int ncsi_send_cmd_nl(struct sk_buff *msg, struct genl_info *info) +{ + struct ncsi_dev_priv *ndp; + struct ncsi_pkt_hdr *hdr; + struct ncsi_cmd_arg nca; + unsigned char *data; + u32 package_id; + u32 channel_id; + int len, ret; + + if (!info || !info->attrs) { + ret = -EINVAL; + goto out; + } + + if (!info->attrs[NCSI_ATTR_IFINDEX]) { + ret = -EINVAL; + goto out; + } + + if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) { + ret = -EINVAL; + goto out; + } + + if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) { + ret = -EINVAL; + goto out; + } + + if (!info->attrs[NCSI_ATTR_DATA]) { + ret = -EINVAL; + goto out; + } + + ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) { + ret = -ENODEV; + goto out; + } + + package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); + channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); + + if (package_id >= NCSI_MAX_PACKAGE || channel_id >= NCSI_MAX_CHANNEL) { + ret = -ERANGE; + goto out_netlink; + } + + len = nla_len(info->attrs[NCSI_ATTR_DATA]); + if (len < sizeof(struct ncsi_pkt_hdr)) { + netdev_info(ndp->ndev.dev, "NCSI: no command to send %u\n", + package_id); + ret = -EINVAL; + goto out_netlink; + } else { + data = (unsigned char *)nla_data(info->attrs[NCSI_ATTR_DATA]); + } + + hdr = (struct ncsi_pkt_hdr *)data; + + nca.ndp = ndp; + nca.package = (unsigned char)package_id; + nca.channel = (unsigned char)channel_id; + nca.type = hdr->type; + nca.req_flags = NCSI_REQ_FLAG_NETLINK_DRIVEN; + nca.info = info; + nca.payload = ntohs(hdr->length); + nca.data = data + sizeof(*hdr); + + ret = ncsi_xmit_cmd(&nca); +out_netlink: + if (ret != 0) { + netdev_err(ndp->ndev.dev, + "NCSI: Error %d sending command\n", + ret); + ncsi_send_netlink_err(ndp->ndev.dev, + info->snd_seq, + info->snd_portid, + info->nlhdr, + ret); + } +out: + return ret; +} + +int ncsi_send_netlink_rsp(struct ncsi_request *nr, + struct ncsi_package *np, + struct ncsi_channel *nc) +{ + struct sk_buff *skb; + struct net *net; + void *hdr; + int rc; + + net = dev_net(nr->rsp->dev); + + skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq, + &ncsi_genl_family, 0, NCSI_CMD_SEND_CMD); + if (!hdr) { + kfree_skb(skb); + return -EMSGSIZE; + } + + nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->rsp->dev->ifindex); + if (np) + nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id); + if (nc) + nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id); + else + nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL); + + rc = nla_put(skb, NCSI_ATTR_DATA, nr->rsp->len, (void *)nr->rsp->data); + if (rc) + goto err; + + genlmsg_end(skb, hdr); + return genlmsg_unicast(net, skb, nr->snd_portid); + +err: + kfree_skb(skb); + return rc; +} + +int ncsi_send_netlink_timeout(struct ncsi_request *nr, + struct ncsi_package *np, + struct ncsi_channel *nc) +{ + struct sk_buff *skb; + struct net *net; + void *hdr; + + skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq, + &ncsi_genl_family, 0, NCSI_CMD_SEND_CMD); + if (!hdr) { + kfree_skb(skb); + return -EMSGSIZE; + } + + net = dev_net(nr->cmd->dev); + + nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->cmd->dev->ifindex); + + if (np) + nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id); + else + nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, + NCSI_PACKAGE_INDEX((((struct ncsi_pkt_hdr *) + nr->cmd->data)->channel))); + + if (nc) + nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id); + else + nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL); + + genlmsg_end(skb, hdr); + return genlmsg_unicast(net, skb, nr->snd_portid); +} + +int ncsi_send_netlink_err(struct net_device *dev, + u32 snd_seq, + u32 snd_portid, + struct nlmsghdr *nlhdr, + int err) +{ + struct nlmsghdr *nlh; + struct nlmsgerr *nle; + struct sk_buff *skb; + struct net *net; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + net = dev_net(dev); + + nlh = nlmsg_put(skb, snd_portid, snd_seq, + NLMSG_ERROR, sizeof(*nle), 0); + nle = (struct nlmsgerr *)nlmsg_data(nlh); + nle->error = err; + memcpy(&nle->msg, nlhdr, sizeof(*nlh)); + + nlmsg_end(skb, nlh); + + return nlmsg_unicast(net->genl_sock, skb, snd_portid); +} + static const struct genl_ops ncsi_ops[] = { { .cmd = NCSI_CMD_PKG_INFO, @@ -386,6 +583,12 @@ static const struct genl_ops ncsi_ops[] = { .doit = ncsi_clear_interface_nl, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NCSI_CMD_SEND_CMD, + .policy = ncsi_genl_policy, + .doit = ncsi_send_cmd_nl, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family ncsi_genl_family __ro_after_init = { diff --git a/net/ncsi/ncsi-netlink.h b/net/ncsi/ncsi-netlink.h index 91a5c256f8c4..c4a46887a932 100644 --- a/net/ncsi/ncsi-netlink.h +++ b/net/ncsi/ncsi-netlink.h @@ -14,6 +14,18 @@ #include "internal.h" +int ncsi_send_netlink_rsp(struct ncsi_request *nr, + struct ncsi_package *np, + struct ncsi_channel *nc); +int ncsi_send_netlink_timeout(struct ncsi_request *nr, + struct ncsi_package *np, + struct ncsi_channel *nc); +int ncsi_send_netlink_err(struct net_device *dev, + u32 snd_seq, + u32 snd_portid, + struct nlmsghdr *nlhdr, + int err); + int ncsi_init_netlink(struct net_device *dev); int ncsi_unregister_netlink(struct net_device *dev); diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index 91b4b66438df..4d3f06be38bd 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -151,6 +151,28 @@ struct ncsi_cmd_snfc_pkt { unsigned char pad[22]; }; +/* OEM Request Command as per NCSI Specification */ +struct ncsi_cmd_oem_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + __be32 mfr_id; /* Manufacture ID */ + unsigned char data[]; /* OEM Payload Data */ +}; + +/* OEM Response Packet as per NCSI Specification */ +struct ncsi_rsp_oem_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Command header */ + __be32 mfr_id; /* Manufacture ID */ + unsigned char data[]; /* Payload data */ +}; + +/* Broadcom Response Data */ +struct ncsi_rsp_oem_bcm_pkt { + unsigned char ver; /* Payload Version */ + unsigned char type; /* OEM Command type */ + __be16 len; /* Payload Length */ + unsigned char data[]; /* Cmd specific Data */ +}; + /* Get Link Status */ struct ncsi_rsp_gls_pkt { struct ncsi_rsp_pkt_hdr rsp; /* Response header */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 930c1d3796f0..77e07ba3f493 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -16,9 +16,11 @@ #include #include #include +#include #include "internal.h" #include "ncsi-pkt.h" +#include "ncsi-netlink.h" static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, unsigned short payload) @@ -32,15 +34,25 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, * before calling this function. */ h = (struct ncsi_rsp_pkt_hdr *)skb_network_header(nr->rsp); - if (h->common.revision != NCSI_PKT_REVISION) + + if (h->common.revision != NCSI_PKT_REVISION) { + netdev_dbg(nr->ndp->ndev.dev, + "NCSI: unsupported header revision\n"); return -EINVAL; - if (ntohs(h->common.length) != payload) + } + if (ntohs(h->common.length) != payload) { + netdev_dbg(nr->ndp->ndev.dev, + "NCSI: payload length mismatched\n"); return -EINVAL; + } /* Check on code and reason */ if (ntohs(h->code) != NCSI_PKT_RSP_C_COMPLETED || - ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR) - return -EINVAL; + ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR) { + netdev_dbg(nr->ndp->ndev.dev, + "NCSI: non zero response/reason code\n"); + return -EPERM; + } /* Validate checksum, which might be zeroes if the * sender doesn't support checksum according to NCSI @@ -52,8 +64,11 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, checksum = ncsi_calculate_checksum((unsigned char *)h, sizeof(*h) + payload - 4); - if (*pchecksum != htonl(checksum)) + + if (*pchecksum != htonl(checksum)) { + netdev_dbg(nr->ndp->ndev.dev, "NCSI: checksum mismatched\n"); return -EINVAL; + } return 0; } @@ -596,6 +611,87 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr) return 0; } +/* Response handler for Broadcom command Get Mac Address */ +static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr) +{ + struct ncsi_dev_priv *ndp = nr->ndp; + struct net_device *ndev = ndp->ndev.dev; + const struct net_device_ops *ops = ndev->netdev_ops; + struct ncsi_rsp_oem_pkt *rsp; + struct sockaddr saddr; + int ret = 0; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); + + saddr.sa_family = ndev->type; + ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + memcpy(saddr.sa_data, &rsp->data[BCM_MAC_ADDR_OFFSET], ETH_ALEN); + /* Increase mac address by 1 for BMC's address */ + saddr.sa_data[ETH_ALEN - 1]++; + ret = ops->ndo_set_mac_address(ndev, &saddr); + if (ret < 0) + netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); + + return ret; +} + +/* Response handler for Broadcom card */ +static int ncsi_rsp_handler_oem_bcm(struct ncsi_request *nr) +{ + struct ncsi_rsp_oem_bcm_pkt *bcm; + struct ncsi_rsp_oem_pkt *rsp; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); + bcm = (struct ncsi_rsp_oem_bcm_pkt *)(rsp->data); + + if (bcm->type == NCSI_OEM_BCM_CMD_GMA) + return ncsi_rsp_handler_oem_bcm_gma(nr); + return 0; +} + +static struct ncsi_rsp_oem_handler { + unsigned int mfr_id; + int (*handler)(struct ncsi_request *nr); +} ncsi_rsp_oem_handlers[] = { + { NCSI_OEM_MFR_MLX_ID, NULL }, + { NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm } +}; + +/* Response handler for OEM command */ +static int ncsi_rsp_handler_oem(struct ncsi_request *nr) +{ + struct ncsi_rsp_oem_handler *nrh = NULL; + struct ncsi_rsp_oem_pkt *rsp; + unsigned int mfr_id, i; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); + mfr_id = ntohl(rsp->mfr_id); + + /* Check for manufacturer id and Find the handler */ + for (i = 0; i < ARRAY_SIZE(ncsi_rsp_oem_handlers); i++) { + if (ncsi_rsp_oem_handlers[i].mfr_id == mfr_id) { + if (ncsi_rsp_oem_handlers[i].handler) + nrh = &ncsi_rsp_oem_handlers[i]; + else + nrh = NULL; + + break; + } + } + + if (!nrh) { + netdev_err(nr->ndp->ndev.dev, "Received unrecognized OEM packet with MFR-ID (0x%x)\n", + mfr_id); + return -ENOENT; + } + + /* Process the packet */ + return nrh->handler(nr); +} + static int ncsi_rsp_handler_gvi(struct ncsi_request *nr) { struct ncsi_rsp_gvi_pkt *rsp; @@ -900,6 +996,26 @@ static int ncsi_rsp_handler_gpuuid(struct ncsi_request *nr) return 0; } +static int ncsi_rsp_handler_netlink(struct ncsi_request *nr) +{ + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_rsp_pkt *rsp; + struct ncsi_package *np; + struct ncsi_channel *nc; + int ret; + + /* Find the package */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + &np, &nc); + if (!np) + return -ENODEV; + + ret = ncsi_send_netlink_rsp(nr, np, nc); + + return ret; +} + static struct ncsi_rsp_handler { unsigned char type; int payload; @@ -932,7 +1048,7 @@ static struct ncsi_rsp_handler { { NCSI_PKT_RSP_GNS, 172, ncsi_rsp_handler_gns }, { NCSI_PKT_RSP_GNPTS, 172, ncsi_rsp_handler_gnpts }, { NCSI_PKT_RSP_GPS, 8, ncsi_rsp_handler_gps }, - { NCSI_PKT_RSP_OEM, 0, NULL }, + { NCSI_PKT_RSP_OEM, -1, ncsi_rsp_handler_oem }, { NCSI_PKT_RSP_PLDM, 0, NULL }, { NCSI_PKT_RSP_GPUUID, 20, ncsi_rsp_handler_gpuuid } }; @@ -1002,6 +1118,17 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, netdev_warn(ndp->ndev.dev, "NCSI: 'bad' packet ignored for type 0x%x\n", hdr->type); + + if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) { + if (ret == -EPERM) + goto out_netlink; + else + ncsi_send_netlink_err(ndp->ndev.dev, + nr->snd_seq, + nr->snd_portid, + &nr->nlhdr, + ret); + } goto out; } @@ -1011,6 +1138,17 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, netdev_err(ndp->ndev.dev, "NCSI: Handler for packet type 0x%x returned %d\n", hdr->type, ret); + +out_netlink: + if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) { + ret = ncsi_rsp_handler_netlink(nr); + if (ret) { + netdev_err(ndp->ndev.dev, + "NCSI: Netlink handler for packet type 0x%x returned %d\n", + hdr->type, ret); + } + } + out: ncsi_free_request(nr); return ret; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index f61c306de1d0..2ab870ef233a 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -625,6 +625,13 @@ config NFT_FIB_INET The lookup will be delegated to the IPv4 or IPv6 FIB depending on the protocol of the packet. +config NFT_XFRM + tristate "Netfilter nf_tables xfrm/IPSec security association matching" + depends on XFRM + help + This option adds an expression that you can use to extract properties + of a packets security association. + config NFT_SOCKET tristate "Netfilter nf_tables socket match support" depends on IPV6 || IPV6=n diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 16895e045b66..4ddf3ef51ece 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -113,6 +113,7 @@ obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o obj-$(CONFIG_NFT_SOCKET) += nft_socket.o obj-$(CONFIG_NFT_OSF) += nft_osf.o obj-$(CONFIG_NFT_TPROXY) += nft_tproxy.o +obj-$(CONFIG_NFT_XFRM) += nft_xfrm.o # nf_tables netdev obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index bc4bd247bb7d..1577f2f76060 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -55,11 +55,15 @@ MODULE_AUTHOR("Jozsef Kadlecsik "); MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); -/* When the nfnl mutex is held: */ +/* When the nfnl mutex or ip_set_ref_lock is held: */ #define ip_set_dereference(p) \ - rcu_dereference_protected(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) + rcu_dereference_protected(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ + lockdep_is_held(&ip_set_ref_lock)) #define ip_set(inst, id) \ ip_set_dereference((inst)->ip_set_list)[id] +#define ip_set_ref_netlink(inst,id) \ + rcu_dereference_raw((inst)->ip_set_list)[id] /* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. Adding/deleting types is @@ -693,21 +697,20 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index) EXPORT_SYMBOL_GPL(ip_set_put_byindex); /* Get the name of a set behind a set index. - * We assume the set is referenced, so it does exist and - * can't be destroyed. The set cannot be renamed due to - * the referencing either. - * + * Set itself is protected by RCU, but its name isn't: to protect against + * renaming, grab ip_set_ref_lock as reader (see ip_set_rename()) and copy the + * name. */ -const char * -ip_set_name_byindex(struct net *net, ip_set_id_t index) +void +ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name) { - const struct ip_set *set = ip_set_rcu_get(net, index); + struct ip_set *set = ip_set_rcu_get(net, index); BUG_ON(!set); - BUG_ON(set->ref == 0); - /* Referenced, so it's safe */ - return set->name; + read_lock_bh(&ip_set_ref_lock); + strncpy(name, set->name, IPSET_MAXNAMELEN); + read_unlock_bh(&ip_set_ref_lock); } EXPORT_SYMBOL_GPL(ip_set_name_byindex); @@ -961,7 +964,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Wraparound */ goto cleanup; - list = kcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); + list = kvcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ @@ -973,7 +976,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Use new list */ index = inst->ip_set_max; inst->ip_set_max = i; - kfree(tmp); + kvfree(tmp); ret = 0; } else if (ret) { goto cleanup; @@ -1153,7 +1156,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl, if (!set) return -ENOENT; - read_lock_bh(&ip_set_ref_lock); + write_lock_bh(&ip_set_ref_lock); if (set->ref != 0) { ret = -IPSET_ERR_REFERENCED; goto out; @@ -1170,7 +1173,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl, strncpy(set->name, name2, IPSET_MAXNAMELEN); out: - read_unlock_bh(&ip_set_ref_lock); + write_unlock_bh(&ip_set_ref_lock); return ret; } @@ -1252,7 +1255,7 @@ ip_set_dump_done(struct netlink_callback *cb) struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; - struct ip_set *set = ip_set(inst, index); + struct ip_set *set = ip_set_ref_netlink(inst, index); if (set->variant->uref) set->variant->uref(set, cb, false); @@ -1441,7 +1444,7 @@ next_set: release_refcount: /* If there was an error or set is done, release set */ if (ret || !cb->args[IPSET_CB_ARG0]) { - set = ip_set(inst, index); + set = ip_set_ref_netlink(inst, index); if (set->variant->uref) set->variant->uref(set, cb, false); pr_debug("release set %s\n", set->name); @@ -2059,7 +2062,7 @@ ip_set_net_init(struct net *net) if (inst->ip_set_max >= IPSET_INVALID_ID) inst->ip_set_max = IPSET_INVALID_ID - 1; - list = kcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); + list = kvcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); if (!list) return -ENOMEM; inst->is_deleted = false; @@ -2087,7 +2090,7 @@ ip_set_net_exit(struct net *net) } } nfnl_unlock(NFNL_SUBSYS_IPSET); - kfree(rcu_dereference_protected(inst->ip_set_list, 1)); + kvfree(rcu_dereference_protected(inst->ip_set_list, 1)); } static struct pernet_operations ip_set_net_ops = { diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 8a33dac4e805..e287da68d5fa 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -15,7 +15,7 @@ #define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c) #define ipset_dereference_protected(p, set) \ - __ipset_dereference_protected(p, spin_is_locked(&(set)->lock)) + __ipset_dereference_protected(p, lockdep_is_held(&(set)->lock)) #define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1) diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index d391485a6acd..613e18e720a4 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -213,13 +213,13 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + if (e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + if (e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } @@ -493,13 +493,13 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + if (e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + if (e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 072a658fde04..4eef55da0878 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -148,9 +148,7 @@ __list_set_del_rcu(struct rcu_head * rcu) { struct set_elem *e = container_of(rcu, struct set_elem, rcu); struct ip_set *set = e->set; - struct list_set *map = set->data; - ip_set_put_byindex(map->net, e->id); ip_set_ext_destroy(set, e); kfree(e); } @@ -158,15 +156,21 @@ __list_set_del_rcu(struct rcu_head * rcu) static inline void list_set_del(struct ip_set *set, struct set_elem *e) { + struct list_set *map = set->data; + set->elements--; list_del_rcu(&e->list); + ip_set_put_byindex(map->net, e->id); call_rcu(&e->rcu, __list_set_del_rcu); } static inline void -list_set_replace(struct set_elem *e, struct set_elem *old) +list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) { + struct list_set *map = set->data; + list_replace_rcu(&old->list, &e->list); + ip_set_put_byindex(map->net, old->id); call_rcu(&old->rcu, __list_set_del_rcu); } @@ -298,7 +302,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, INIT_LIST_HEAD(&e->list); list_set_init_extensions(set, ext, e); if (n) - list_set_replace(e, n); + list_set_replace(set, e, n); else if (next) list_add_tail_rcu(&e->list, &next->list); else if (prev) @@ -486,6 +490,7 @@ list_set_list(const struct ip_set *set, const struct list_set *map = set->data; struct nlattr *atd, *nested; u32 i = 0, first = cb->args[IPSET_CB_ARG0]; + char name[IPSET_MAXNAMELEN]; struct set_elem *e; int ret = 0; @@ -504,8 +509,8 @@ list_set_list(const struct ip_set *set, nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; - if (nla_put_string(skb, IPSET_ATTR_NAME, - ip_set_name_byindex(map->net, e->id))) + ip_set_name_byindex(map->net, e->id, name); + if (nla_put_string(skb, IPSET_ATTR_NAME, name)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, e, true)) goto nla_put_failure; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 7ca926a03b81..fe9abf3cc10a 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1686,8 +1686,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, skb_reset_network_header(skb); IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); - ipv4_update_pmtu(skb, ipvs->net, - mtu, 0, 0, 0, 0); + ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0); /* Client uses PMTUD? */ if (!(frag_off & htons(IP_DF))) goto ignore_ipip; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 62eefea48973..83395bf6dc35 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3234,7 +3234,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, /* Try to find the service for which to dump destinations */ if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, - ip_vs_cmd_policy, NULL)) + ip_vs_cmd_policy, cb->extack)) goto out_err; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index d4020c5e831d..2526be6b3d90 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1616,7 +1616,7 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) EnterFunction(7); /* Receive a packet */ - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, buflen); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, buflen); len = sock_recvmsg(sock, &msg, MSG_DONTWAIT); if (len < 0) return len; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index a676d5f76bdc..e92e749aff53 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -379,7 +379,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, return false; } - l4proto = __nf_ct_l4proto_find(l3num, protonum); + l4proto = __nf_ct_l4proto_find(protonum); ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple, l4proto); @@ -539,7 +539,7 @@ destroy_conntrack(struct nf_conntrack *nfct) nf_ct_tmpl_free(ct); return; } - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); if (l4proto->destroy) l4proto->destroy(ct); @@ -840,7 +840,7 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, enum ip_conntrack_info oldinfo; struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo); - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); if (l4proto->allow_clash && !nf_ct_is_dying(ct) && atomic_inc_not_zero(&ct->ct_general.use)) { @@ -1073,19 +1073,22 @@ static unsigned int early_drop_list(struct net *net, return drops; } -static noinline int early_drop(struct net *net, unsigned int _hash) +static noinline int early_drop(struct net *net, unsigned int hash) { - unsigned int i; + unsigned int i, bucket; for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { struct hlist_nulls_head *ct_hash; - unsigned int hash, hsize, drops; + unsigned int hsize, drops; rcu_read_lock(); nf_conntrack_get_ht(&ct_hash, &hsize); - hash = reciprocal_scale(_hash++, hsize); + if (!i) + bucket = reciprocal_scale(hash, hsize); + else + bucket = (bucket + 1) % hsize; - drops = early_drop_list(net, &ct_hash[hash]); + drops = early_drop_list(net, &ct_hash[bucket]); rcu_read_unlock(); if (drops) { @@ -1109,7 +1112,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct) if (!test_bit(IPS_ASSURED_BIT, &ct->status)) return true; - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) return true; @@ -1370,12 +1373,6 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; - if (!l4proto->new(ct, skb, dataoff)) { - nf_conntrack_free(ct); - pr_debug("can't track with proto module\n"); - return NULL; - } - if (timeout_ext) nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout), GFP_ATOMIC); @@ -1436,12 +1433,12 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, /* On success, returns 0, sets skb->_nfct | ctinfo */ static int -resolve_normal_ct(struct net *net, struct nf_conn *tmpl, +resolve_normal_ct(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, - u_int16_t l3num, u_int8_t protonum, - const struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto, + const struct nf_hook_state *state) { const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple tuple; @@ -1452,17 +1449,18 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), - dataoff, l3num, protonum, net, &tuple, l4proto)) { + dataoff, state->pf, protonum, state->net, + &tuple, l4proto)) { pr_debug("Can't get tuple\n"); return 0; } /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - hash = hash_conntrack_raw(&tuple, net); - h = __nf_conntrack_find_get(net, zone, &tuple, hash); + hash = hash_conntrack_raw(&tuple, state->net); + h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); if (!h) { - h = init_conntrack(net, tmpl, &tuple, l4proto, + h = init_conntrack(state->net, tmpl, &tuple, l4proto, skb, dataoff, hash); if (!h) return 0; @@ -1491,13 +1489,45 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, return 0; } +/* + * icmp packets need special treatment to handle error messages that are + * related to a connection. + * + * Callers need to check if skb has a conntrack assigned when this + * helper returns; in such case skb belongs to an already known connection. + */ +static unsigned int __cold +nf_conntrack_handle_icmp(struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int dataoff, + u8 protonum, + const struct nf_hook_state *state) +{ + int ret; + + if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP) + ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state); +#if IS_ENABLED(CONFIG_IPV6) + else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6) + ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state); +#endif + else + return NF_ACCEPT; + + if (ret <= 0) { + NF_CT_STAT_INC_ATOMIC(state->net, error); + NF_CT_STAT_INC_ATOMIC(state->net, invalid); + } + + return ret; +} + unsigned int -nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, - struct sk_buff *skb) +nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) { const struct nf_conntrack_l4proto *l4proto; - struct nf_conn *ct, *tmpl; enum ip_conntrack_info ctinfo; + struct nf_conn *ct, *tmpl; u_int8_t protonum; int dataoff, ret; @@ -1506,32 +1536,28 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, /* Previously seen (loopback or untracked)? Ignore. */ if ((tmpl && !nf_ct_is_template(tmpl)) || ctinfo == IP_CT_UNTRACKED) { - NF_CT_STAT_INC_ATOMIC(net, ignore); + NF_CT_STAT_INC_ATOMIC(state->net, ignore); return NF_ACCEPT; } skb->_nfct = 0; } /* rcu_read_lock()ed by nf_hook_thresh */ - dataoff = get_l4proto(skb, skb_network_offset(skb), pf, &protonum); + dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); if (dataoff <= 0) { pr_debug("not prepared to track yet or error occurred\n"); - NF_CT_STAT_INC_ATOMIC(net, error); - NF_CT_STAT_INC_ATOMIC(net, invalid); + NF_CT_STAT_INC_ATOMIC(state->net, error); + NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; } - l4proto = __nf_ct_l4proto_find(pf, protonum); + l4proto = __nf_ct_l4proto_find(protonum); - /* It may be an special packet, error, unclean... - * inverse of the return code tells to the netfilter - * core what to do with the packet. */ - if (l4proto->error != NULL) { - ret = l4proto->error(net, tmpl, skb, dataoff, pf, hooknum); + if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { + ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, + protonum, state); if (ret <= 0) { - NF_CT_STAT_INC_ATOMIC(net, error); - NF_CT_STAT_INC_ATOMIC(net, invalid); ret = -ret; goto out; } @@ -1540,10 +1566,11 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, goto out; } repeat: - ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, l4proto); + ret = resolve_normal_ct(tmpl, skb, dataoff, + protonum, l4proto, state); if (ret < 0) { /* Too stressed to deal. */ - NF_CT_STAT_INC_ATOMIC(net, drop); + NF_CT_STAT_INC_ATOMIC(state->net, drop); ret = NF_DROP; goto out; } @@ -1551,21 +1578,21 @@ repeat: ct = nf_ct_get(skb, &ctinfo); if (!ct) { /* Not valid part of a connection */ - NF_CT_STAT_INC_ATOMIC(net, invalid); + NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; } - ret = l4proto->packet(ct, skb, dataoff, ctinfo); + ret = l4proto->packet(ct, skb, dataoff, ctinfo, state); if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ pr_debug("nf_conntrack_in: Can't track with proto module\n"); nf_conntrack_put(&ct->ct_general); skb->_nfct = 0; - NF_CT_STAT_INC_ATOMIC(net, invalid); + NF_CT_STAT_INC_ATOMIC(state->net, invalid); if (ret == -NF_DROP) - NF_CT_STAT_INC_ATOMIC(net, drop); + NF_CT_STAT_INC_ATOMIC(state->net, drop); /* Special case: TCP tracker reports an attempt to reopen a * closed/aborted connection. We have to go back and create a * fresh conntrack. @@ -1594,8 +1621,7 @@ bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, rcu_read_lock(); ret = nf_ct_invert_tuple(inverse, orig, - __nf_ct_l4proto_find(orig->src.l3num, - orig->dst.protonum)); + __nf_ct_l4proto_find(orig->dst.protonum)); rcu_read_unlock(); return ret; } @@ -1752,7 +1778,7 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) if (dataoff <= 0) return -1; - l4proto = nf_ct_l4proto_find_get(l3num, l4num); + l4proto = nf_ct_l4proto_find_get(l4num); if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, l4num, net, &tuple, l4proto)) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 27b84231db10..3034038bfdf0 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -610,8 +610,7 @@ static int exp_seq_show(struct seq_file *s, void *v) expect->tuple.src.l3num, expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, - __nf_ct_l4proto_find(expect->tuple.src.l3num, - expect->tuple.dst.protonum)); + __nf_ct_l4proto_find(expect->tuple.dst.protonum)); if (expect->flags & NF_CT_EXPECT_PERMANENT) { seq_puts(s, "PERMANENT"); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 036207ecaf16..4ae8e528943a 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -135,8 +135,7 @@ static int ctnetlink_dump_tuples(struct sk_buff *skb, ret = ctnetlink_dump_tuples_ip(skb, tuple); if (ret >= 0) { - l4proto = __nf_ct_l4proto_find(tuple->src.l3num, - tuple->dst.protonum); + l4proto = __nf_ct_l4proto_find(tuple->dst.protonum); ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); } rcu_read_unlock(); @@ -184,7 +183,7 @@ static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) struct nlattr *nest_proto; int ret; - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); if (!l4proto->to_nlattr) return 0; @@ -592,7 +591,7 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct) len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1); len *= 3u; /* ORIG, REPLY, MASTER */ - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); len += l4proto->nlattr_size; if (l4proto->nlattr_tuple_size) { len4 = l4proto->nlattr_tuple_size(); @@ -821,6 +820,7 @@ static int ctnetlink_done(struct netlink_callback *cb) } struct ctnetlink_filter { + u8 family; struct { u_int32_t val; u_int32_t mask; @@ -828,31 +828,39 @@ struct ctnetlink_filter { }; static struct ctnetlink_filter * -ctnetlink_alloc_filter(const struct nlattr * const cda[]) +ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) { -#ifdef CONFIG_NF_CONNTRACK_MARK struct ctnetlink_filter *filter; +#ifndef CONFIG_NF_CONNTRACK_MARK + if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) + return ERR_PTR(-EOPNOTSUPP); +#endif + filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (filter == NULL) return ERR_PTR(-ENOMEM); - filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); - filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + filter->family = family; - return filter; -#else - return ERR_PTR(-EOPNOTSUPP); +#ifdef CONFIG_NF_CONNTRACK_MARK + if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { + filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); + filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + } #endif + return filter; } static int ctnetlink_start(struct netlink_callback *cb) { const struct nlattr * const *cda = cb->data; struct ctnetlink_filter *filter = NULL; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + u8 family = nfmsg->nfgen_family; - if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { - filter = ctnetlink_alloc_filter(cda); + if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) { + filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); } @@ -866,13 +874,24 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) struct ctnetlink_filter *filter = data; if (filter == NULL) - return 1; + goto out; + + /* Match entries of a given L3 protocol number. + * If it is not specified, ie. l3proto == 0, + * then match everything. + */ + if (filter->family && nf_ct_l3num(ct) != filter->family) + goto ignore_entry; #ifdef CONFIG_NF_CONNTRACK_MARK - if ((ct->mark & filter->mark.mask) == filter->mark.val) - return 1; + if ((ct->mark & filter->mark.mask) != filter->mark.val) + goto ignore_entry; #endif +out: + return 1; + +ignore_entry: return 0; } @@ -883,8 +902,6 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; - struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - u_int8_t l3proto = nfmsg->nfgen_family; struct nf_conn *nf_ct_evict[8]; int res, i; spinlock_t *lockp; @@ -923,11 +940,6 @@ restart: if (!net_eq(net, nf_ct_net(ct))) continue; - /* Dump entries of a given L3 protocol number. - * If it is not specified, ie. l3proto == 0, - * then dump everything. */ - if (l3proto && nf_ct_l3num(ct) != l3proto) - continue; if (cb->args[1]) { if (ct != last) continue; @@ -1048,7 +1060,7 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); rcu_read_lock(); - l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); + l4proto = __nf_ct_l4proto_find(tuple->dst.protonum); if (likely(l4proto->nlattr_to_tuple)) { ret = nla_validate_nested(attr, CTA_PROTO_MAX, @@ -1213,12 +1225,12 @@ static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) static int ctnetlink_flush_conntrack(struct net *net, const struct nlattr * const cda[], - u32 portid, int report) + u32 portid, int report, u8 family) { struct ctnetlink_filter *filter = NULL; - if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { - filter = ctnetlink_alloc_filter(cda); + if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) { + filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); } @@ -1257,7 +1269,7 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, else { return ctnetlink_flush_conntrack(net, cda, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(nlh), u3); } if (err < 0) @@ -1696,7 +1708,7 @@ static int ctnetlink_change_protoinfo(struct nf_conn *ct, return err; rcu_read_lock(); - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); if (l4proto->from_nlattr) err = l4proto->from_nlattr(tb, ct); rcu_read_unlock(); @@ -2656,8 +2668,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb, rcu_read_lock(); ret = ctnetlink_dump_tuples_ip(skb, &m); if (ret >= 0) { - l4proto = __nf_ct_l4proto_find(tuple->src.l3num, - tuple->dst.protonum); + l4proto = __nf_ct_l4proto_find(tuple->dst.protonum); ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto); } rcu_read_unlock(); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 51c5d7eec0a3..40643af7137e 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -43,7 +43,7 @@ extern unsigned int nf_conntrack_net_id; -static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly; +static struct nf_conntrack_l4proto __rcu *nf_ct_protos[MAX_NF_CT_PROTO + 1] __read_mostly; static DEFINE_MUTEX(nf_ct_proto_mutex); @@ -124,23 +124,21 @@ void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_ct_l4proto_log_invalid); #endif -const struct nf_conntrack_l4proto * -__nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto) +const struct nf_conntrack_l4proto *__nf_ct_l4proto_find(u8 l4proto) { - if (unlikely(l3proto >= NFPROTO_NUMPROTO || nf_ct_protos[l3proto] == NULL)) + if (unlikely(l4proto >= ARRAY_SIZE(nf_ct_protos))) return &nf_conntrack_l4proto_generic; - return rcu_dereference(nf_ct_protos[l3proto][l4proto]); + return rcu_dereference(nf_ct_protos[l4proto]); } EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); -const struct nf_conntrack_l4proto * -nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) +const struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u8 l4num) { const struct nf_conntrack_l4proto *p; rcu_read_lock(); - p = __nf_ct_l4proto_find(l3num, l4num); + p = __nf_ct_l4proto_find(l4num); if (!try_module_get(p->me)) p = &nf_conntrack_l4proto_generic; rcu_read_unlock(); @@ -159,8 +157,7 @@ static int kill_l4proto(struct nf_conn *i, void *data) { const struct nf_conntrack_l4proto *l4proto; l4proto = data; - return nf_ct_protonum(i) == l4proto->l4proto && - nf_ct_l3num(i) == l4proto->l3proto; + return nf_ct_protonum(i) == l4proto->l4proto; } static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, @@ -219,48 +216,20 @@ int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *l4proto) { int ret = 0; - if (l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos)) - return -EBUSY; - if ((l4proto->to_nlattr && l4proto->nlattr_size == 0) || (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size)) return -EINVAL; mutex_lock(&nf_ct_proto_mutex); - if (!nf_ct_protos[l4proto->l3proto]) { - /* l3proto may be loaded latter. */ - struct nf_conntrack_l4proto __rcu **proto_array; - int i; - - proto_array = - kmalloc_array(MAX_NF_CT_PROTO, - sizeof(struct nf_conntrack_l4proto *), - GFP_KERNEL); - if (proto_array == NULL) { - ret = -ENOMEM; - goto out_unlock; - } - - for (i = 0; i < MAX_NF_CT_PROTO; i++) - RCU_INIT_POINTER(proto_array[i], - &nf_conntrack_l4proto_generic); - - /* Before making proto_array visible to lockless readers, - * we must make sure its content is committed to memory. - */ - smp_wmb(); - - nf_ct_protos[l4proto->l3proto] = proto_array; - } else if (rcu_dereference_protected( - nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + if (rcu_dereference_protected( + nf_ct_protos[l4proto->l4proto], lockdep_is_held(&nf_ct_proto_mutex) ) != &nf_conntrack_l4proto_generic) { ret = -EBUSY; goto out_unlock; } - rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], - l4proto); + rcu_assign_pointer(nf_ct_protos[l4proto->l4proto], l4proto); out_unlock: mutex_unlock(&nf_ct_proto_mutex); return ret; @@ -274,7 +243,7 @@ int nf_ct_l4proto_pernet_register_one(struct net *net, struct nf_proto_net *pn = NULL; if (l4proto->init_net) { - ret = l4proto->init_net(net, l4proto->l3proto); + ret = l4proto->init_net(net); if (ret < 0) goto out; } @@ -296,13 +265,13 @@ EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register_one); static void __nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto) { - BUG_ON(l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos)); + BUG_ON(l4proto->l4proto >= ARRAY_SIZE(nf_ct_protos)); BUG_ON(rcu_dereference_protected( - nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + nf_ct_protos[l4proto->l4proto], lockdep_is_held(&nf_ct_proto_mutex) ) != l4proto); - rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + rcu_assign_pointer(nf_ct_protos[l4proto->l4proto], &nf_conntrack_l4proto_generic); } @@ -352,7 +321,7 @@ static int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], unsigned int num_proto) { - int ret = -EINVAL, ver; + int ret = -EINVAL; unsigned int i; for (i = 0; i < num_proto; i++) { @@ -361,9 +330,8 @@ nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], break; } if (i != num_proto) { - ver = l4proto[i]->l3proto == PF_INET6 ? 6 : 4; - pr_err("nf_conntrack_ipv%d: can't register l4 %d proto.\n", - ver, l4proto[i]->l4proto); + pr_err("nf_conntrack: can't register l4 %d proto.\n", + l4proto[i]->l4proto); nf_ct_l4proto_unregister(l4proto, i); } return ret; @@ -382,9 +350,8 @@ int nf_ct_l4proto_pernet_register(struct net *net, break; } if (i != num_proto) { - pr_err("nf_conntrack_proto_%d %d: pernet registration failed\n", - l4proto[i]->l4proto, - l4proto[i]->l3proto == PF_INET6 ? 6 : 4); + pr_err("nf_conntrack %d: pernet registration failed\n", + l4proto[i]->l4proto); nf_ct_l4proto_pernet_unregister(net, l4proto, i); } return ret; @@ -455,7 +422,7 @@ static unsigned int ipv4_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_conntrack_in(state->net, PF_INET, state->hook, skb); + return nf_conntrack_in(skb, state); } static unsigned int ipv4_conntrack_local(void *priv, @@ -477,7 +444,7 @@ static unsigned int ipv4_conntrack_local(void *priv, return NF_ACCEPT; } - return nf_conntrack_in(state->net, PF_INET, state->hook, skb); + return nf_conntrack_in(skb, state); } /* Connection tracking may drop packets, but never alters them, so @@ -690,14 +657,14 @@ static unsigned int ipv6_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); + return nf_conntrack_in(skb, state); } static unsigned int ipv6_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); + return nf_conntrack_in(skb, state); } static unsigned int ipv6_helper(void *priv, @@ -911,37 +878,26 @@ void nf_ct_netns_put(struct net *net, uint8_t nfproto) EXPORT_SYMBOL_GPL(nf_ct_netns_put); static const struct nf_conntrack_l4proto * const builtin_l4proto[] = { - &nf_conntrack_l4proto_tcp4, - &nf_conntrack_l4proto_udp4, + &nf_conntrack_l4proto_tcp, + &nf_conntrack_l4proto_udp, &nf_conntrack_l4proto_icmp, #ifdef CONFIG_NF_CT_PROTO_DCCP - &nf_conntrack_l4proto_dccp4, + &nf_conntrack_l4proto_dccp, #endif #ifdef CONFIG_NF_CT_PROTO_SCTP - &nf_conntrack_l4proto_sctp4, + &nf_conntrack_l4proto_sctp, #endif #ifdef CONFIG_NF_CT_PROTO_UDPLITE - &nf_conntrack_l4proto_udplite4, + &nf_conntrack_l4proto_udplite, #endif #if IS_ENABLED(CONFIG_IPV6) - &nf_conntrack_l4proto_tcp6, - &nf_conntrack_l4proto_udp6, &nf_conntrack_l4proto_icmpv6, -#ifdef CONFIG_NF_CT_PROTO_DCCP - &nf_conntrack_l4proto_dccp6, -#endif -#ifdef CONFIG_NF_CT_PROTO_SCTP - &nf_conntrack_l4proto_sctp6, -#endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE - &nf_conntrack_l4proto_udplite6, -#endif #endif /* CONFIG_IPV6 */ }; int nf_conntrack_proto_init(void) { - int ret = 0; + int ret = 0, i; ret = nf_register_sockopt(&so_getorigdst); if (ret < 0) @@ -952,6 +908,11 @@ int nf_conntrack_proto_init(void) if (ret < 0) goto cleanup_sockopt; #endif + + for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++) + RCU_INIT_POINTER(nf_ct_protos[i], + &nf_conntrack_l4proto_generic); + ret = nf_ct_l4proto_register(builtin_l4proto, ARRAY_SIZE(builtin_l4proto)); if (ret < 0) @@ -969,17 +930,10 @@ cleanup_sockopt: void nf_conntrack_proto_fini(void) { - unsigned int i; - nf_unregister_sockopt(&so_getorigdst); #if IS_ENABLED(CONFIG_IPV6) nf_unregister_sockopt(&so_getorigdst6); #endif - /* No need to call nf_ct_l4proto_unregister(), the register - * tables are free'd here anyway. - */ - for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++) - kfree(nf_ct_protos[i]); } int nf_conntrack_proto_pernet_init(struct net *net) @@ -988,8 +942,7 @@ int nf_conntrack_proto_pernet_init(struct net *net) struct nf_proto_net *pn = nf_ct_l4proto_net(net, &nf_conntrack_l4proto_generic); - err = nf_conntrack_l4proto_generic.init_net(net, - nf_conntrack_l4proto_generic.l3proto); + err = nf_conntrack_l4proto_generic.init_net(net); if (err < 0) return err; err = nf_ct_l4proto_register_sysctl(net, diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index f3f91ed2c21a..023c1445bc39 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -384,27 +384,19 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = }, }; -static inline struct nf_dccp_net *dccp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.dccp; -} - -static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) +static noinline bool +dccp_new(struct nf_conn *ct, const struct sk_buff *skb, + const struct dccp_hdr *dh) { struct net *net = nf_ct_net(ct); struct nf_dccp_net *dn; - struct dccp_hdr _dh, *dh; const char *msg; u_int8_t state; - dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); - BUG_ON(dh == NULL); - state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; switch (state) { default: - dn = dccp_pernet(net); + dn = nf_dccp_pernet(net); if (dn->dccp_loose == 0) { msg = "not picking up existing connection "; goto out_invalid; @@ -438,8 +430,51 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh) ntohl(dhack->dccph_ack_nr_low); } -static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, enum ip_conntrack_info ctinfo) +static bool dccp_error(const struct dccp_hdr *dh, + struct sk_buff *skb, unsigned int dataoff, + const struct nf_hook_state *state) +{ + unsigned int dccp_len = skb->len - dataoff; + unsigned int cscov; + const char *msg; + + if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) || + dh->dccph_doff * 4 > dccp_len) { + msg = "nf_ct_dccp: truncated/malformed packet "; + goto out_invalid; + } + + cscov = dccp_len; + if (dh->dccph_cscov) { + cscov = (dh->dccph_cscov - 1) * 4; + if (cscov > dccp_len) { + msg = "nf_ct_dccp: bad checksum coverage "; + goto out_invalid; + } + } + + if (state->hook == NF_INET_PRE_ROUTING && + state->net->ct.sysctl_checksum && + nf_checksum_partial(skb, state->hook, dataoff, cscov, + IPPROTO_DCCP, state->pf)) { + msg = "nf_ct_dccp: bad checksum "; + goto out_invalid; + } + + if (dh->dccph_type >= DCCP_PKT_INVALID) { + msg = "nf_ct_dccp: reserved packet type "; + goto out_invalid; + } + return false; +out_invalid: + nf_l4proto_log_invalid(skb, state->net, state->pf, + IPPROTO_DCCP, "%s", msg); + return true; +} + +static int dccp_packet(struct nf_conn *ct, struct sk_buff *skb, + unsigned int dataoff, enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct dccp_hdr _dh, *dh; @@ -448,8 +483,15 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int *timeouts; dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); - BUG_ON(dh == NULL); + if (!dh) + return NF_DROP; + + if (dccp_error(dh, skb, dataoff, state)) + return -NF_ACCEPT; + type = dh->dccph_type; + if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh)) + return -NF_ACCEPT; if (type == DCCP_PKT_RESET && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { @@ -521,61 +563,12 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) - timeouts = dccp_pernet(nf_ct_net(ct))->dccp_timeout; + timeouts = nf_dccp_pernet(nf_ct_net(ct))->dccp_timeout; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; } -static int dccp_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, unsigned int dataoff, - u_int8_t pf, unsigned int hooknum) -{ - struct dccp_hdr _dh, *dh; - unsigned int dccp_len = skb->len - dataoff; - unsigned int cscov; - const char *msg; - - dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); - if (dh == NULL) { - msg = "nf_ct_dccp: short packet "; - goto out_invalid; - } - - if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) || - dh->dccph_doff * 4 > dccp_len) { - msg = "nf_ct_dccp: truncated/malformed packet "; - goto out_invalid; - } - - cscov = dccp_len; - if (dh->dccph_cscov) { - cscov = (dh->dccph_cscov - 1) * 4; - if (cscov > dccp_len) { - msg = "nf_ct_dccp: bad checksum coverage "; - goto out_invalid; - } - } - - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_DCCP, - pf)) { - msg = "nf_ct_dccp: bad checksum "; - goto out_invalid; - } - - if (dh->dccph_type >= DCCP_PKT_INVALID) { - msg = "nf_ct_dccp: reserved packet type "; - goto out_invalid; - } - - return NF_ACCEPT; - -out_invalid: - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_DCCP, "%s", msg); - return -NF_ACCEPT; -} - static bool dccp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.dccp.state) { @@ -683,7 +676,7 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - struct nf_dccp_net *dn = dccp_pernet(net); + struct nf_dccp_net *dn = nf_dccp_pernet(net); unsigned int *timeouts = data; int i; @@ -814,9 +807,9 @@ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn, return 0; } -static int dccp_init_net(struct net *net, u_int16_t proto) +static int dccp_init_net(struct net *net) { - struct nf_dccp_net *dn = dccp_pernet(net); + struct nf_dccp_net *dn = nf_dccp_pernet(net); struct nf_proto_net *pn = &dn->pn; if (!pn->users) { @@ -844,45 +837,9 @@ static struct nf_proto_net *dccp_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.dccp.pn; } -const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = { - .l3proto = AF_INET, - .l4proto = IPPROTO_DCCP, - .new = dccp_new, - .packet = dccp_packet, - .error = dccp_error, - .can_early_drop = dccp_can_early_drop, -#ifdef CONFIG_NF_CONNTRACK_PROCFS - .print_conntrack = dccp_print_conntrack, -#endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_size = DCCP_NLATTR_SIZE, - .to_nlattr = dccp_to_nlattr, - .from_nlattr = nlattr_to_dccp, - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nla_policy = nf_ct_port_nla_policy, -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - .ctnl_timeout = { - .nlattr_to_obj = dccp_timeout_nlattr_to_obj, - .obj_to_nlattr = dccp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_DCCP_MAX, - .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, - .nla_policy = dccp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = dccp_init_net, - .get_net_proto = dccp_get_net_proto, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4); - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = { - .l3proto = AF_INET6, +const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = { .l4proto = IPPROTO_DCCP, - .new = dccp_new, .packet = dccp_packet, - .error = dccp_error, .can_early_drop = dccp_can_early_drop, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = dccp_print_conntrack, @@ -908,4 +865,3 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = { .init_net = dccp_init_net, .get_net_proto = dccp_get_net_proto, }; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp6); diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 1df3244ecd07..5da19d5fbc76 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -27,11 +27,6 @@ static bool nf_generic_should_process(u8 proto) } } -static inline struct nf_generic_net *generic_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.generic; -} - static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) @@ -44,32 +39,26 @@ static bool generic_pkt_to_tuple(const struct sk_buff *skb, /* Returns verdict for packet, or -1 for invalid. */ static int generic_packet(struct nf_conn *ct, - const struct sk_buff *skb, + struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo) + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { const unsigned int *timeout = nf_ct_timeout_lookup(ct); + if (!nf_generic_should_process(nf_ct_protonum(ct))) { + pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n", + nf_ct_protonum(ct)); + return -NF_ACCEPT; + } + if (!timeout) - timeout = &generic_pernet(nf_ct_net(ct))->timeout; + timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } -/* Called when a new connection for this protocol found. */ -static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - bool ret; - - ret = nf_generic_should_process(nf_ct_protonum(ct)); - if (!ret) - pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n", - nf_ct_protonum(ct)); - return ret; -} - #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include @@ -78,7 +67,7 @@ static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, static int generic_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - struct nf_generic_net *gn = generic_pernet(net); + struct nf_generic_net *gn = nf_generic_pernet(net); unsigned int *timeout = data; if (!timeout) @@ -142,9 +131,9 @@ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int generic_init_net(struct net *net, u_int16_t proto) +static int generic_init_net(struct net *net) { - struct nf_generic_net *gn = generic_pernet(net); + struct nf_generic_net *gn = nf_generic_pernet(net); struct nf_proto_net *pn = &gn->pn; gn->timeout = nf_ct_generic_timeout; @@ -159,11 +148,9 @@ static struct nf_proto_net *generic_get_net_proto(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = { - .l3proto = PF_UNSPEC, .l4proto = 255, .pkt_to_tuple = generic_pkt_to_tuple, .packet = generic_packet, - .new = generic_new, #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = generic_timeout_nlattr_to_obj, diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 650eb4fba2c5..9b48dc8b4b88 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -233,10 +233,26 @@ static unsigned int *gre_get_timeouts(struct net *net) /* Returns verdict for packet, and may modify conntrack */ static int gre_packet(struct nf_conn *ct, - const struct sk_buff *skb, + struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo) + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { + if (state->pf != NFPROTO_IPV4) + return -NF_ACCEPT; + + if (!nf_ct_is_confirmed(ct)) { + unsigned int *timeouts = nf_ct_timeout_lookup(ct); + + if (!timeouts) + timeouts = gre_get_timeouts(nf_ct_net(ct)); + + /* initialize to sane value. Ideally a conntrack helper + * (e.g. in case of pptp) is increasing them */ + ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED]; + ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED]; + } + /* If we've seen traffic both ways, this is a GRE connection. * Extend timeout. */ if (ct->status & IPS_SEEN_REPLY) { @@ -252,26 +268,6 @@ static int gre_packet(struct nf_conn *ct, return NF_ACCEPT; } -/* Called when a new connection for this protocol found. */ -static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - unsigned int *timeouts = nf_ct_timeout_lookup(ct); - - if (!timeouts) - timeouts = gre_get_timeouts(nf_ct_net(ct)); - - pr_debug(": "); - nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - - /* initialize to sane value. Ideally a conntrack helper - * (e.g. in case of pptp) is increasing them */ - ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED]; - ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED]; - - return true; -} - /* Called when a conntrack entry has already been removed from the hashes * and is about to be deleted from memory */ static void gre_destroy(struct nf_conn *ct) @@ -336,7 +332,7 @@ gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = { }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ -static int gre_init_net(struct net *net, u_int16_t proto) +static int gre_init_net(struct net *net) { struct netns_proto_gre *net_gre = gre_pernet(net); int i; @@ -351,14 +347,12 @@ static int gre_init_net(struct net *net, u_int16_t proto) /* protocol helper struct */ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = { - .l3proto = AF_INET, .l4proto = IPPROTO_GRE, .pkt_to_tuple = gre_pkt_to_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, #endif .packet = gre_packet, - .new = gre_new, .destroy = gre_destroy, .me = THIS_MODULE, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 43c7e1a217b9..de64d8a5fdfd 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -25,11 +25,6 @@ static const unsigned int nf_ct_icmp_timeout = 30*HZ; -static inline struct nf_icmp_net *icmp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.icmp; -} - static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { @@ -72,34 +67,17 @@ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -static unsigned int *icmp_get_timeouts(struct net *net) -{ - return &icmp_pernet(net)->timeout; -} - /* Returns verdict for packet, or -1 for invalid. */ static int icmp_packet(struct nf_conn *ct, - const struct sk_buff *skb, + struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo) + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. */ unsigned int *timeout = nf_ct_timeout_lookup(ct); - - if (!timeout) - timeout = icmp_get_timeouts(nf_ct_net(ct)); - - nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ static const u_int8_t valid_new[] = { [ICMP_ECHO] = 1, [ICMP_TIMESTAMP] = 1, @@ -107,21 +85,29 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, [ICMP_ADDRESS] = 1 }; + if (state->pf != NFPROTO_IPV4) + return -NF_ACCEPT; + if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { /* Can't create a new ICMP `conn' with this. */ pr_debug("icmp: can't create new conn with type %u\n", ct->tuplehash[0].tuple.dst.u.icmp.type); nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); - return false; + return -NF_ACCEPT; } - return true; + + if (!timeout) + timeout = &nf_icmp_pernet(nf_ct_net(ct))->timeout; + + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); + return NF_ACCEPT; } /* Returns conntrack if it dealt with ICMP, and filled in skb fields */ static int -icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, - unsigned int hooknum) +icmp_error_message(struct nf_conn *tmpl, struct sk_buff *skb, + const struct nf_hook_state *state) { struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_l4proto *innerproto; @@ -137,13 +123,13 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb) + ip_hdrlen(skb) + sizeof(struct icmphdr), - PF_INET, net, &origtuple)) { + PF_INET, state->net, &origtuple)) { pr_debug("icmp_error_message: failed to get tuple\n"); return -NF_ACCEPT; } /* rcu_read_lock()ed by nf_hook_thresh */ - innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); + innerproto = __nf_ct_l4proto_find(origtuple.dst.protonum); /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. */ @@ -154,7 +140,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, ctinfo = IP_CT_RELATED; - h = nf_conntrack_find_get(net, zone, &innertuple); + h = nf_conntrack_find_get(state->net, zone, &innertuple); if (!h) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; @@ -168,17 +154,18 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, return NF_ACCEPT; } -static void icmp_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) +static void icmp_error_log(const struct sk_buff *skb, + const struct nf_hook_state *state, + const char *msg) { - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); + nf_l4proto_log_invalid(skb, state->net, state->pf, + IPPROTO_ICMP, "%s", msg); } /* Small and modified version of icmp_rcv */ -static int -icmp_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, unsigned int dataoff, - u8 pf, unsigned int hooknum) +int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, + const struct nf_hook_state *state) { const struct icmphdr *icmph; struct icmphdr _ih; @@ -186,14 +173,15 @@ icmp_error(struct net *net, struct nf_conn *tmpl, /* Not enough header? */ icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { - icmp_error_log(skb, net, pf, "short packet"); + icmp_error_log(skb, state, "short packet"); return -NF_ACCEPT; } /* See ip_conntrack_proto_tcp.c */ - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_ip_checksum(skb, hooknum, dataoff, 0)) { - icmp_error_log(skb, net, pf, "bad hw icmp checksum"); + if (state->net->ct.sysctl_checksum && + state->hook == NF_INET_PRE_ROUTING && + nf_ip_checksum(skb, state->hook, dataoff, 0)) { + icmp_error_log(skb, state, "bad hw icmp checksum"); return -NF_ACCEPT; } @@ -204,7 +192,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, * discarded. */ if (icmph->type > NR_ICMP_TYPES) { - icmp_error_log(skb, net, pf, "invalid icmp type"); + icmp_error_log(skb, state, "invalid icmp type"); return -NF_ACCEPT; } @@ -216,7 +204,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, icmph->type != ICMP_REDIRECT) return NF_ACCEPT; - return icmp_error_message(net, tmpl, skb, hooknum); + return icmp_error_message(tmpl, skb, state); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -282,7 +270,7 @@ static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; - struct nf_icmp_net *in = icmp_pernet(net); + struct nf_icmp_net *in = nf_icmp_pernet(net); if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { if (!timeout) @@ -342,9 +330,9 @@ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int icmp_init_net(struct net *net, u_int16_t proto) +static int icmp_init_net(struct net *net) { - struct nf_icmp_net *in = icmp_pernet(net); + struct nf_icmp_net *in = nf_icmp_pernet(net); struct nf_proto_net *pn = &in->pn; in->timeout = nf_ct_icmp_timeout; @@ -359,13 +347,10 @@ static struct nf_proto_net *icmp_get_net_proto(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = { - .l3proto = PF_INET, .l4proto = IPPROTO_ICMP, .pkt_to_tuple = icmp_pkt_to_tuple, .invert_tuple = icmp_invert_tuple, .packet = icmp_packet, - .new = icmp_new, - .error = icmp_error, .destroy = NULL, .me = NULL, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 97e40f77d678..a15eefb8e317 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -30,11 +30,6 @@ static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; -static inline struct nf_icmp_net *icmpv6_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.icmpv6; -} - static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, @@ -87,16 +82,36 @@ static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, static unsigned int *icmpv6_get_timeouts(struct net *net) { - return &icmpv6_pernet(net)->timeout; + return &nf_icmpv6_pernet(net)->timeout; } /* Returns verdict for packet, or -1 for invalid. */ static int icmpv6_packet(struct nf_conn *ct, - const struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo) + struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { unsigned int *timeout = nf_ct_timeout_lookup(ct); + static const u8 valid_new[] = { + [ICMPV6_ECHO_REQUEST - 128] = 1, + [ICMPV6_NI_QUERY - 128] = 1 + }; + + if (state->pf != NFPROTO_IPV6) + return -NF_ACCEPT; + + if (!nf_ct_is_confirmed(ct)) { + int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128; + + if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { + /* Can't create a new ICMPv6 `conn' with this. */ + pr_debug("icmpv6: can't create new conn with type %u\n", + type + 128); + nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); + return -NF_ACCEPT; + } + } if (!timeout) timeout = icmpv6_get_timeouts(nf_ct_net(ct)); @@ -109,26 +124,6 @@ static int icmpv6_packet(struct nf_conn *ct, return NF_ACCEPT; } -/* Called when a new connection for this protocol found. */ -static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - static const u_int8_t valid_new[] = { - [ICMPV6_ECHO_REQUEST - 128] = 1, - [ICMPV6_NI_QUERY - 128] = 1 - }; - int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128; - - if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { - /* Can't create a new ICMPv6 `conn' with this. */ - pr_debug("icmpv6: can't create new conn with type %u\n", - type + 128); - nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); - return false; - } - return true; -} - static int icmpv6_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, @@ -153,7 +148,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, } /* rcu_read_lock()ed by nf_hook_thresh */ - inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum); + inproto = __nf_ct_l4proto_find(origtuple.dst.protonum); /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. */ @@ -179,16 +174,18 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, return NF_ACCEPT; } -static void icmpv6_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) +static void icmpv6_error_log(const struct sk_buff *skb, + const struct nf_hook_state *state, + const char *msg) { - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMPV6, "%s", msg); + nf_l4proto_log_invalid(skb, state->net, state->pf, + IPPROTO_ICMPV6, "%s", msg); } -static int -icmpv6_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, unsigned int dataoff, - u8 pf, unsigned int hooknum) +int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state) { const struct icmp6hdr *icmp6h; struct icmp6hdr _ih; @@ -196,13 +193,14 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { - icmpv6_error_log(skb, net, pf, "short packet"); + icmpv6_error_log(skb, state, "short packet"); return -NF_ACCEPT; } - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { - icmpv6_error_log(skb, net, pf, "ICMPv6 checksum failed"); + if (state->hook == NF_INET_PRE_ROUTING && + state->net->ct.sysctl_checksum && + nf_ip6_checksum(skb, state->hook, dataoff, IPPROTO_ICMPV6)) { + icmpv6_error_log(skb, state, "ICMPv6 checksum failed"); return -NF_ACCEPT; } @@ -217,7 +215,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; - return icmpv6_error_message(net, tmpl, skb, dataoff); + return icmpv6_error_message(state->net, tmpl, skb, dataoff); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -283,7 +281,7 @@ static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; - struct nf_icmp_net *in = icmpv6_pernet(net); + struct nf_icmp_net *in = nf_icmpv6_pernet(net); if (!timeout) timeout = icmpv6_get_timeouts(net); @@ -343,9 +341,9 @@ static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int icmpv6_init_net(struct net *net, u_int16_t proto) +static int icmpv6_init_net(struct net *net) { - struct nf_icmp_net *in = icmpv6_pernet(net); + struct nf_icmp_net *in = nf_icmpv6_pernet(net); struct nf_proto_net *pn = &in->pn; in->timeout = nf_ct_icmpv6_timeout; @@ -360,13 +358,10 @@ static struct nf_proto_net *icmpv6_get_net_proto(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = { - .l3proto = PF_INET6, .l4proto = IPPROTO_ICMPV6, .pkt_to_tuple = icmpv6_pkt_to_tuple, .invert_tuple = icmpv6_invert_tuple, .packet = icmpv6_packet, - .new = icmpv6_new, - .error = icmpv6_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmpv6_tuple_to_nlattr, .nlattr_tuple_size = icmpv6_nlattr_tuple_size, diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index e4d738d34cd0..d53e3e78f605 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -146,11 +146,6 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { } }; -static inline struct nf_sctp_net *sctp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.sctp; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -273,11 +268,100 @@ static int sctp_new_state(enum ip_conntrack_dir dir, return sctp_conntracks[dir][i][cur_state]; } +/* Don't need lock here: this conntrack not in circulation yet */ +static noinline bool +sctp_new(struct nf_conn *ct, const struct sk_buff *skb, + const struct sctphdr *sh, unsigned int dataoff) +{ + enum sctp_conntrack new_state; + const struct sctp_chunkhdr *sch; + struct sctp_chunkhdr _sch; + u32 offset, count; + + memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); + new_state = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) { + new_state = sctp_new_state(IP_CT_DIR_ORIGINAL, + SCTP_CONNTRACK_NONE, sch->type); + + /* Invalid: delete conntrack */ + if (new_state == SCTP_CONNTRACK_NONE || + new_state == SCTP_CONNTRACK_MAX) { + pr_debug("nf_conntrack_sctp: invalid new deleting.\n"); + return false; + } + + /* Copy the vtag into the state info */ + if (sch->type == SCTP_CID_INIT) { + struct sctp_inithdr _inithdr, *ih; + /* Sec 8.5.1 (A) */ + if (sh->vtag) + return false; + + ih = skb_header_pointer(skb, offset + sizeof(_sch), + sizeof(_inithdr), &_inithdr); + if (!ih) + return false; + + pr_debug("Setting vtag %x for new conn\n", + ih->init_tag); + + ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; + } else if (sch->type == SCTP_CID_HEARTBEAT) { + pr_debug("Setting vtag %x for secondary conntrack\n", + sh->vtag); + ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; + } else { + /* If it is a shutdown ack OOTB packet, we expect a return + shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ + pr_debug("Setting vtag %x for new conn OOTB\n", + sh->vtag); + ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; + } + + ct->proto.sctp.state = new_state; + } + + return true; +} + +static bool sctp_error(struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state) +{ + const struct sctphdr *sh; + const char *logmsg; + + if (skb->len < dataoff + sizeof(struct sctphdr)) { + logmsg = "nf_ct_sctp: short packet "; + goto out_invalid; + } + if (state->hook == NF_INET_PRE_ROUTING && + state->net->ct.sysctl_checksum && + skb->ip_summed == CHECKSUM_NONE) { + if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) { + logmsg = "nf_ct_sctp: failed to read header "; + goto out_invalid; + } + sh = (const struct sctphdr *)(skb->data + dataoff); + if (sh->checksum != sctp_compute_cksum(skb, dataoff)) { + logmsg = "nf_ct_sctp: bad CRC "; + goto out_invalid; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return false; +out_invalid: + nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_SCTP, "%s", logmsg); + return true; +} + /* Returns verdict for packet, or -NF_ACCEPT for invalid. */ static int sctp_packet(struct nf_conn *ct, - const struct sk_buff *skb, + struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo) + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { enum sctp_conntrack new_state, old_state; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); @@ -289,6 +373,9 @@ static int sctp_packet(struct nf_conn *ct, unsigned int *timeouts; unsigned long map[256 / sizeof(unsigned long)] = { 0 }; + if (sctp_error(skb, dataoff, state)) + return -NF_ACCEPT; + sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); if (sh == NULL) goto out; @@ -296,6 +383,17 @@ static int sctp_packet(struct nf_conn *ct, if (do_basic_checks(ct, skb, dataoff, map) != 0) goto out; + if (!nf_ct_is_confirmed(ct)) { + /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ + if (test_bit(SCTP_CID_ABORT, map) || + test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || + test_bit(SCTP_CID_COOKIE_ACK, map)) + return -NF_ACCEPT; + + if (!sctp_new(ct, skb, sh, dataoff)) + return -NF_ACCEPT; + } + /* Check the verification tag (Sec 8.5) */ if (!test_bit(SCTP_CID_INIT, map) && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && @@ -377,7 +475,7 @@ static int sctp_packet(struct nf_conn *ct, timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) - timeouts = sctp_pernet(nf_ct_net(ct))->timeouts; + timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); @@ -397,110 +495,6 @@ out: return -NF_ACCEPT; } -/* Called when a new connection for this protocol found. */ -static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - enum sctp_conntrack new_state; - const struct sctphdr *sh; - struct sctphdr _sctph; - const struct sctp_chunkhdr *sch; - struct sctp_chunkhdr _sch; - u_int32_t offset, count; - unsigned long map[256 / sizeof(unsigned long)] = { 0 }; - - sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); - if (sh == NULL) - return false; - - if (do_basic_checks(ct, skb, dataoff, map) != 0) - return false; - - /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ - if (test_bit(SCTP_CID_ABORT, map) || - test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || - test_bit(SCTP_CID_COOKIE_ACK, map)) - return false; - - memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); - new_state = SCTP_CONNTRACK_MAX; - for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { - /* Don't need lock here: this conntrack not in circulation yet */ - new_state = sctp_new_state(IP_CT_DIR_ORIGINAL, - SCTP_CONNTRACK_NONE, sch->type); - - /* Invalid: delete conntrack */ - if (new_state == SCTP_CONNTRACK_NONE || - new_state == SCTP_CONNTRACK_MAX) { - pr_debug("nf_conntrack_sctp: invalid new deleting.\n"); - return false; - } - - /* Copy the vtag into the state info */ - if (sch->type == SCTP_CID_INIT) { - struct sctp_inithdr _inithdr, *ih; - /* Sec 8.5.1 (A) */ - if (sh->vtag) - return false; - - ih = skb_header_pointer(skb, offset + sizeof(_sch), - sizeof(_inithdr), &_inithdr); - if (!ih) - return false; - - pr_debug("Setting vtag %x for new conn\n", - ih->init_tag); - - ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; - } else if (sch->type == SCTP_CID_HEARTBEAT) { - pr_debug("Setting vtag %x for secondary conntrack\n", - sh->vtag); - ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; - } - /* If it is a shutdown ack OOTB packet, we expect a return - shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ - else { - pr_debug("Setting vtag %x for new conn OOTB\n", - sh->vtag); - ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; - } - - ct->proto.sctp.state = new_state; - } - - return true; -} - -static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb, - unsigned int dataoff, - u8 pf, unsigned int hooknum) -{ - const struct sctphdr *sh; - const char *logmsg; - - if (skb->len < dataoff + sizeof(struct sctphdr)) { - logmsg = "nf_ct_sctp: short packet "; - goto out_invalid; - } - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - skb->ip_summed == CHECKSUM_NONE) { - if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) { - logmsg = "nf_ct_sctp: failed to read header "; - goto out_invalid; - } - sh = (const struct sctphdr *)(skb->data + dataoff); - if (sh->checksum != sctp_compute_cksum(skb, dataoff)) { - logmsg = "nf_ct_sctp: bad CRC "; - goto out_invalid; - } - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - return NF_ACCEPT; -out_invalid: - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_SCTP, "%s", logmsg); - return -NF_ACCEPT; -} - static bool sctp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.sctp.state) { @@ -600,7 +594,7 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; - struct nf_sctp_net *sn = sctp_pernet(net); + struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; /* set default SCTP timeouts. */ @@ -735,9 +729,9 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int sctp_init_net(struct net *net, u_int16_t proto) +static int sctp_init_net(struct net *net) { - struct nf_sctp_net *sn = sctp_pernet(net); + struct nf_sctp_net *sn = nf_sctp_pernet(net); struct nf_proto_net *pn = &sn->pn; if (!pn->users) { @@ -760,49 +754,12 @@ static struct nf_proto_net *sctp_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.sctp.pn; } -const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = { - .l3proto = PF_INET, - .l4proto = IPPROTO_SCTP, -#ifdef CONFIG_NF_CONNTRACK_PROCFS - .print_conntrack = sctp_print_conntrack, -#endif - .packet = sctp_packet, - .new = sctp_new, - .error = sctp_error, - .can_early_drop = sctp_can_early_drop, - .me = THIS_MODULE, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_size = SCTP_NLATTR_SIZE, - .to_nlattr = sctp_to_nlattr, - .from_nlattr = nlattr_to_sctp, - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nla_policy = nf_ct_port_nla_policy, -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - .ctnl_timeout = { - .nlattr_to_obj = sctp_timeout_nlattr_to_obj, - .obj_to_nlattr = sctp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_SCTP_MAX, - .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, - .nla_policy = sctp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = sctp_init_net, - .get_net_proto = sctp_get_net_proto, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4); - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { - .l3proto = PF_INET6, +const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = { .l4proto = IPPROTO_SCTP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif .packet = sctp_packet, - .new = sctp_new, - .error = sctp_error, .can_early_drop = sctp_can_early_drop, .me = THIS_MODULE, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -826,4 +783,3 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { .init_net = sctp_init_net, .get_net_proto = sctp_get_net_proto, }; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp6); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b4bdf9eda7b7..4dcbd51a8e97 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -272,11 +272,6 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { } }; -static inline struct nf_tcp_net *tcp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.tcp; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -475,7 +470,7 @@ static bool tcp_in_window(const struct nf_conn *ct, const struct tcphdr *tcph) { struct net *net = nf_ct_net(ct); - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; @@ -717,35 +712,26 @@ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| [TCPHDR_ACK|TCPHDR_URG] = 1, }; -static void tcp_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) +static void tcp_error_log(const struct sk_buff *skb, + const struct nf_hook_state *state, + const char *msg) { - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_TCP, "%s", msg); + nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_TCP, "%s", msg); } /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ -static int tcp_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, - unsigned int dataoff, - u_int8_t pf, - unsigned int hooknum) +static bool tcp_error(const struct tcphdr *th, + struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state) { - const struct tcphdr *th; - struct tcphdr _tcph; unsigned int tcplen = skb->len - dataoff; - u_int8_t tcpflags; - - /* Smaller that minimal TCP header? */ - th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); - if (th == NULL) { - tcp_error_log(skb, net, pf, "short packet"); - return -NF_ACCEPT; - } + u8 tcpflags; /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { - tcp_error_log(skb, net, pf, "truncated packet"); - return -NF_ACCEPT; + tcp_error_log(skb, state, "truncated packet"); + return true; } /* Checksum invalid? Ignore. @@ -753,30 +739,104 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, * because the checksum is assumed to be correct. */ /* FIXME: Source route IP option packets --RR */ - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { - tcp_error_log(skb, net, pf, "bad checksum"); - return -NF_ACCEPT; + if (state->net->ct.sysctl_checksum && + state->hook == NF_INET_PRE_ROUTING && + nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) { + tcp_error_log(skb, state, "bad checksum"); + return true; } /* Check TCP flags. */ tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); if (!tcp_valid_flags[tcpflags]) { - tcp_error_log(skb, net, pf, "invalid tcp flag combination"); - return -NF_ACCEPT; + tcp_error_log(skb, state, "invalid tcp flag combination"); + return true; } - return NF_ACCEPT; + return false; +} + +static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, + const struct tcphdr *th) +{ + enum tcp_conntrack new_state; + struct net *net = nf_ct_net(ct); + const struct nf_tcp_net *tn = nf_tcp_pernet(net); + const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; + const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; + + /* Don't need lock here: this conntrack not in circulation yet */ + new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE]; + + /* Invalid: delete conntrack */ + if (new_state >= TCP_CONNTRACK_MAX) { + pr_debug("nf_ct_tcp: invalid new deleting.\n"); + return false; + } + + if (new_state == TCP_CONNTRACK_SYN_SENT) { + memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); + /* SYN packet */ + ct->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + dataoff, th); + ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (ct->proto.tcp.seen[0].td_maxwin == 0) + ct->proto.tcp.seen[0].td_maxwin = 1; + ct->proto.tcp.seen[0].td_maxend = + ct->proto.tcp.seen[0].td_end; + + tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); + } else if (tn->tcp_loose == 0) { + /* Don't try to pick up connections. */ + return false; + } else { + memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + ct->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + dataoff, th); + ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (ct->proto.tcp.seen[0].td_maxwin == 0) + ct->proto.tcp.seen[0].td_maxwin = 1; + ct->proto.tcp.seen[0].td_maxend = + ct->proto.tcp.seen[0].td_end + + ct->proto.tcp.seen[0].td_maxwin; + + /* We assume SACK and liberal window checking to handle + * window scaling */ + ct->proto.tcp.seen[0].flags = + ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | + IP_CT_TCP_FLAG_BE_LIBERAL; + } + + /* tcp_packet will set them */ + ct->proto.tcp.last_index = TCP_NONE_SET; + + pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + __func__, + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + return true; } /* Returns verdict for packet, or -1 for invalid. */ static int tcp_packet(struct nf_conn *ct, - const struct sk_buff *skb, + struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo) + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { struct net *net = nf_ct_net(ct); - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; unsigned int index, *timeouts; @@ -786,7 +846,14 @@ static int tcp_packet(struct nf_conn *ct, unsigned long timeout; th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); - BUG_ON(th == NULL); + if (th == NULL) + return -NF_ACCEPT; + + if (tcp_error(th, skb, dataoff, state)) + return -NF_ACCEPT; + + if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th)) + return -NF_ACCEPT; spin_lock_bh(&ct->lock); old_state = ct->proto.tcp.state; @@ -1067,82 +1134,6 @@ static int tcp_packet(struct nf_conn *ct, return NF_ACCEPT; } -/* Called when a new connection for this protocol found. */ -static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - enum tcp_conntrack new_state; - const struct tcphdr *th; - struct tcphdr _tcph; - struct net *net = nf_ct_net(ct); - struct nf_tcp_net *tn = tcp_pernet(net); - const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; - const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; - - th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); - BUG_ON(th == NULL); - - /* Don't need lock here: this conntrack not in circulation yet */ - new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE]; - - /* Invalid: delete conntrack */ - if (new_state >= TCP_CONNTRACK_MAX) { - pr_debug("nf_ct_tcp: invalid new deleting.\n"); - return false; - } - - if (new_state == TCP_CONNTRACK_SYN_SENT) { - memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); - /* SYN packet */ - ct->proto.tcp.seen[0].td_end = - segment_seq_plus_len(ntohl(th->seq), skb->len, - dataoff, th); - ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); - if (ct->proto.tcp.seen[0].td_maxwin == 0) - ct->proto.tcp.seen[0].td_maxwin = 1; - ct->proto.tcp.seen[0].td_maxend = - ct->proto.tcp.seen[0].td_end; - - tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); - } else if (tn->tcp_loose == 0) { - /* Don't try to pick up connections. */ - return false; - } else { - memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); - /* - * We are in the middle of a connection, - * its history is lost for us. - * Let's try to use the data from the packet. - */ - ct->proto.tcp.seen[0].td_end = - segment_seq_plus_len(ntohl(th->seq), skb->len, - dataoff, th); - ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); - if (ct->proto.tcp.seen[0].td_maxwin == 0) - ct->proto.tcp.seen[0].td_maxwin = 1; - ct->proto.tcp.seen[0].td_maxend = - ct->proto.tcp.seen[0].td_end + - ct->proto.tcp.seen[0].td_maxwin; - - /* We assume SACK and liberal window checking to handle - * window scaling */ - ct->proto.tcp.seen[0].flags = - ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | - IP_CT_TCP_FLAG_BE_LIBERAL; - } - - /* tcp_packet will set them */ - ct->proto.tcp.last_index = TCP_NONE_SET; - - pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); - return true; -} - static bool tcp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.tcp.state) { @@ -1213,8 +1204,8 @@ static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { #define TCP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 1) + \ - NLA_ALIGN(NLA_HDRLEN + sizeof(sizeof(struct nf_ct_tcp_flags))) + \ - NLA_ALIGN(NLA_HDRLEN + sizeof(sizeof(struct nf_ct_tcp_flags)))) + NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \ + NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags))) static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) { @@ -1287,7 +1278,7 @@ static unsigned int tcp_nlattr_tuple_size(void) static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); unsigned int *timeouts = data; int i; @@ -1510,9 +1501,9 @@ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int tcp_init_net(struct net *net, u_int16_t proto) +static int tcp_init_net(struct net *net) { - struct nf_tcp_net *tn = tcp_pernet(net); + struct nf_tcp_net *tn = nf_tcp_pernet(net); struct nf_proto_net *pn = &tn->pn; if (!pn->users) { @@ -1538,16 +1529,13 @@ static struct nf_proto_net *tcp_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.tcp.pn; } -const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = { - .l3proto = PF_INET, .l4proto = IPPROTO_TCP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif .packet = tcp_packet, - .new = tcp_new, - .error = tcp_error, .can_early_drop = tcp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = tcp_to_nlattr, @@ -1571,39 +1559,3 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = .init_net = tcp_init_net, .get_net_proto = tcp_get_net_proto, }; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4); - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 = -{ - .l3proto = PF_INET6, - .l4proto = IPPROTO_TCP, -#ifdef CONFIG_NF_CONNTRACK_PROCFS - .print_conntrack = tcp_print_conntrack, -#endif - .packet = tcp_packet, - .new = tcp_new, - .error = tcp_error, - .can_early_drop = tcp_can_early_drop, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_size = TCP_NLATTR_SIZE, - .to_nlattr = tcp_to_nlattr, - .from_nlattr = nlattr_to_tcp, - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nlattr_tuple_size = tcp_nlattr_tuple_size, - .nla_policy = nf_ct_port_nla_policy, -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - .ctnl_timeout = { - .nlattr_to_obj = tcp_timeout_nlattr_to_obj, - .obj_to_nlattr = tcp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_TCP_MAX, - .obj_size = sizeof(unsigned int) * - TCP_CONNTRACK_TIMEOUT_MAX, - .nla_policy = tcp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = tcp_init_net, - .get_net_proto = tcp_get_net_proto, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 3065fb8ef91b..c879d8d78cfd 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -32,24 +32,70 @@ static const unsigned int udp_timeouts[UDP_CT_MAX] = { [UDP_CT_REPLIED] = 180*HZ, }; -static inline struct nf_udp_net *udp_pernet(struct net *net) +static unsigned int *udp_get_timeouts(struct net *net) { - return &net->ct.nf_ct_proto.udp; + return nf_udp_pernet(net)->timeouts; } -static unsigned int *udp_get_timeouts(struct net *net) +static void udp_error_log(const struct sk_buff *skb, + const struct nf_hook_state *state, + const char *msg) +{ + nf_l4proto_log_invalid(skb, state->net, state->pf, + IPPROTO_UDP, "%s", msg); +} + +static bool udp_error(struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state) { - return udp_pernet(net)->timeouts; + unsigned int udplen = skb->len - dataoff; + const struct udphdr *hdr; + struct udphdr _hdr; + + /* Header is too small? */ + hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (!hdr) { + udp_error_log(skb, state, "short packet"); + return true; + } + + /* Truncated/malformed packets */ + if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { + udp_error_log(skb, state, "truncated/malformed packet"); + return true; + } + + /* Packet with no checksum */ + if (!hdr->check) + return false; + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the checksum is assumed to be correct. + * FIXME: Source route IP option packets --RR */ + if (state->hook == NF_INET_PRE_ROUTING && + state->net->ct.sysctl_checksum && + nf_checksum(skb, state->hook, dataoff, IPPROTO_UDP, state->pf)) { + udp_error_log(skb, state, "bad checksum"); + return true; + } + + return false; } /* Returns verdict for packet, and may modify conntracktype */ static int udp_packet(struct nf_conn *ct, - const struct sk_buff *skb, + struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo) + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { unsigned int *timeouts; + if (udp_error(skb, dataoff, state)) + return -NF_ACCEPT; + timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); @@ -69,24 +115,18 @@ static int udp_packet(struct nf_conn *ct, return NF_ACCEPT; } -/* Called when a new connection for this protocol found. */ -static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - return true; -} - #ifdef CONFIG_NF_CT_PROTO_UDPLITE -static void udplite_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) +static void udplite_error_log(const struct sk_buff *skb, + const struct nf_hook_state *state, + const char *msg) { - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDPLITE, "%s", msg); + nf_l4proto_log_invalid(skb, state->net, state->pf, + IPPROTO_UDPLITE, "%s", msg); } -static int udplite_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, - unsigned int dataoff, - u8 pf, unsigned int hooknum) +static bool udplite_error(struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state) { unsigned int udplen = skb->len - dataoff; const struct udphdr *hdr; @@ -96,80 +136,67 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (!hdr) { - udplite_error_log(skb, net, pf, "short packet"); - return -NF_ACCEPT; + udplite_error_log(skb, state, "short packet"); + return true; } cscov = ntohs(hdr->len); if (cscov == 0) { cscov = udplen; } else if (cscov < sizeof(*hdr) || cscov > udplen) { - udplite_error_log(skb, net, pf, "invalid checksum coverage"); - return -NF_ACCEPT; + udplite_error_log(skb, state, "invalid checksum coverage"); + return true; } /* UDPLITE mandates checksums */ if (!hdr->check) { - udplite_error_log(skb, net, pf, "checksum missing"); - return -NF_ACCEPT; + udplite_error_log(skb, state, "checksum missing"); + return true; } /* Checksum invalid? Ignore. */ - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, - pf)) { - udplite_error_log(skb, net, pf, "bad checksum"); - return -NF_ACCEPT; + if (state->hook == NF_INET_PRE_ROUTING && + state->net->ct.sysctl_checksum && + nf_checksum_partial(skb, state->hook, dataoff, cscov, IPPROTO_UDP, + state->pf)) { + udplite_error_log(skb, state, "bad checksum"); + return true; } - return NF_ACCEPT; -} -#endif - -static void udp_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) -{ - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDP, "%s", msg); + return false; } -static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, - unsigned int dataoff, - u_int8_t pf, - unsigned int hooknum) +/* Returns verdict for packet, and may modify conntracktype */ +static int udplite_packet(struct nf_conn *ct, + struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { - unsigned int udplen = skb->len - dataoff; - const struct udphdr *hdr; - struct udphdr _hdr; - - /* Header is too small? */ - hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); - if (hdr == NULL) { - udp_error_log(skb, net, pf, "short packet"); - return -NF_ACCEPT; - } + unsigned int *timeouts; - /* Truncated/malformed packets */ - if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { - udp_error_log(skb, net, pf, "truncated/malformed packet"); + if (udplite_error(skb, dataoff, state)) return -NF_ACCEPT; - } - /* Packet with no checksum */ - if (!hdr->check) - return NF_ACCEPT; + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = udp_get_timeouts(nf_ct_net(ct)); - /* Checksum invalid? Ignore. - * We skip checking packets on the outgoing path - * because the checksum is assumed to be correct. - * FIXME: Source route IP option packets --RR */ - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { - udp_error_log(skb, net, pf, "bad checksum"); - return -NF_ACCEPT; + /* If we've seen traffic both ways, this is some kind of UDP + stream. Extend timeout. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_refresh_acct(ct, ctinfo, skb, + timeouts[UDP_CT_REPLIED]); + /* Also, more likely to be important, and not a probe */ + if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } else { + nf_ct_refresh_acct(ct, ctinfo, skb, + timeouts[UDP_CT_UNREPLIED]); } - return NF_ACCEPT; } +#endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT @@ -180,7 +207,7 @@ static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; - struct nf_udp_net *un = udp_pernet(net); + struct nf_udp_net *un = nf_udp_pernet(net); if (!timeouts) timeouts = un->timeouts; @@ -258,9 +285,9 @@ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int udp_init_net(struct net *net, u_int16_t proto) +static int udp_init_net(struct net *net) { - struct nf_udp_net *un = udp_pernet(net); + struct nf_udp_net *un = nf_udp_pernet(net); struct nf_proto_net *pn = &un->pn; if (!pn->users) { @@ -278,14 +305,11 @@ static struct nf_proto_net *udp_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.udp.pn; } -const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp = { - .l3proto = PF_INET, .l4proto = IPPROTO_UDP, .allow_clash = true, .packet = udp_packet, - .new = udp_new, - .error = udp_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, @@ -304,75 +328,13 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = .init_net = udp_init_net, .get_net_proto = udp_get_net_proto, }; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4); #ifdef CONFIG_NF_CT_PROTO_UDPLITE -const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite = { - .l3proto = PF_INET, .l4proto = IPPROTO_UDPLITE, .allow_clash = true, - .packet = udp_packet, - .new = udp_new, - .error = udplite_error, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, - .nla_policy = nf_ct_port_nla_policy, -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - .ctnl_timeout = { - .nlattr_to_obj = udp_timeout_nlattr_to_obj, - .obj_to_nlattr = udp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_UDP_MAX, - .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, - .nla_policy = udp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = udp_init_net, - .get_net_proto = udp_get_net_proto, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4); -#endif - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 = -{ - .l3proto = PF_INET6, - .l4proto = IPPROTO_UDP, - .allow_clash = true, - .packet = udp_packet, - .new = udp_new, - .error = udp_error, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, - .nla_policy = nf_ct_port_nla_policy, -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - .ctnl_timeout = { - .nlattr_to_obj = udp_timeout_nlattr_to_obj, - .obj_to_nlattr = udp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_UDP_MAX, - .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, - .nla_policy = udp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = udp_init_net, - .get_net_proto = udp_get_net_proto, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6); - -#ifdef CONFIG_NF_CT_PROTO_UDPLITE -const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = -{ - .l3proto = PF_INET6, - .l4proto = IPPROTO_UDPLITE, - .allow_clash = true, - .packet = udp_packet, - .new = udp_new, - .error = udplite_error, + .packet = udplite_packet, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, @@ -391,5 +353,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = .init_net = udp_init_net, .get_net_proto = udp_get_net_proto, }; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6); #endif diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 13279f683da9..463d17d349c1 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -292,7 +292,7 @@ static int ct_seq_show(struct seq_file *s, void *v) if (!net_eq(nf_ct_net(ct), net)) goto release; - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); WARN_ON(!l4proto); ret = -ENOSPC; @@ -720,10 +720,3 @@ static void __exit nf_conntrack_standalone_fini(void) module_init(nf_conntrack_standalone_init); module_exit(nf_conntrack_standalone_fini); - -/* Some modules need us, but don't depend directly on any symbol. - They should call this. */ -void need_conntrack(void) -{ -} -EXPORT_SYMBOL_GPL(need_conntrack); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index d8125616edc7..b7a4816add76 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -120,7 +120,7 @@ static void flow_offload_fixup_ct_state(struct nf_conn *ct) if (l4num == IPPROTO_TCP) flow_offload_fixup_tcp(&ct->proto.tcp); - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), l4num); + l4proto = __nf_ct_l4proto_find(l4num); if (!l4proto) return; @@ -233,8 +233,8 @@ flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload *flow; int dir; - tuplehash = rhashtable_lookup_fast(&flow_table->rhashtable, tuple, - nf_flow_offload_rhash_params); + tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple, + nf_flow_offload_rhash_params); if (!tuplehash) return NULL; @@ -254,20 +254,17 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table, struct flow_offload_tuple_rhash *tuplehash; struct rhashtable_iter hti; struct flow_offload *flow; - int err; - - err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL); - if (err) - return err; + int err = 0; + rhashtable_walk_enter(&flow_table->rhashtable, &hti); rhashtable_walk_start(&hti); while ((tuplehash = rhashtable_walk_next(&hti))) { if (IS_ERR(tuplehash)) { - err = PTR_ERR(tuplehash); - if (err != -EAGAIN) - goto out; - + if (PTR_ERR(tuplehash) != -EAGAIN) { + err = PTR_ERR(tuplehash); + break; + } continue; } if (tuplehash->tuple.dir) @@ -277,7 +274,6 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table, iter(flow, data); } -out: rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); @@ -290,25 +286,19 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow) return (__s32)(flow->timeout - (u32)jiffies) <= 0; } -static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) +static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table) { struct flow_offload_tuple_rhash *tuplehash; struct rhashtable_iter hti; struct flow_offload *flow; - int err; - - err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL); - if (err) - return 0; + rhashtable_walk_enter(&flow_table->rhashtable, &hti); rhashtable_walk_start(&hti); while ((tuplehash = rhashtable_walk_next(&hti))) { if (IS_ERR(tuplehash)) { - err = PTR_ERR(tuplehash); - if (err != -EAGAIN) - goto out; - + if (PTR_ERR(tuplehash) != -EAGAIN) + break; continue; } if (tuplehash->tuple.dir) @@ -321,11 +311,8 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) FLOW_OFFLOAD_TEARDOWN))) flow_offload_del(flow_table, flow); } -out: rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); - - return 1; } static void nf_flow_offload_work_gc(struct work_struct *work) @@ -478,14 +465,17 @@ EXPORT_SYMBOL_GPL(nf_flow_table_init); static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) { struct net_device *dev = data; + struct flow_offload_entry *e; + + e = container_of(flow, struct flow_offload_entry, flow); if (!dev) { flow_offload_teardown(flow); return; } - - if (flow->tuplehash[0].tuple.iifidx == dev->ifindex || - flow->tuplehash[1].tuple.iifidx == dev->ifindex) + if (net_eq(nf_ct_net(e->ct), dev_net(dev)) && + (flow->tuplehash[0].tuple.iifidx == dev->ifindex || + flow->tuplehash[1].tuple.iifidx == dev->ifindex)) flow_offload_dead(flow); } @@ -496,7 +486,7 @@ static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, flush_delayed_work(&flowtable->gc_work); } -void nf_flow_table_cleanup(struct net *net, struct net_device *dev) +void nf_flow_table_cleanup(struct net_device *dev) { struct nf_flowtable *flowtable; @@ -514,7 +504,7 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) mutex_unlock(&flowtable_lock); cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); - WARN_ON(!nf_flow_offload_gc_step(flow_table)); + nf_flow_offload_gc_step(flow_table); rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 15ed91309992..1d291a51cd45 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -254,8 +254,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff)) return NF_ACCEPT; - if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && - nf_flow_nat_ip(flow, skb, thoff, dir) < 0) + if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0) return NF_DROP; flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; @@ -471,8 +470,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (skb_try_make_writable(skb, sizeof(*ip6h))) return NF_DROP; - if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && - nf_flow_nat_ipv6(flow, skb, dir) < 0) + if (nf_flow_nat_ipv6(flow, skb, dir) < 0) return NF_DROP; flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 99606baedda4..38793b95d9bc 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -37,7 +37,7 @@ static void mangle_contents(struct sk_buff *skb, { unsigned char *data; - BUG_ON(skb_is_nonlinear(skb)); + SKB_LINEAR_ASSERT(skb); data = skb_network_header(skb) + dataoff; /* move post-replacement */ @@ -110,8 +110,6 @@ bool __nf_nat_mangle_tcp_packet(struct sk_buff *skb, !enlarge_skb(skb, rep_len - match_len)) return false; - SKB_LINEAR_ASSERT(skb); - tcph = (void *)skb->data + protoff; oldlen = skb->len - protoff; diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index adee04af8d43..78a9e6454ff3 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -52,13 +52,11 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, newdst = 0; - rcu_read_lock(); indev = __in_dev_get_rcu(skb->dev); if (indev && indev->ifa_list) { ifa = indev->ifa_list; newdst = ifa->ifa_local; } - rcu_read_unlock(); if (!newdst) return NF_DROP; @@ -97,7 +95,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, struct inet6_ifaddr *ifa; bool addr = false; - rcu_read_lock(); idev = __in6_dev_get(skb->dev); if (idev != NULL) { read_lock_bh(&idev->lock); @@ -108,7 +105,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, } read_unlock_bh(&idev->lock); } - rcu_read_unlock(); if (!addr) return NF_DROP; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2cfb173cd0b2..42487d01a3ed 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -27,6 +27,8 @@ static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); +static LIST_HEAD(nf_tables_destroy_list); +static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); static u64 table_handle; enum { @@ -64,6 +66,8 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state) net->nft.validate_state = new_validate_state; } +static void nf_tables_trans_destroy_work(struct work_struct *w); +static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, @@ -207,6 +211,18 @@ static int nft_delchain(struct nft_ctx *ctx) return err; } +/* either expr ops provide both activate/deactivate, or neither */ +static bool nft_expr_check_ops(const struct nft_expr_ops *ops) +{ + if (!ops) + return true; + + if (WARN_ON_ONCE((!ops->activate ^ !ops->deactivate))) + return false; + + return true; +} + static void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule) { @@ -298,7 +314,7 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx) return 0; } -static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, +static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, struct nft_set *set) { struct nft_trans *trans; @@ -318,7 +334,7 @@ static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, return 0; } -static int nft_delset(struct nft_ctx *ctx, struct nft_set *set) +static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) { int err; @@ -1005,7 +1021,8 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, static void nf_tables_table_destroy(struct nft_ctx *ctx) { - BUG_ON(ctx->table->use > 0); + if (WARN_ON(ctx->table->use > 0)) + return; rhltable_destroy(&ctx->table->chains_ht); kfree(ctx->table->name); @@ -1412,7 +1429,8 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) { struct nft_chain *chain = ctx->chain; - BUG_ON(chain->use > 0); + if (WARN_ON(chain->use > 0)) + return; /* no concurrent access possible anymore */ nf_tables_chain_free_chain_rules(chain); @@ -1907,6 +1925,9 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, */ int nft_register_expr(struct nft_expr_type *type) { + if (!nft_expr_check_ops(type->ops)) + return -EINVAL; + nfnl_lock(NFNL_SUBSYS_NFTABLES); if (type->family == NFPROTO_UNSPEC) list_add_tail_rcu(&type->list, &nf_tables_expressions); @@ -2054,6 +2075,10 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, err = PTR_ERR(ops); goto err1; } + if (!nft_expr_check_ops(ops)) { + err = -EINVAL; + goto err1; + } } else ops = type->ops; @@ -2434,7 +2459,6 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, { struct nft_expr *expr; - lockdep_assert_held(&ctx->net->nft.commit_mutex); /* * Careful: some expressions might not be initialized in case this * is called on error from nf_tables_newrule(). @@ -3567,13 +3591,6 @@ static void nft_set_destroy(struct nft_set *set) kvfree(set); } -static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) -{ - list_del_rcu(&set->list); - nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC); - nft_set_destroy(set); -} - static int nf_tables_delset(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -3668,17 +3685,38 @@ bind: } EXPORT_SYMBOL_GPL(nf_tables_bind_set); -void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, +void nf_tables_rebind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding) +{ + if (list_empty(&set->bindings) && nft_set_is_anonymous(set) && + nft_is_active(ctx->net, set)) + list_add_tail_rcu(&set->list, &ctx->table->sets); + + list_add_tail_rcu(&binding->list, &set->bindings); +} +EXPORT_SYMBOL_GPL(nf_tables_rebind_set); + +void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_binding *binding) { list_del_rcu(&binding->list); if (list_empty(&set->bindings) && nft_set_is_anonymous(set) && nft_is_active(ctx->net, set)) - nf_tables_set_destroy(ctx, set); + list_del_rcu(&set->list); } EXPORT_SYMBOL_GPL(nf_tables_unbind_set); +void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set) +{ + if (list_empty(&set->bindings) && nft_set_is_anonymous(set) && + nft_is_active(ctx->net, set)) { + nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC); + nft_set_destroy(set); + } +} +EXPORT_SYMBOL_GPL(nf_tables_destroy_set); + const struct nft_set_ext_type nft_set_ext_types[] = { [NFT_SET_EXT_KEY] = { .align = __alignof__(u32), @@ -6191,19 +6229,28 @@ static void nft_commit_release(struct nft_trans *trans) nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } + + if (trans->put_net) + put_net(trans->ctx.net); + kfree(trans); } -static void nf_tables_commit_release(struct net *net) +static void nf_tables_trans_destroy_work(struct work_struct *w) { struct nft_trans *trans, *next; + LIST_HEAD(head); - if (list_empty(&net->nft.commit_list)) + spin_lock(&nf_tables_destroy_list_lock); + list_splice_init(&nf_tables_destroy_list, &head); + spin_unlock(&nf_tables_destroy_list_lock); + + if (list_empty(&head)) return; synchronize_rcu(); - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe(trans, next, &head, list) { list_del(&trans->list); nft_commit_release(trans); } @@ -6334,6 +6381,37 @@ static void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nf_tables_commit_release(struct net *net) +{ + struct nft_trans *trans; + + /* all side effects have to be made visible. + * For example, if a chain named 'foo' has been deleted, a + * new transaction must not find it anymore. + * + * Memory reclaim happens asynchronously from work queue + * to prevent expensive synchronize_rcu() in commit phase. + */ + if (list_empty(&net->nft.commit_list)) { + mutex_unlock(&net->nft.commit_mutex); + return; + } + + trans = list_last_entry(&net->nft.commit_list, + struct nft_trans, list); + get_net(trans->ctx.net); + WARN_ON_ONCE(trans->put_net); + + trans->put_net = true; + spin_lock(&nf_tables_destroy_list_lock); + list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list); + spin_unlock(&nf_tables_destroy_list_lock); + + mutex_unlock(&net->nft.commit_mutex); + + schedule_work(&trans_destroy_work); +} + static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nft_trans *trans, *next; @@ -6495,9 +6573,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } } - nf_tables_commit_release(net); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); - mutex_unlock(&net->nft.commit_mutex); + nf_tables_commit_release(net); return 0; } @@ -7168,7 +7245,8 @@ int __nft_release_basechain(struct nft_ctx *ctx) { struct nft_rule *rule, *nr; - BUG_ON(!nft_is_base_chain(ctx->chain)); + if (WARN_ON(!nft_is_base_chain(ctx->chain))) + return 0; nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { @@ -7202,9 +7280,6 @@ static void __nft_release_tables(struct net *net) list_for_each_entry(chain, &table->chains, list) nf_tables_unregister_hook(net, table, chain); - list_for_each_entry(flowtable, &table->flowtables, list) - nf_unregister_net_hooks(net, flowtable->ops, - flowtable->ops_len); /* No packets are walking on these chains anymore. */ ctx.table = table; list_for_each_entry(chain, &table->chains, list) { @@ -7271,6 +7346,7 @@ static int __init nf_tables_module_init(void) { int err; + spin_lock_init(&nf_tables_destroy_list_lock); err = register_pernet_subsys(&nf_tables_net_ops); if (err < 0) return err; @@ -7310,6 +7386,7 @@ static void __exit nf_tables_module_exit(void) unregister_netdevice_notifier(&nf_tables_flowtable_notifier); nft_chain_filter_fini(); unregister_pernet_subsys(&nf_tables_net_ops); + cancel_work_sync(&trans_destroy_work); rcu_barrier(); nf_tables_core_module_exit(); } diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index ffd5c0f9412b..3fbce3b9c5ec 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -249,12 +249,24 @@ static struct nft_expr_type *nft_basic_types[] = { &nft_exthdr_type, }; +static struct nft_object_type *nft_basic_objects[] = { +#ifdef CONFIG_NETWORK_SECMARK + &nft_secmark_obj_type, +#endif +}; + int __init nf_tables_core_module_init(void) { - int err, i; + int err, i, j = 0; + + for (i = 0; i < ARRAY_SIZE(nft_basic_objects); i++) { + err = nft_register_obj(nft_basic_objects[i]); + if (err) + goto err; + } - for (i = 0; i < ARRAY_SIZE(nft_basic_types); i++) { - err = nft_register_expr(nft_basic_types[i]); + for (j = 0; j < ARRAY_SIZE(nft_basic_types); j++) { + err = nft_register_expr(nft_basic_types[j]); if (err) goto err; } @@ -262,8 +274,12 @@ int __init nf_tables_core_module_init(void) return 0; err: + while (j-- > 0) + nft_unregister_expr(nft_basic_types[j]); + while (i-- > 0) - nft_unregister_expr(nft_basic_types[i]); + nft_unregister_obj(nft_basic_objects[i]); + return err; } @@ -274,4 +290,8 @@ void nf_tables_core_module_exit(void) i = ARRAY_SIZE(nft_basic_types); while (i-- > 0) nft_unregister_expr(nft_basic_types[i]); + + i = ARRAY_SIZE(nft_basic_objects); + while (i-- > 0) + nft_unregister_obj(nft_basic_objects[i]); } diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index a30f8ba4b89a..a518eb162344 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -53,9 +53,6 @@ ctnl_timeout_parse_policy(void *timeout, struct nlattr **tb; int ret = 0; - if (!l4proto->ctnl_timeout.nlattr_to_obj) - return 0; - tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), GFP_KERNEL); @@ -125,7 +122,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, return -EBUSY; } - l4proto = nf_ct_l4proto_find_get(l3num, l4num); + l4proto = nf_ct_l4proto_find_get(l4num); /* This protocol is not supportted, skip. */ if (l4proto->l4proto != l4num) { @@ -167,6 +164,8 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto; + struct nlattr *nest_parms; + int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); @@ -186,22 +185,15 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, htonl(refcount_read(&timeout->refcnt)))) goto nla_put_failure; - if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { - struct nlattr *nest_parms; - int ret; - - nest_parms = nla_nest_start(skb, - CTA_TIMEOUT_DATA | NLA_F_NESTED); - if (!nest_parms) - goto nla_put_failure; + nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; - ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, - &timeout->timeout.data); - if (ret < 0) - goto nla_put_failure; + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->timeout.data); + if (ret < 0) + goto nla_put_failure; - nla_nest_end(skb, nest_parms); - } + nla_nest_end(skb, nest_parms); nlmsg_end(skb, nlh); return skb->len; @@ -358,7 +350,6 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl, struct netlink_ext_ack *extack) { const struct nf_conntrack_l4proto *l4proto; - __u16 l3num; __u8 l4num; int ret; @@ -367,9 +358,8 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl, !cda[CTA_TIMEOUT_DATA]) return -EINVAL; - l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); - l4proto = nf_ct_l4proto_find_get(l3num, l4num); + l4proto = nf_ct_l4proto_find_get(l4num); /* This protocol is not supported, skip. */ if (l4proto->l4proto != l4num) { @@ -391,12 +381,15 @@ err: static int cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, - u32 seq, u32 type, int event, - const struct nf_conntrack_l4proto *l4proto) + u32 seq, u32 type, int event, u16 l3num, + const struct nf_conntrack_l4proto *l4proto, + const unsigned int *timeouts) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; + struct nlattr *nest_parms; + int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); @@ -408,25 +401,19 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; - if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l4proto->l3proto)) || + if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) || nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) goto nla_put_failure; - if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { - struct nlattr *nest_parms; - int ret; - - nest_parms = nla_nest_start(skb, - CTA_TIMEOUT_DATA | NLA_F_NESTED); - if (!nest_parms) - goto nla_put_failure; + nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; - ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL); - if (ret < 0) - goto nla_put_failure; + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); + if (ret < 0) + goto nla_put_failure; - nla_nest_end(skb, nest_parms); - } + nla_nest_end(skb, nest_parms); nlmsg_end(skb, nlh); return skb->len; @@ -444,6 +431,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, struct netlink_ext_ack *extack) { const struct nf_conntrack_l4proto *l4proto; + unsigned int *timeouts = NULL; struct sk_buff *skb2; int ret, err; __u16 l3num; @@ -454,14 +442,46 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); - l4proto = nf_ct_l4proto_find_get(l3num, l4num); + l4proto = nf_ct_l4proto_find_get(l4num); - /* This protocol is not supported, skip. */ - if (l4proto->l4proto != l4num) { - err = -EOPNOTSUPP; + err = -EOPNOTSUPP; + if (l4proto->l4proto != l4num) goto err; + + switch (l4proto->l4proto) { + case IPPROTO_ICMP: + timeouts = &nf_icmp_pernet(net)->timeout; + break; + case IPPROTO_TCP: + timeouts = nf_tcp_pernet(net)->timeouts; + break; + case IPPROTO_UDP: + timeouts = nf_udp_pernet(net)->timeouts; + break; + case IPPROTO_DCCP: +#ifdef CONFIG_NF_CT_PROTO_DCCP + timeouts = nf_dccp_pernet(net)->dccp_timeout; +#endif + break; + case IPPROTO_ICMPV6: + timeouts = &nf_icmpv6_pernet(net)->timeout; + break; + case IPPROTO_SCTP: +#ifdef CONFIG_NF_CT_PROTO_SCTP + timeouts = nf_sctp_pernet(net)->timeouts; +#endif + break; + case 255: + timeouts = &nf_generic_pernet(net)->timeout; + break; + default: + WARN_ON_ONCE(1); + break; } + if (!timeouts) + goto err; + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) { err = -ENOMEM; @@ -472,7 +492,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, nlh->nlmsg_seq, NFNL_MSG_TYPE(nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_DEFAULT_SET, - l4proto); + l3num, l4proto, timeouts); if (ret <= 0) { kfree_skb(skb2); err = -ENOMEM; diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 00db27dfd2ff..6f41dd74729d 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -30,32 +30,27 @@ EXPORT_SYMBOL_GPL(nf_osf_fingers); static inline int nf_osf_ttl(const struct sk_buff *skb, int ttl_check, unsigned char f_ttl) { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); const struct iphdr *ip = ip_hdr(skb); - - if (ttl_check != -1) { - if (ttl_check == NF_OSF_TTL_TRUE) - return ip->ttl == f_ttl; - if (ttl_check == NF_OSF_TTL_NOCHECK) - return 1; - else if (ip->ttl <= f_ttl) - return 1; - else { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); - int ret = 0; - - for_ifa(in_dev) { - if (inet_ifa_match(ip->saddr, ifa)) { - ret = (ip->ttl == f_ttl); - break; - } - } - endfor_ifa(in_dev); - - return ret; + int ret = 0; + + if (ttl_check == NF_OSF_TTL_TRUE) + return ip->ttl == f_ttl; + if (ttl_check == NF_OSF_TTL_NOCHECK) + return 1; + else if (ip->ttl <= f_ttl) + return 1; + + for_ifa(in_dev) { + if (inet_ifa_match(ip->saddr, ifa)) { + ret = (ip->ttl == f_ttl); + break; } } - return ip->ttl == f_ttl; + endfor_ifa(in_dev); + + return ret; } struct nf_osf_hdr_ctx { @@ -213,7 +208,7 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family, if (!tcp) return false; - ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : -1; + ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : 0; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { @@ -257,7 +252,8 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family, EXPORT_SYMBOL_GPL(nf_osf_match); const char *nf_osf_find(const struct sk_buff *skb, - const struct list_head *nf_osf_fingers) + const struct list_head *nf_osf_fingers, + const int ttl_check) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; @@ -275,7 +271,7 @@ const char *nf_osf_find(const struct sk_buff *skb, list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; - if (!nf_osf_match_one(skb, f, -1, &ctx)) + if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; genre = f->genre; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index d33094f4ec41..43041f087eb3 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -765,7 +765,7 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, return ret; } - skb->next = NULL; + skb_mark_not_on_list(skb); entry_seg = nf_queue_entry_dup(entry); if (entry_seg) { diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index fa90a8402845..79d48c1d06f4 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -79,7 +79,8 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc, tb[NFTA_CMP_DATA]); - BUG_ON(err < 0); + if (err < 0) + return err; priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); err = nft_validate_register_load(priv->sreg, desc.len); @@ -129,7 +130,8 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, err = nft_data_init(NULL, &data, sizeof(data), &desc, tb[NFTA_CMP_DATA]); - BUG_ON(err < 0); + if (err < 0) + return err; priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); err = nft_validate_register_load(priv->sreg, desc.len); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 32535eea51b2..9d0ede474224 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -54,9 +54,11 @@ static bool nft_xt_put(struct nft_xt *xt) return false; } -static int nft_compat_chain_validate_dependency(const char *tablename, - const struct nft_chain *chain) +static int nft_compat_chain_validate_dependency(const struct nft_ctx *ctx, + const char *tablename) { + enum nft_chain_types type = NFT_CHAIN_T_DEFAULT; + const struct nft_chain *chain = ctx->chain; const struct nft_base_chain *basechain; if (!tablename || @@ -64,9 +66,12 @@ static int nft_compat_chain_validate_dependency(const char *tablename, return 0; basechain = nft_base_chain(chain); - if (strcmp(tablename, "nat") == 0 && - basechain->type->type != NFT_CHAIN_T_NAT) - return -EINVAL; + if (strcmp(tablename, "nat") == 0) { + if (ctx->family != NFPROTO_BRIDGE) + type = NFT_CHAIN_T_NAT; + if (basechain->type->type != type) + return -EINVAL; + } return 0; } @@ -290,6 +295,24 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) module_put(target->me); } +static int nft_extension_dump_info(struct sk_buff *skb, int attr, + const void *info, + unsigned int size, unsigned int user_size) +{ + unsigned int info_size, aligned_size = XT_ALIGN(size); + struct nlattr *nla; + + nla = nla_reserve(skb, attr, aligned_size); + if (!nla) + return -1; + + info_size = user_size ? : size; + memcpy(nla_data(nla), info, info_size); + memset(nla_data(nla) + info_size, 0, aligned_size - info_size); + + return 0; +} + static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct xt_target *target = expr->ops->data; @@ -297,7 +320,8 @@ static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) || nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) || - nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(target->targetsize), info)) + nft_extension_dump_info(skb, NFTA_TARGET_INFO, info, + target->targetsize, target->usersize)) goto nla_put_failure; return 0; @@ -323,8 +347,7 @@ static int nft_target_validate(const struct nft_ctx *ctx, if (target->hooks && !(hook_mask & target->hooks)) return -EINVAL; - ret = nft_compat_chain_validate_dependency(target->table, - ctx->chain); + ret = nft_compat_chain_validate_dependency(ctx, target->table); if (ret < 0) return ret; } @@ -532,7 +555,8 @@ static int __nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr, if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) || nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) || - nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(match->matchsize), info)) + nft_extension_dump_info(skb, NFTA_MATCH_INFO, info, + match->matchsize, match->usersize)) goto nla_put_failure; return 0; @@ -570,8 +594,7 @@ static int nft_match_validate(const struct nft_ctx *ctx, if (match->hooks && !(hook_mask & match->hooks)) return -EINVAL; - ret = nft_compat_chain_validate_dependency(match->table, - ctx->chain); + ret = nft_compat_chain_validate_dependency(ctx, match->table); if (ret < 0) return ret; } diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 5dd87748afa8..586627c361df 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -279,7 +279,7 @@ static void nft_ct_set_eval(const struct nft_expr *expr, { const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; -#ifdef CONFIG_NF_CONNTRACK_MARK +#if defined(CONFIG_NF_CONNTRACK_MARK) || defined(CONFIG_NF_CONNTRACK_SECMARK) u32 value = regs->data[priv->sreg]; #endif enum ip_conntrack_info ctinfo; @@ -298,6 +298,14 @@ static void nft_ct_set_eval(const struct nft_expr *expr, } break; #endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + case NFT_CT_SECMARK: + if (ct->secmark != value) { + ct->secmark = value; + nf_conntrack_event_cache(IPCT_SECMARK, ct); + } + break; +#endif #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: nf_connlabels_replace(ct, @@ -564,6 +572,13 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, return -EINVAL; len = sizeof(u32); break; +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + case NFT_CT_SECMARK: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; + len = sizeof(u32); + break; #endif default: return -EOPNOTSUPP; @@ -776,9 +791,6 @@ nft_ct_timeout_parse_policy(void *timeouts, struct nlattr **tb; int ret = 0; - if (!l4proto->ctnl_timeout.nlattr_to_obj) - return 0; - tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), GFP_KERNEL); @@ -858,7 +870,7 @@ static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]); priv->l4proto = l4num; - l4proto = nf_ct_l4proto_find_get(l3num, l4num); + l4proto = nf_ct_l4proto_find_get(l4num); if (l4proto->l4proto != l4num) { ret = -EOPNOTSUPP; diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index 2cc1e0ef56e8..15cc62b293d6 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -46,8 +46,6 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx, return nft_validate_register_load(priv->sreg_dev, sizeof(int)); } -static const struct nft_expr_ops nft_dup_netdev_ingress_ops; - static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) { struct nft_dup_netdev *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 6e91a37d57f2..07d4efd3d851 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -235,14 +235,31 @@ err1: return err; } +static void nft_dynset_activate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_dynset *priv = nft_expr_priv(expr); + + nf_tables_rebind_set(ctx, priv->set, &priv->binding); +} + +static void nft_dynset_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_dynset *priv = nft_expr_priv(expr); + + nf_tables_unbind_set(ctx, priv->set, &priv->binding); +} + static void nft_dynset_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_dynset *priv = nft_expr_priv(expr); - nf_tables_unbind_set(ctx, priv->set, &priv->binding); if (priv->expr != NULL) nft_expr_destroy(ctx, priv->expr); + + nf_tables_destroy_set(ctx, priv->set); } static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -279,6 +296,8 @@ static const struct nft_expr_ops nft_dynset_ops = { .eval = nft_dynset_eval, .init = nft_dynset_init, .destroy = nft_dynset_destroy, + .activate = nft_dynset_activate, + .deactivate = nft_dynset_deactivate, .dump = nft_dynset_dump, }; diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index d6bab8c3cbb0..e82d9a966c45 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -201,7 +201,7 @@ static int flow_offload_netdev_event(struct notifier_block *this, if (event != NETDEV_DOWN) return NOTIFY_DONE; - nf_flow_table_cleanup(dev_net(dev), dev); + nf_flow_table_cleanup(dev); return NOTIFY_DONE; } diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 8abb9891cdf2..d7694e7255a0 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -53,8 +53,6 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx, return nft_validate_register_load(priv->sreg_dev, sizeof(int)); } -static const struct nft_expr_ops nft_fwd_netdev_ingress_ops; - static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) { struct nft_fwd_netdev *priv = nft_expr_priv(expr); @@ -169,8 +167,6 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx, return nft_validate_register_load(priv->sreg_addr, addr_len); } -static const struct nft_expr_ops nft_fwd_netdev_ingress_ops; - static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr) { struct nft_fwd_neigh *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index ad13e8643599..227b2b15a19c 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -121,12 +121,28 @@ static int nft_lookup_init(const struct nft_ctx *ctx, return 0; } +static void nft_lookup_activate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_lookup *priv = nft_expr_priv(expr); + + nf_tables_rebind_set(ctx, priv->set, &priv->binding); +} + +static void nft_lookup_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_lookup *priv = nft_expr_priv(expr); + + nf_tables_unbind_set(ctx, priv->set, &priv->binding); +} + static void nft_lookup_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_lookup *priv = nft_expr_priv(expr); - nf_tables_unbind_set(ctx, priv->set, &priv->binding); + nf_tables_destroy_set(ctx, priv->set); } static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -209,6 +225,8 @@ static const struct nft_expr_ops nft_lookup_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)), .eval = nft_lookup_eval, .init = nft_lookup_init, + .activate = nft_lookup_activate, + .deactivate = nft_lookup_deactivate, .destroy = nft_lookup_destroy, .dump = nft_lookup_dump, .validate = nft_lookup_validate, diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 297fe7d97c18..6180626c3f80 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -284,6 +284,11 @@ static void nft_meta_set_eval(const struct nft_expr *expr, skb->nf_trace = !!value8; break; +#ifdef CONFIG_NETWORK_SECMARK + case NFT_META_SECMARK: + skb->secmark = value; + break; +#endif default: WARN_ON(1); } @@ -436,6 +441,9 @@ static int nft_meta_set_init(const struct nft_ctx *ctx, switch (priv->key) { case NFT_META_MARK: case NFT_META_PRIORITY: +#ifdef CONFIG_NETWORK_SECMARK + case NFT_META_SECMARK: +#endif len = sizeof(u32); break; case NFT_META_NFTRACE: @@ -543,3 +551,111 @@ struct nft_expr_type nft_meta_type __read_mostly = { .maxattr = NFTA_META_MAX, .owner = THIS_MODULE, }; + +#ifdef CONFIG_NETWORK_SECMARK +struct nft_secmark { + u32 secid; + char *ctx; +}; + +static const struct nla_policy nft_secmark_policy[NFTA_SECMARK_MAX + 1] = { + [NFTA_SECMARK_CTX] = { .type = NLA_STRING, .len = NFT_SECMARK_CTX_MAXLEN }, +}; + +static int nft_secmark_compute_secid(struct nft_secmark *priv) +{ + u32 tmp_secid = 0; + int err; + + err = security_secctx_to_secid(priv->ctx, strlen(priv->ctx), &tmp_secid); + if (err) + return err; + + if (!tmp_secid) + return -ENOENT; + + err = security_secmark_relabel_packet(tmp_secid); + if (err) + return err; + + priv->secid = tmp_secid; + return 0; +} + +static void nft_secmark_obj_eval(struct nft_object *obj, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_secmark *priv = nft_obj_data(obj); + struct sk_buff *skb = pkt->skb; + + skb->secmark = priv->secid; +} + +static int nft_secmark_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_secmark *priv = nft_obj_data(obj); + int err; + + if (tb[NFTA_SECMARK_CTX] == NULL) + return -EINVAL; + + priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL); + if (!priv->ctx) + return -ENOMEM; + + err = nft_secmark_compute_secid(priv); + if (err) { + kfree(priv->ctx); + return err; + } + + security_secmark_refcount_inc(); + + return 0; +} + +static int nft_secmark_obj_dump(struct sk_buff *skb, struct nft_object *obj, + bool reset) +{ + struct nft_secmark *priv = nft_obj_data(obj); + int err; + + if (nla_put_string(skb, NFTA_SECMARK_CTX, priv->ctx)) + return -1; + + if (reset) { + err = nft_secmark_compute_secid(priv); + if (err) + return err; + } + + return 0; +} + +static void nft_secmark_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) +{ + struct nft_secmark *priv = nft_obj_data(obj); + + security_secmark_refcount_dec(); + + kfree(priv->ctx); +} + +static const struct nft_object_ops nft_secmark_obj_ops = { + .type = &nft_secmark_obj_type, + .size = sizeof(struct nft_secmark), + .init = nft_secmark_obj_init, + .eval = nft_secmark_obj_eval, + .dump = nft_secmark_obj_dump, + .destroy = nft_secmark_obj_destroy, +}; +struct nft_object_type nft_secmark_obj_type __read_mostly = { + .type = NFT_OBJECT_SECMARK, + .ops = &nft_secmark_obj_ops, + .maxattr = NFTA_SECMARK_MAX, + .policy = nft_secmark_policy, + .owner = THIS_MODULE, +}; +#endif /* CONFIG_NETWORK_SECMARK */ diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 649d1700ec5b..3cc1b3dc3c3c 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -24,7 +24,6 @@ struct nft_ng_inc { u32 modulus; atomic_t counter; u32 offset; - struct nft_set *map; }; static u32 nft_ng_inc_gen(struct nft_ng_inc *priv) @@ -48,34 +47,11 @@ static void nft_ng_inc_eval(const struct nft_expr *expr, regs->data[priv->dreg] = nft_ng_inc_gen(priv); } -static void nft_ng_inc_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_ng_inc *priv = nft_expr_priv(expr); - const struct nft_set *map = priv->map; - const struct nft_set_ext *ext; - u32 result; - bool found; - - result = nft_ng_inc_gen(priv); - found = map->ops->lookup(nft_net(pkt), map, &result, &ext); - - if (!found) - return; - - nft_data_copy(®s->data[priv->dreg], - nft_set_ext_data(ext), map->dlen); -} - static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { [NFTA_NG_DREG] = { .type = NLA_U32 }, [NFTA_NG_MODULUS] = { .type = NLA_U32 }, [NFTA_NG_TYPE] = { .type = NLA_U32 }, [NFTA_NG_OFFSET] = { .type = NLA_U32 }, - [NFTA_NG_SET_NAME] = { .type = NLA_STRING, - .len = NFT_SET_MAXNAMELEN - 1 }, - [NFTA_NG_SET_ID] = { .type = NLA_U32 }, }; static int nft_ng_inc_init(const struct nft_ctx *ctx, @@ -101,22 +77,6 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_ng_inc_map_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_ng_inc *priv = nft_expr_priv(expr); - u8 genmask = nft_genmask_next(ctx->net); - - nft_ng_inc_init(ctx, expr, tb); - - priv->map = nft_set_lookup_global(ctx->net, ctx->table, - tb[NFTA_NG_SET_NAME], - tb[NFTA_NG_SET_ID], genmask); - - return PTR_ERR_OR_ZERO(priv->map); -} - static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, u32 modulus, enum nft_ng_types type, u32 offset) { @@ -143,27 +103,10 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) priv->offset); } -static int nft_ng_inc_map_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct nft_ng_inc *priv = nft_expr_priv(expr); - - if (nft_ng_dump(skb, priv->dreg, priv->modulus, - NFT_NG_INCREMENTAL, priv->offset) || - nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name)) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -1; -} - struct nft_ng_random { enum nft_registers dreg:8; u32 modulus; u32 offset; - struct nft_set *map; }; static u32 nft_ng_random_gen(struct nft_ng_random *priv) @@ -183,25 +126,6 @@ static void nft_ng_random_eval(const struct nft_expr *expr, regs->data[priv->dreg] = nft_ng_random_gen(priv); } -static void nft_ng_random_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_ng_random *priv = nft_expr_priv(expr); - const struct nft_set *map = priv->map; - const struct nft_set_ext *ext; - u32 result; - bool found; - - result = nft_ng_random_gen(priv); - found = map->ops->lookup(nft_net(pkt), map, &result, &ext); - if (!found) - return; - - nft_data_copy(®s->data[priv->dreg], - nft_set_ext_data(ext), map->dlen); -} - static int nft_ng_random_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -226,21 +150,6 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_ng_random_map_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_ng_random *priv = nft_expr_priv(expr); - u8 genmask = nft_genmask_next(ctx->net); - - nft_ng_random_init(ctx, expr, tb); - priv->map = nft_set_lookup_global(ctx->net, ctx->table, - tb[NFTA_NG_SET_NAME], - tb[NFTA_NG_SET_ID], genmask); - - return PTR_ERR_OR_ZERO(priv->map); -} - static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ng_random *priv = nft_expr_priv(expr); @@ -249,22 +158,6 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) priv->offset); } -static int nft_ng_random_map_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct nft_ng_random *priv = nft_expr_priv(expr); - - if (nft_ng_dump(skb, priv->dreg, priv->modulus, - NFT_NG_RANDOM, priv->offset) || - nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name)) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -1; -} - static struct nft_expr_type nft_ng_type; static const struct nft_expr_ops nft_ng_inc_ops = { .type = &nft_ng_type, @@ -274,14 +167,6 @@ static const struct nft_expr_ops nft_ng_inc_ops = { .dump = nft_ng_inc_dump, }; -static const struct nft_expr_ops nft_ng_inc_map_ops = { - .type = &nft_ng_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)), - .eval = nft_ng_inc_map_eval, - .init = nft_ng_inc_map_init, - .dump = nft_ng_inc_map_dump, -}; - static const struct nft_expr_ops nft_ng_random_ops = { .type = &nft_ng_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), @@ -290,14 +175,6 @@ static const struct nft_expr_ops nft_ng_random_ops = { .dump = nft_ng_random_dump, }; -static const struct nft_expr_ops nft_ng_random_map_ops = { - .type = &nft_ng_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), - .eval = nft_ng_random_map_eval, - .init = nft_ng_random_map_init, - .dump = nft_ng_random_map_dump, -}; - static const struct nft_expr_ops * nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { @@ -312,12 +189,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) switch (type) { case NFT_NG_INCREMENTAL: - if (tb[NFTA_NG_SET_NAME]) - return &nft_ng_inc_map_ops; return &nft_ng_inc_ops; case NFT_NG_RANDOM: - if (tb[NFTA_NG_SET_NAME]) - return &nft_ng_random_map_ops; return &nft_ng_random_ops; } diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index cdf348f751ec..a3185ca2a3a9 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -155,12 +155,28 @@ nla_put_failure: return -1; } +static void nft_objref_map_activate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_objref_map *priv = nft_expr_priv(expr); + + nf_tables_rebind_set(ctx, priv->set, &priv->binding); +} + +static void nft_objref_map_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_objref_map *priv = nft_expr_priv(expr); + + nf_tables_unbind_set(ctx, priv->set, &priv->binding); +} + static void nft_objref_map_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_objref_map *priv = nft_expr_priv(expr); - nf_tables_unbind_set(ctx, priv->set, &priv->binding); + nf_tables_destroy_set(ctx, priv->set); } static struct nft_expr_type nft_objref_type; @@ -169,6 +185,8 @@ static const struct nft_expr_ops nft_objref_map_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), .eval = nft_objref_map_eval, .init = nft_objref_map_init, + .activate = nft_objref_map_activate, + .deactivate = nft_objref_map_deactivate, .destroy = nft_objref_map_destroy, .dump = nft_objref_map_dump, }; diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 5af74b37f423..b13618c764ec 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -6,10 +6,12 @@ struct nft_osf { enum nft_registers dreg:8; + u8 ttl; }; static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = { [NFTA_OSF_DREG] = { .type = NLA_U32 }, + [NFTA_OSF_TTL] = { .type = NLA_U8 }, }; static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, @@ -33,7 +35,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, return; } - os_name = nf_osf_find(skb, nf_osf_fingers); + os_name = nf_osf_find(skb, nf_osf_fingers, priv->ttl); if (!os_name) strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); else @@ -46,10 +48,18 @@ static int nft_osf_init(const struct nft_ctx *ctx, { struct nft_osf *priv = nft_expr_priv(expr); int err; + u8 ttl; + + if (tb[NFTA_OSF_TTL]) { + ttl = nla_get_u8(tb[NFTA_OSF_TTL]); + if (ttl > 2) + return -EINVAL; + priv->ttl = ttl; + } priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]); err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFTA_DATA_VALUE, NFT_OSF_MAXGENRELEN); + NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN); if (err < 0) return err; @@ -60,6 +70,9 @@ static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_osf *priv = nft_expr_priv(expr); + if (nla_put_u8(skb, NFTA_OSF_TTL, priv->ttl)) + goto nla_put_failure; + if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg)) goto nla_put_failure; @@ -69,6 +82,15 @@ nla_put_failure: return -1; } +static int nft_osf_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_FORWARD)); +} + static struct nft_expr_type nft_osf_type; static const struct nft_expr_ops nft_osf_op = { .eval = nft_osf_eval, @@ -76,6 +98,7 @@ static const struct nft_expr_ops nft_osf_op = { .init = nft_osf_init, .dump = nft_osf_dump, .type = &nft_osf_type, + .validate = nft_osf_validate, }; static struct nft_expr_type nft_osf_type __read_mostly = { diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 29f5bd2377b0..b48e58cceeb7 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -94,7 +94,8 @@ static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX + 1] = { int nft_reject_icmp_code(u8 code) { - BUG_ON(code > NFT_REJECT_ICMPX_MAX); + if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX)) + return ICMP_NET_UNREACH; return icmp_code_v4[code]; } @@ -111,7 +112,8 @@ static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX + 1] = { int nft_reject_icmpv6_code(u8 code) { - BUG_ON(code > NFT_REJECT_ICMPX_MAX); + if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX)) + return ICMPV6_NOROUTE; return icmp_code_v6[code]; } diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 76dba9f6b6f6..f35fa33913ae 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -90,6 +90,11 @@ static void nft_rt_get_eval(const struct nft_expr *expr, case NFT_RT_TCPMSS: nft_reg_store16(dest, get_tcpmss(pkt, dst)); break; +#ifdef CONFIG_XFRM + case NFT_RT_XFRM: + nft_reg_store8(dest, !!dst->xfrm); + break; +#endif default: WARN_ON(1); goto err; @@ -130,6 +135,11 @@ static int nft_rt_get_init(const struct nft_ctx *ctx, case NFT_RT_TCPMSS: len = sizeof(u16); break; +#ifdef CONFIG_XFRM + case NFT_RT_XFRM: + len = sizeof(u8); + break; +#endif default: return -EOPNOTSUPP; } @@ -164,6 +174,7 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp case NFT_RT_NEXTHOP4: case NFT_RT_NEXTHOP6: case NFT_RT_CLASSID: + case NFT_RT_XFRM: return 0; case NFT_RT_TCPMSS: hooks = (1 << NF_INET_FORWARD) | diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 015124e649cb..339a9dd1c832 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -88,7 +88,7 @@ static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, .key = key, }; - he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); + he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) *ext = &he->ext; @@ -106,7 +106,7 @@ static void *nft_rhash_get(const struct net *net, const struct nft_set *set, .key = elem->key.val.data, }; - he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); + he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) return he; @@ -129,7 +129,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, .key = key, }; - he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); + he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) goto out; @@ -217,7 +217,7 @@ static void *nft_rhash_deactivate(const struct net *net, }; rcu_read_lock(); - he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); + he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL && !nft_rhash_flush(net, set, he)) he = NULL; @@ -244,21 +244,15 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_rhash_elem *he; struct rhashtable_iter hti; struct nft_set_elem elem; - int err; - - err = rhashtable_walk_init(&priv->ht, &hti, GFP_ATOMIC); - iter->err = err; - if (err) - return; + rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { - err = PTR_ERR(he); - if (err != -EAGAIN) { - iter->err = err; - goto out; + if (PTR_ERR(he) != -EAGAIN) { + iter->err = PTR_ERR(he); + break; } continue; @@ -275,13 +269,11 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, iter->err = iter->fn(ctx, set, iter, &elem); if (iter->err < 0) - goto out; + break; cont: iter->count++; } - -out: rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); } @@ -293,21 +285,17 @@ static void nft_rhash_gc(struct work_struct *work) struct nft_rhash *priv; struct nft_set_gc_batch *gcb = NULL; struct rhashtable_iter hti; - int err; priv = container_of(work, struct nft_rhash, gc_work.work); set = nft_set_container_of(priv); - err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL); - if (err) - goto schedule; - + rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { if (PTR_ERR(he) != -EAGAIN) - goto out; + break; continue; } @@ -326,17 +314,15 @@ gc: gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); if (gcb == NULL) - goto out; + break; rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); atomic_dec(&set->nelems); nft_set_gc_batch_add(gcb, he); } -out: rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); nft_set_gc_batch_complete(gcb); -schedule: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 55e2d9215c0d..fa61208371f8 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -135,9 +135,12 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, d = memcmp(this, key, set->klen); if (d < 0) { parent = rcu_dereference_raw(parent->rb_left); - interval = rbe; + if (!(flags & NFT_SET_ELEM_INTERVAL_END)) + interval = rbe; } else if (d > 0) { parent = rcu_dereference_raw(parent->rb_right); + if (flags & NFT_SET_ELEM_INTERVAL_END) + interval = rbe; } else { if (!nft_set_elem_active(&rbe->ext, genmask)) parent = rcu_dereference_raw(parent->rb_left); @@ -154,7 +157,10 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && - !nft_rbtree_interval_end(interval)) { + ((!nft_rbtree_interval_end(interval) && + !(flags & NFT_SET_ELEM_INTERVAL_END)) || + (nft_rbtree_interval_end(interval) && + (flags & NFT_SET_ELEM_INTERVAL_END)))) { *elem = interval; return true; } @@ -355,12 +361,11 @@ cont: static void nft_rbtree_gc(struct work_struct *work) { + struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; struct nft_set_gc_batch *gcb = NULL; - struct rb_node *node, *prev = NULL; - struct nft_rbtree_elem *rbe; struct nft_rbtree *priv; + struct rb_node *node; struct nft_set *set; - int i; priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); @@ -371,7 +376,7 @@ static void nft_rbtree_gc(struct work_struct *work) rbe = rb_entry(node, struct nft_rbtree_elem, node); if (nft_rbtree_interval_end(rbe)) { - prev = node; + rbe_end = rbe; continue; } if (!nft_set_elem_expired(&rbe->ext)) @@ -379,29 +384,30 @@ static void nft_rbtree_gc(struct work_struct *work) if (nft_set_elem_mark_busy(&rbe->ext)) continue; + if (rbe_prev) { + rb_erase(&rbe_prev->node, &priv->root); + rbe_prev = NULL; + } gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); if (!gcb) break; atomic_dec(&set->nelems); nft_set_gc_batch_add(gcb, rbe); + rbe_prev = rbe; - if (prev) { - rbe = rb_entry(prev, struct nft_rbtree_elem, node); + if (rbe_end) { atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe); - prev = NULL; + nft_set_gc_batch_add(gcb, rbe_end); + rb_erase(&rbe_end->node, &priv->root); + rbe_end = NULL; } node = rb_next(node); if (!node) break; } - if (gcb) { - for (i = 0; i < gcb->head.cnt; i++) { - rbe = gcb->elems[i]; - rb_erase(&rbe->node, &priv->root); - } - } + if (rbe_prev) + rb_erase(&rbe_prev->node, &priv->root); write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c new file mode 100644 index 000000000000..5322609f7662 --- /dev/null +++ b/net/netfilter/nft_xfrm.c @@ -0,0 +1,294 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Generic part shared by ipv4 and ipv6 backends. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = { + [NFTA_XFRM_KEY] = { .type = NLA_U32 }, + [NFTA_XFRM_DIR] = { .type = NLA_U8 }, + [NFTA_XFRM_SPNUM] = { .type = NLA_U32 }, + [NFTA_XFRM_DREG] = { .type = NLA_U32 }, +}; + +struct nft_xfrm { + enum nft_xfrm_keys key:8; + enum nft_registers dreg:8; + u8 dir; + u8 spnum; +}; + +static int nft_xfrm_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_xfrm *priv = nft_expr_priv(expr); + unsigned int len = 0; + u32 spnum = 0; + u8 dir; + + if (!tb[NFTA_XFRM_KEY] || !tb[NFTA_XFRM_DIR] || !tb[NFTA_XFRM_DREG]) + return -EINVAL; + + switch (ctx->family) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + break; + default: + return -EOPNOTSUPP; + } + + priv->key = ntohl(nla_get_u32(tb[NFTA_XFRM_KEY])); + switch (priv->key) { + case NFT_XFRM_KEY_REQID: + case NFT_XFRM_KEY_SPI: + len = sizeof(u32); + break; + case NFT_XFRM_KEY_DADDR_IP4: + case NFT_XFRM_KEY_SADDR_IP4: + len = sizeof(struct in_addr); + break; + case NFT_XFRM_KEY_DADDR_IP6: + case NFT_XFRM_KEY_SADDR_IP6: + len = sizeof(struct in6_addr); + break; + default: + return -EINVAL; + } + + dir = nla_get_u8(tb[NFTA_XFRM_DIR]); + switch (dir) { + case XFRM_POLICY_IN: + case XFRM_POLICY_OUT: + priv->dir = dir; + break; + default: + return -EINVAL; + } + + if (tb[NFTA_XFRM_SPNUM]) + spnum = ntohl(nla_get_be32(tb[NFTA_XFRM_SPNUM])); + + if (spnum >= XFRM_MAX_DEPTH) + return -ERANGE; + + priv->spnum = spnum; + + priv->dreg = nft_parse_register(tb[NFTA_XFRM_DREG]); + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, len); +} + +/* Return true if key asks for daddr/saddr and current + * state does have a valid address (BEET, TUNNEL). + */ +static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode) +{ + switch (k) { + case NFT_XFRM_KEY_DADDR_IP4: + case NFT_XFRM_KEY_SADDR_IP4: + if (family == NFPROTO_IPV4) + break; + return false; + case NFT_XFRM_KEY_DADDR_IP6: + case NFT_XFRM_KEY_SADDR_IP6: + if (family == NFPROTO_IPV6) + break; + return false; + default: + return true; + } + + return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL; +} + +static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, + struct nft_regs *regs, + const struct xfrm_state *state) +{ + u32 *dest = ®s->data[priv->dreg]; + + if (!xfrm_state_addr_ok(priv->key, + state->props.family, + state->props.mode)) { + regs->verdict.code = NFT_BREAK; + return; + } + + switch (priv->key) { + case NFT_XFRM_KEY_UNSPEC: + case __NFT_XFRM_KEY_MAX: + WARN_ON_ONCE(1); + break; + case NFT_XFRM_KEY_DADDR_IP4: + *dest = state->id.daddr.a4; + return; + case NFT_XFRM_KEY_DADDR_IP6: + memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr)); + return; + case NFT_XFRM_KEY_SADDR_IP4: + *dest = state->props.saddr.a4; + return; + case NFT_XFRM_KEY_SADDR_IP6: + memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr)); + return; + case NFT_XFRM_KEY_REQID: + *dest = state->props.reqid; + return; + case NFT_XFRM_KEY_SPI: + *dest = state->id.spi; + return; + } + + regs->verdict.code = NFT_BREAK; +} + +static void nft_xfrm_get_eval_in(const struct nft_xfrm *priv, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct sec_path *sp = pkt->skb->sp; + const struct xfrm_state *state; + + if (sp == NULL || sp->len <= priv->spnum) { + regs->verdict.code = NFT_BREAK; + return; + } + + state = sp->xvec[priv->spnum]; + nft_xfrm_state_get_key(priv, regs, state); +} + +static void nft_xfrm_get_eval_out(const struct nft_xfrm *priv, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct dst_entry *dst = skb_dst(pkt->skb); + int i; + + for (i = 0; dst && dst->xfrm; + dst = ((const struct xfrm_dst *)dst)->child, i++) { + if (i < priv->spnum) + continue; + + nft_xfrm_state_get_key(priv, regs, dst->xfrm); + return; + } + + regs->verdict.code = NFT_BREAK; +} + +static void nft_xfrm_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_xfrm *priv = nft_expr_priv(expr); + + switch (priv->dir) { + case XFRM_POLICY_IN: + nft_xfrm_get_eval_in(priv, regs, pkt); + break; + case XFRM_POLICY_OUT: + nft_xfrm_get_eval_out(priv, regs, pkt); + break; + default: + WARN_ON_ONCE(1); + regs->verdict.code = NFT_BREAK; + break; + } +} + +static int nft_xfrm_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_xfrm *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_XFRM_DREG, priv->dreg)) + return -1; + + if (nla_put_be32(skb, NFTA_XFRM_KEY, htonl(priv->key))) + return -1; + if (nla_put_u8(skb, NFTA_XFRM_DIR, priv->dir)) + return -1; + if (nla_put_be32(skb, NFTA_XFRM_SPNUM, htonl(priv->spnum))) + return -1; + + return 0; +} + +static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data) +{ + const struct nft_xfrm *priv = nft_expr_priv(expr); + unsigned int hooks; + + switch (priv->dir) { + case XFRM_POLICY_IN: + hooks = (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_PRE_ROUTING); + break; + case XFRM_POLICY_OUT: + hooks = (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING); + break; + default: + WARN_ON_ONCE(1); + return -EINVAL; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); +} + + +static struct nft_expr_type nft_xfrm_type; +static const struct nft_expr_ops nft_xfrm_get_ops = { + .type = &nft_xfrm_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_xfrm)), + .eval = nft_xfrm_get_eval, + .init = nft_xfrm_get_init, + .dump = nft_xfrm_get_dump, + .validate = nft_xfrm_validate, +}; + +static struct nft_expr_type nft_xfrm_type __read_mostly = { + .name = "xfrm", + .ops = &nft_xfrm_get_ops, + .policy = nft_xfrm_policy, + .maxattr = NFTA_XFRM_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_xfrm_module_init(void) +{ + return nft_register_expr(&nft_xfrm_type); +} + +static void __exit nft_xfrm_module_exit(void) +{ + nft_unregister_expr(&nft_xfrm_type); +} + +module_init(nft_xfrm_module_init); +module_exit(nft_xfrm_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("nf_tables: xfrm/IPSec matching"); +MODULE_AUTHOR("Florian Westphal "); +MODULE_AUTHOR("Máté Eckl "); +MODULE_ALIAS_NFT_EXPR("xfrm"); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 89457efd2e00..2c7a4b80206f 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -159,7 +159,7 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, /* Make sure the timeout policy matches any existing protocol tracker, * otherwise default to generic. */ - l4proto = __nf_ct_l4proto_find(par->family, proto); + l4proto = __nf_ct_l4proto_find(proto); if (timeout->l4proto->l4proto != l4proto->l4proto) { ret = -EINVAL; pr_info_ratelimited("Timeout policy `%s' can only be used by L%d protocol number %d\n", diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 5ee859193783..eb4cbd244c3d 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -68,8 +68,6 @@ struct idletimer_tg *__idletimer_tg_find_by_label(const char *label) { struct idletimer_tg *entry; - BUG_ON(!label); - list_for_each_entry(entry, &idletimer_tg_list, entry) { if (!strcmp(label, entry->attr.attr.name)) return entry; @@ -116,6 +114,22 @@ static void idletimer_tg_expired(struct timer_list *t) schedule_work(&timer->work); } +static int idletimer_check_sysfs_name(const char *name, unsigned int size) +{ + int ret; + + ret = xt_check_proc_name(name, size); + if (ret < 0) + return ret; + + if (!strcmp(name, "power") || + !strcmp(name, "subsystem") || + !strcmp(name, "uevent")) + return -EINVAL; + + return 0; +} + static int idletimer_tg_create(struct idletimer_tg_info *info) { int ret; @@ -126,6 +140,10 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) goto out; } + ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); + if (ret < 0) + goto out_free_timer; + sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { @@ -172,8 +190,6 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb, pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); - BUG_ON(!info->timer); - mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 4ad5fe27e08b..f16202d26c20 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -35,8 +35,6 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par) u32 secmark = 0; const struct xt_secmark_target_info *info = par->targinfo; - BUG_ON(info->mode != mode); - switch (mode) { case SECMARK_MODE_SEL: secmark = info->secid; diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 0d0d68c989df..1dae02a97ee3 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include #include @@ -25,8 +27,15 @@ struct xt_tee_priv { int oif; }; +static unsigned int tee_net_id __read_mostly; static const union nf_inet_addr tee_zero_address; +struct tee_net { + struct list_head priv_list; + /* lock protects the priv_list */ + struct mutex lock; +}; + static unsigned int tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) { @@ -51,17 +60,16 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) } #endif -static DEFINE_MUTEX(priv_list_mutex); -static LIST_HEAD(priv_list); - static int tee_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct tee_net *tn = net_generic(net, tee_net_id); struct xt_tee_priv *priv; - mutex_lock(&priv_list_mutex); - list_for_each_entry(priv, &priv_list, list) { + mutex_lock(&tn->lock); + list_for_each_entry(priv, &tn->priv_list, list) { switch (event) { case NETDEV_REGISTER: if (!strcmp(dev->name, priv->tginfo->oif)) @@ -79,13 +87,14 @@ static int tee_netdev_event(struct notifier_block *this, unsigned long event, break; } } - mutex_unlock(&priv_list_mutex); + mutex_unlock(&tn->lock); return NOTIFY_DONE; } static int tee_tg_check(const struct xt_tgchk_param *par) { + struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; struct xt_tee_priv *priv; @@ -95,6 +104,8 @@ static int tee_tg_check(const struct xt_tgchk_param *par) return -EINVAL; if (info->oif[0]) { + struct net_device *dev; + if (info->oif[sizeof(info->oif)-1] != '\0') return -EINVAL; @@ -106,9 +117,14 @@ static int tee_tg_check(const struct xt_tgchk_param *par) priv->oif = -1; info->priv = priv; - mutex_lock(&priv_list_mutex); - list_add(&priv->list, &priv_list); - mutex_unlock(&priv_list_mutex); + dev = dev_get_by_name(par->net, info->oif); + if (dev) { + priv->oif = dev->ifindex; + dev_put(dev); + } + mutex_lock(&tn->lock); + list_add(&priv->list, &tn->priv_list); + mutex_unlock(&tn->lock); } else info->priv = NULL; @@ -118,12 +134,13 @@ static int tee_tg_check(const struct xt_tgchk_param *par) static void tee_tg_destroy(const struct xt_tgdtor_param *par) { + struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; if (info->priv) { - mutex_lock(&priv_list_mutex); + mutex_lock(&tn->lock); list_del(&info->priv->list); - mutex_unlock(&priv_list_mutex); + mutex_unlock(&tn->lock); kfree(info->priv); } static_key_slow_dec(&xt_tee_enabled); @@ -156,6 +173,21 @@ static struct xt_target tee_tg_reg[] __read_mostly = { #endif }; +static int __net_init tee_net_init(struct net *net) +{ + struct tee_net *tn = net_generic(net, tee_net_id); + + INIT_LIST_HEAD(&tn->priv_list); + mutex_init(&tn->lock); + return 0; +} + +static struct pernet_operations tee_net_ops = { + .init = tee_net_init, + .id = &tee_net_id, + .size = sizeof(struct tee_net), +}; + static struct notifier_block tee_netdev_notifier = { .notifier_call = tee_netdev_event, }; @@ -164,22 +196,32 @@ static int __init tee_tg_init(void) { int ret; - ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); - if (ret) + ret = register_pernet_subsys(&tee_net_ops); + if (ret < 0) return ret; + + ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); + if (ret < 0) + goto cleanup_subsys; + ret = register_netdevice_notifier(&tee_netdev_notifier); - if (ret) { - xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); - return ret; - } + if (ret < 0) + goto unregister_targets; return 0; + +unregister_targets: + xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); +cleanup_subsys: + unregister_pernet_subsys(&tee_net_ops); + return ret; } static void __exit tee_tg_exit(void) { unregister_netdevice_notifier(&tee_netdev_notifier); xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); + unregister_pernet_subsys(&tee_net_ops); } module_init(tee_tg_init); diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 5d92e1781980..5cb1ecb29ea4 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -68,6 +68,38 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) return 0; } +static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) +{ + struct xt_cgroup_info_v2 *info = par->matchinfo; + struct cgroup *cgrp; + + if ((info->invert_path & ~1) || (info->invert_classid & ~1)) + return -EINVAL; + + if (!info->has_path && !info->has_classid) { + pr_info("xt_cgroup: no path or classid specified\n"); + return -EINVAL; + } + + if (info->has_path && info->has_classid) { + pr_info_ratelimited("path and classid specified\n"); + return -EINVAL; + } + + info->priv = NULL; + if (info->has_path) { + cgrp = cgroup_get_from_path(info->path); + if (IS_ERR(cgrp)) { + pr_info_ratelimited("invalid path, errno=%ld\n", + PTR_ERR(cgrp)); + return -EINVAL; + } + info->priv = cgrp; + } + + return 0; +} + static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { @@ -99,6 +131,24 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) info->invert_classid; } +static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_cgroup_info_v2 *info = par->matchinfo; + struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data; + struct cgroup *ancestor = info->priv; + struct sock *sk = skb->sk; + + if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) + return false; + + if (ancestor) + return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ + info->invert_path; + else + return (info->classid == sock_cgroup_classid(skcd)) ^ + info->invert_classid; +} + static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) { struct xt_cgroup_info_v1 *info = par->matchinfo; @@ -107,6 +157,14 @@ static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) cgroup_put(info->priv); } +static void cgroup_mt_destroy_v2(const struct xt_mtdtor_param *par) +{ + struct xt_cgroup_info_v2 *info = par->matchinfo; + + if (info->priv) + cgroup_put(info->priv); +} + static struct xt_match cgroup_mt_reg[] __read_mostly = { { .name = "cgroup", @@ -134,6 +192,20 @@ static struct xt_match cgroup_mt_reg[] __read_mostly = { (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), }, + { + .name = "cgroup", + .revision = 2, + .family = NFPROTO_UNSPEC, + .checkentry = cgroup_mt_check_v2, + .match = cgroup_mt_v2, + .matchsize = sizeof(struct xt_cgroup_info_v2), + .usersize = offsetof(struct xt_cgroup_info_v2, priv), + .destroy = cgroup_mt_destroy_v2, + .me = THIS_MODULE, + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), + }, }; static int __init cgroup_mt_init(void) diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c index 8af9707f8789..ac91170fc8c8 100644 --- a/net/netfilter/xt_nat.c +++ b/net/netfilter/xt_nat.c @@ -216,6 +216,8 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = { { .name = "DNAT", .revision = 2, + .checkentry = xt_nat_checkentry, + .destroy = xt_nat_destroy, .target = xt_dnat_target_v2, .targetsize = sizeof(struct nf_nat_range2), .table = "nat", diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index bf7bba80e24c..7a103553d10d 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -40,14 +40,8 @@ static bool xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) { - const struct xt_osf_info *info = p->matchinfo; - struct net *net = xt_net(p); - - if (!info) - return false; - return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p), - xt_out(p), info, net, nf_osf_fingers); + xt_out(p), p->matchinfo, xt_net(p), nf_osf_fingers); } static struct xt_match xt_osf_match = { diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 0472f3472842..ada144e5645b 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -56,7 +56,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; - if (!net_eq(xt_net(par), sock_net(sk))) + if (sk && !net_eq(xt_net(par), sock_net(sk))) sk = NULL; if (!sk) @@ -117,7 +117,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; - if (!net_eq(xt_net(par), sock_net(sk))) + if (sk && !net_eq(xt_net(par), sock_net(sk))) sk = NULL; if (!sk) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 930d17fa906c..6bb9f3cde0b0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -574,11 +574,6 @@ static int netlink_insert(struct sock *sk, u32 portid) if (nlk_sk(sk)->bound) goto err; - err = -ENOMEM; - if (BITS_PER_LONG > 32 && - unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX)) - goto err; - nlk_sk(sk)->portid = portid; sock_hold(sk); @@ -993,7 +988,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err = 0; - long unsigned int groups = nladdr->nl_groups; + unsigned long groups = nladdr->nl_groups; bool bound; if (addr_len < sizeof(struct sockaddr_nl)) @@ -1011,9 +1006,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return err; } - if (nlk->ngroups == 0) - groups = 0; - else if (nlk->ngroups < 8*sizeof(groups)) + if (nlk->ngroups < BITS_PER_LONG) groups &= (1UL << nlk->ngroups) - 1; bound = nlk->bound; @@ -1713,6 +1706,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, nlk->flags &= ~NETLINK_F_EXT_ACK; err = 0; break; + case NETLINK_DUMP_STRICT_CHK: + if (val) + nlk->flags |= NETLINK_F_STRICT_CHK; + else + nlk->flags &= ~NETLINK_F_STRICT_CHK; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1806,6 +1806,15 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; err = 0; break; + case NETLINK_DUMP_STRICT_CHK: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_F_STRICT_CHK ? 1 : 0; + if (put_user(len, optlen) || put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2178,6 +2187,7 @@ EXPORT_SYMBOL(__nlmsg_put); static int netlink_dump(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ext_ack extack = {}; struct netlink_callback *cb; struct sk_buff *skb = NULL; struct nlmsghdr *nlh; @@ -2229,8 +2239,11 @@ static int netlink_dump(struct sock *sk) skb_reserve(skb, skb_tailroom(skb) - alloc_size); netlink_skb_set_owner_r(skb, sk); - if (nlk->dump_done_errno > 0) + if (nlk->dump_done_errno > 0) { + cb->extack = &extack; nlk->dump_done_errno = cb->dump(skb, cb); + cb->extack = NULL; + } if (nlk->dump_done_errno > 0 || skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) { @@ -2244,7 +2257,8 @@ static int netlink_dump(struct sock *sk) } nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, - sizeof(nlk->dump_done_errno), NLM_F_MULTI); + sizeof(nlk->dump_done_errno), + NLM_F_MULTI | cb->answer_flags); if (WARN_ON(!nlh)) goto errout_skb; @@ -2253,6 +2267,12 @@ static int netlink_dump(struct sock *sk) memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno)); + if (extack._msg && nlk->flags & NETLINK_F_EXT_ACK) { + nlh->nlmsg_flags |= NLM_F_ACK_TLVS; + if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack._msg)) + nlmsg_end(skb, nlh); + } + if (sk_filter(sk, skb)) kfree_skb(skb); else @@ -2279,9 +2299,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { + struct netlink_sock *nlk, *nlk2; struct netlink_callback *cb; struct sock *sk; - struct netlink_sock *nlk; int ret; refcount_inc(&skb->users); @@ -2315,6 +2335,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->min_dump_alloc = control->min_dump_alloc; cb->skb = skb; + nlk2 = nlk_sk(NETLINK_CB(skb).sk); + cb->strict_check = !!(nlk2->flags & NETLINK_F_STRICT_CHK); + if (control->start) { ret = control->start(cb); if (ret) diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 962de7b3c023..5f454c8de6a4 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -15,6 +15,7 @@ #define NETLINK_F_LISTEN_ALL_NSID 0x10 #define NETLINK_F_CAP_ACK 0x20 #define NETLINK_F_EXT_ACK 0x40 +#define NETLINK_F_STRICT_CHK 0x80 #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index dd4adf8b1167..ae296273ce3d 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -556,7 +556,7 @@ static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, pr_debug("%p\n", sk); - sock_poll_wait(file, wait); + sock_poll_wait(file, sock, wait); if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index a66f102c6c01..78fe622eba65 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -192,10 +192,8 @@ static void nci_uart_tty_close(struct tty_struct *tty) if (!nu) return; - if (nu->tx_skb) - kfree_skb(nu->tx_skb); - if (nu->rx_skb) - kfree_skb(nu->rx_skb); + kfree_skb(nu->tx_skb); + kfree_skb(nu->rx_skb); skb_queue_purge(&nu->tx_q); @@ -465,6 +463,7 @@ static struct tty_ldisc_ops nci_uart_ldisc = { .receive_buf = nci_uart_tty_receive, .write_wakeup = nci_uart_tty_wakeup, .ioctl = nci_uart_tty_ioctl, + .compat_ioctl = nci_uart_tty_ioctl, }; static int __init nci_uart_init(void) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 86a75105af1a..a4660c48ff01 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -933,6 +933,11 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, struct nf_conn *ct; if (!cached) { + struct nf_hook_state state = { + .hook = NF_INET_PRE_ROUTING, + .pf = info->family, + .net = net, + }; struct nf_conn *tmpl = info->ct; int err; @@ -944,8 +949,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, nf_ct_set(skb, tmpl, IP_CT_NEW); } - err = nf_conntrack_in(net, info->family, - NF_INET_PRE_ROUTING, skb); + err = nf_conntrack_in(skb, &state); if (err != NF_ACCEPT) return -ENOENT; @@ -1199,7 +1203,8 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, &info->labels.mask); if (err) return err; - } else if (labels_nonzero(&info->labels.mask)) { + } else if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + labels_nonzero(&info->labels.mask)) { err = ovs_ct_set_labels(ct, key, &info->labels.value, &info->labels.mask); if (err) @@ -1312,6 +1317,10 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, rcu_assign_pointer(help->helper, helper); info->helper = helper; + + if (info->nat) + request_module("ip_nat_%s", name); + return 0; } @@ -1624,10 +1633,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, OVS_NLERR(log, "Failed to allocate conntrack template"); return -ENOMEM; } - - __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); - nf_conntrack_get(&ct_info.ct->ct_general); - if (helper) { err = ovs_ct_add_helper(&ct_info, helper, key, log); if (err) @@ -1639,6 +1644,8 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, if (err) goto err_free_ct; + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + nf_conntrack_get(&ct_info.ct->ct_general); return 0; err_free_ct: __ovs_ct_free_action(&ct_info); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 0f5ce77460d4..6679e96ab1dc 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1182,14 +1182,14 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, - OVS_FLOW_CMD_NEW, + OVS_FLOW_CMD_SET, ufid_flags); BUG_ON(error < 0); } } else { /* Could not alloc without acts before locking. */ reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, - info, OVS_FLOW_CMD_NEW, false, + info, OVS_FLOW_CMD_SET, false, ufid_flags); if (IS_ERR(reply)) { @@ -1265,7 +1265,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) } reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, - OVS_FLOW_CMD_NEW, true, ufid_flags); + OVS_FLOW_CMD_GET, true, ufid_flags); if (IS_ERR(reply)) { err = PTR_ERR(reply); goto unlock; @@ -1389,7 +1389,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - OVS_FLOW_CMD_NEW, ufid_flags) < 0) + OVS_FLOW_CMD_GET, ufid_flags) < 0) break; cb->args[0] = bucket; @@ -1730,7 +1730,7 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) ovs_dp_change(dp, info->attrs); err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, - info->snd_seq, 0, OVS_DP_CMD_NEW); + info->snd_seq, 0, OVS_DP_CMD_SET); BUG_ON(err < 0); ovs_unlock(); @@ -1761,7 +1761,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) goto err_unlock_free; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, - info->snd_seq, 0, OVS_DP_CMD_NEW); + info->snd_seq, 0, OVS_DP_CMD_GET); BUG_ON(err < 0); ovs_unlock(); @@ -1785,7 +1785,7 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) if (i >= skip && ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - OVS_DP_CMD_NEW) < 0) + OVS_DP_CMD_GET) < 0) break; i++; } @@ -2101,7 +2101,7 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_NEW); + OVS_VPORT_CMD_SET); BUG_ON(err < 0); ovs_unlock(); @@ -2182,7 +2182,7 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) goto exit_unlock_free; err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_NEW); + OVS_VPORT_CMD_GET); BUG_ON(err < 0); rcu_read_unlock(); @@ -2218,7 +2218,7 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - OVS_VPORT_CMD_NEW) < 0) + OVS_VPORT_CMD_GET) < 0) goto out; j++; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 56b8e7167790..35966da84769 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -254,21 +254,18 @@ static bool icmphdr_ok(struct sk_buff *skb) static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) { + unsigned short frag_off; + unsigned int payload_ofs = 0; unsigned int nh_ofs = skb_network_offset(skb); unsigned int nh_len; - int payload_ofs; struct ipv6hdr *nh; - uint8_t nexthdr; - __be16 frag_off; - int err; + int err, nexthdr, flags = 0; err = check_header(skb, nh_ofs + sizeof(*nh)); if (unlikely(err)) return err; nh = ipv6_hdr(skb); - nexthdr = nh->nexthdr; - payload_ofs = (u8 *)(nh + 1) - skb->data; key->ip.proto = NEXTHDR_NONE; key->ip.tos = ipv6_get_dsfield(nh); @@ -277,10 +274,9 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) key->ipv6.addr.src = nh->saddr; key->ipv6.addr.dst = nh->daddr; - payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); - - if (frag_off) { - if (frag_off & htons(~0x7)) + nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags); + if (flags & IP6_FH_F_FRAG) { + if (frag_off) key->ip.frag = OVS_FRAG_TYPE_LATER; else key->ip.frag = OVS_FRAG_TYPE_FIRST; @@ -288,11 +284,11 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) key->ip.frag = OVS_FRAG_TYPE_NONE; } - /* Delayed handling of error in ipv6_skip_exthdr() as it - * always sets frag_off to a valid value which may be + /* Delayed handling of error in ipv6_find_hdr() as it + * always sets flags and frag_off to a valid value which may be * used to set key->ip.frag above. */ - if (unlikely(payload_ofs < 0)) + if (unlikely(nexthdr < 0)) return -EPROTO; nh_len = payload_ofs - nh_ofs; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index a70097ecf33c..865ecef68196 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -3030,7 +3030,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, * is already present */ if (mac_proto != MAC_PROTO_NONE) return -EINVAL; - mac_proto = MAC_PROTO_NONE; + mac_proto = MAC_PROTO_ETHERNET; break; case OVS_ACTION_ATTR_POP_ETH: @@ -3038,7 +3038,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return -EINVAL; if (vlan_tci & htons(VLAN_TAG_PRESENT)) return -EINVAL; - mac_proto = MAC_PROTO_ETHERNET; + mac_proto = MAC_PROTO_NONE; break; case OVS_ACTION_ATTR_PUSH_NSH: diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index bb95c43aae76..26f71cbf7527 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -43,7 +43,8 @@ static struct internal_dev *internal_dev_priv(struct net_device *netdev) } /* Called with rcu_read_lock_bh. */ -static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) +static netdev_tx_t +internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) { int len, err; @@ -62,7 +63,7 @@ static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) } else { netdev->stats.tx_errors++; } - return 0; + return NETDEV_TX_OK; } static int internal_dev_open(struct net_device *netdev) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 75c92a87e7b2..ec3095f13aae 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2715,10 +2715,12 @@ tpacket_error: } } - if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr, - vio_le())) { - tp_len = -EINVAL; - goto tpacket_error; + if (po->has_vnet_hdr) { + if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { + tp_len = -EINVAL; + goto tpacket_error; + } + virtio_net_hdr_set_proto(skb, vnet_hdr); } skb->destructor = tpacket_destruct_skb; @@ -2915,6 +2917,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (err) goto out_free; len += sizeof(vnet_hdr); + virtio_net_hdr_set_proto(skb, &vnet_hdr); } skb_probe_transport_header(skb, reserve); @@ -3805,6 +3808,20 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv return fanout_set_data(po, optval, optlen); } + case PACKET_IGNORE_OUTGOING: + { + int val; + + if (optlen != sizeof(val)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + if (val < 0 || val > 1) + return -EINVAL; + + po->prot_hook.ignore_outgoing = !!val; + return 0; + } case PACKET_TX_HAS_OFF: { unsigned int val; @@ -3928,6 +3945,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, ((u32)po->fanout->flags << 24)) : 0); break; + case PACKET_IGNORE_OUTGOING: + val = po->prot_hook.ignore_outgoing; + break; case PACKET_ROLLOVER_STATS: if (!po->rollover) return -EINVAL; diff --git a/net/rds/rds.h b/net/rds/rds.h index c4dcf654d8fe..6bfaf05b63b2 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -278,7 +278,7 @@ struct rds_incoming { struct in6_addr i_saddr; rds_rdma_cookie_t i_rdma_cookie; - struct timeval i_rx_tstamp; + ktime_t i_rx_tstamp; u64 i_rx_lat_trace[RDS_RX_MAX_TRACES]; }; diff --git a/net/rds/recv.c b/net/rds/recv.c index 504cd6bcc54c..727639dac8a7 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -43,18 +43,14 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, struct in6_addr *saddr) { - int i; - refcount_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = conn; inc->i_saddr = *saddr; inc->i_rdma_cookie = 0; - inc->i_rx_tstamp.tv_sec = 0; - inc->i_rx_tstamp.tv_usec = 0; + inc->i_rx_tstamp = ktime_set(0, 0); - for (i = 0; i < RDS_RX_MAX_TRACES; i++) - inc->i_rx_lat_trace[i] = 0; + memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace)); } EXPORT_SYMBOL_GPL(rds_inc_init); @@ -67,8 +63,7 @@ void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, inc->i_conn_path = cp; inc->i_saddr = *saddr; inc->i_rdma_cookie = 0; - inc->i_rx_tstamp.tv_sec = 0; - inc->i_rx_tstamp.tv_usec = 0; + inc->i_rx_tstamp = ktime_set(0, 0); } EXPORT_SYMBOL_GPL(rds_inc_path_init); @@ -385,7 +380,7 @@ void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); if (sock_flag(sk, SOCK_RCVTSTAMP)) - do_gettimeofday(&inc->i_rx_tstamp); + inc->i_rx_tstamp = ktime_get_real(); rds_inc_addref(inc); inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock(); list_add_tail(&inc->i_item, &rs->rs_recv_queue); @@ -552,11 +547,11 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, goto out; } - if ((inc->i_rx_tstamp.tv_sec != 0) && + if ((inc->i_rx_tstamp != 0) && sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) { + struct timeval tv = ktime_to_timeval(inc->i_rx_tstamp); ret = put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, - sizeof(struct timeval), - &inc->i_rx_tstamp); + sizeof(tv), &tv); if (ret) goto out; } diff --git a/net/rds/send.c b/net/rds/send.c index 57b3d5a8b2db..fe785ee819dd 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1007,7 +1007,8 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, return ret; } -static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn) +static int rds_send_mprds_hash(struct rds_sock *rs, + struct rds_connection *conn, int nonblock) { int hash; @@ -1023,10 +1024,16 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn) * used. But if we are interrupted, we have to use the zero * c_path in case the connection ends up being non-MP capable. */ - if (conn->c_npaths == 0) + if (conn->c_npaths == 0) { + /* Cannot wait for the connection be made, so just use + * the base c_path. + */ + if (nonblock) + return 0; if (wait_event_interruptible(conn->c_hs_waitq, conn->c_npaths != 0)) hash = 0; + } if (conn->c_npaths == 1) hash = 0; } @@ -1256,7 +1263,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) } if (conn->c_trans->t_mp_capable) - cpath = &conn->c_path[rds_send_mprds_hash(rs, conn)]; + cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)]; else cpath = &conn->c_path[0]; diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 1355f5ca8d22..abca57040f37 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -510,8 +510,8 @@ void rfkill_remove_epo_lock(void) /** * rfkill_is_epo_lock_active - returns true EPO is active * - * Returns 0 (false) if there is NOT an active EPO contidion, - * and 1 (true) if there is an active EPO contition, which + * Returns 0 (false) if there is NOT an active EPO condition, + * and 1 (true) if there is an active EPO condition, which * locks all radios in one of the BLOCKED states. * * Can be called in atomic context. diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index ac44d8afffb1..64362d078da8 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -97,7 +97,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, srx->transport_len > len) return -EINVAL; - if (srx->transport.family != rx->family) + if (srx->transport.family != rx->family && + srx->transport.family == AF_INET && rx->family != AF_INET6) return -EAFNOSUPPORT; switch (srx->transport.family) { @@ -384,6 +385,20 @@ u32 rxrpc_kernel_check_life(struct socket *sock, struct rxrpc_call *call) } EXPORT_SYMBOL(rxrpc_kernel_check_life); +/** + * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call. + * @sock: The socket the call is on + * @call: The call to query + * + * Allow a kernel service to retrieve the epoch value from a service call to + * see if the client at the other end rebooted. + */ +u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call) +{ + return call->conn->proto.epoch; +} +EXPORT_SYMBOL(rxrpc_kernel_get_epoch); + /** * rxrpc_kernel_check_call - Check a call's state * @sock: The socket the call is on @@ -741,7 +756,7 @@ static __poll_t rxrpc_poll(struct file *file, struct socket *sock, struct rxrpc_sock *rx = rxrpc_sk(sk); __poll_t mask; - sock_poll_wait(file, wait); + sock_poll_wait(file, sock, wait); mask = 0; /* the socket is readable if there are any messages waiting on the Rx diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c97558710421..bc628acf4f4f 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -40,17 +40,12 @@ struct rxrpc_crypt { struct rxrpc_connection; /* - * Mark applied to socket buffers. + * Mark applied to socket buffers in skb->mark. skb->priority is used + * to pass supplementary information. */ enum rxrpc_skb_mark { - RXRPC_SKB_MARK_DATA, /* data message */ - RXRPC_SKB_MARK_FINAL_ACK, /* final ACK received message */ - RXRPC_SKB_MARK_BUSY, /* server busy message */ - RXRPC_SKB_MARK_REMOTE_ABORT, /* remote abort message */ - RXRPC_SKB_MARK_LOCAL_ABORT, /* local abort message */ - RXRPC_SKB_MARK_NET_ERROR, /* network error message */ - RXRPC_SKB_MARK_LOCAL_ERROR, /* local error message */ - RXRPC_SKB_MARK_NEW_CALL, /* local error message */ + RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ + RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ }; /* @@ -293,7 +288,6 @@ struct rxrpc_peer { struct hlist_node hash_link; struct rxrpc_local *local; struct hlist_head error_targets; /* targets for net error distribution */ - struct work_struct error_distributor; struct rb_root service_conns; /* Service connections */ struct list_head keepalive_link; /* Link in net->peer_keepalive[] */ time64_t last_tx_at; /* Last time packet sent here */ @@ -304,12 +298,11 @@ struct rxrpc_peer { unsigned int maxdata; /* data size (MTU - hdrsize) */ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ int debug_id; /* debug ID for printks */ - int error_report; /* Net (+0) or local (+1000000) to distribute */ -#define RXRPC_LOCAL_ERROR_OFFSET 1000000 struct sockaddr_rxrpc srx; /* remote address */ /* calculated RTT cache */ #define RXRPC_RTT_CACHE_SIZE 32 + spinlock_t rtt_input_lock; /* RTT lock for input routine */ ktime_t rtt_last_req; /* Time of last RTT request */ u64 rtt; /* Current RTT estimate (in nS) */ u64 rtt_sum; /* Sum of cache contents */ @@ -442,7 +435,7 @@ struct rxrpc_connection { struct sk_buff_head rx_queue; /* received conn-level packets */ const struct rxrpc_security *security; /* applied security module */ struct key *server_key; /* security for this service */ - struct crypto_skcipher *cipher; /* encryption handle */ + struct crypto_sync_skcipher *cipher; /* encryption handle */ struct rxrpc_crypt csum_iv; /* packet checksum base */ unsigned long flags; unsigned long events; @@ -450,19 +443,29 @@ struct rxrpc_connection { spinlock_t state_lock; /* state-change lock */ enum rxrpc_conn_cache_state cache_state; enum rxrpc_conn_proto_state state; /* current state of connection */ - u32 local_abort; /* local abort code */ - u32 remote_abort; /* remote abort code */ + u32 abort_code; /* Abort code of connection abort */ int debug_id; /* debug ID for printks */ atomic_t serial; /* packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ u32 security_nonce; /* response re-use preventer */ - u16 service_id; /* Service ID, possibly upgraded */ + u32 service_id; /* Service ID, possibly upgraded */ u8 size_align; /* data size alignment (for security) */ u8 security_size; /* security header size */ u8 security_ix; /* security type */ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ + short error; /* Local error code */ }; +static inline bool rxrpc_to_server(const struct rxrpc_skb_priv *sp) +{ + return sp->hdr.flags & RXRPC_CLIENT_INITIATED; +} + +static inline bool rxrpc_to_client(const struct rxrpc_skb_priv *sp) +{ + return !rxrpc_to_server(sp); +} + /* * Flags in call->flags. */ @@ -608,6 +611,7 @@ struct rxrpc_call { * not hard-ACK'd packet follows this. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ + u16 tx_backoff; /* Delay to insert due to Tx failure */ /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS * is fixed, we keep these numbers in terms of segments (ie. DATA @@ -633,6 +637,8 @@ struct rxrpc_call { bool tx_phase; /* T if transmission phase, F if receive phase */ u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */ + spinlock_t input_lock; /* Lock for packet input to this call */ + /* receive-phase ACK management */ u8 ackr_reason; /* reason to ACK */ u16 ackr_skew; /* skew on packet being ACK'd */ @@ -717,7 +723,7 @@ extern struct workqueue_struct *rxrpc_workqueue; int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *, - struct rxrpc_connection *, + struct rxrpc_sock *, struct sk_buff *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long, @@ -887,8 +893,9 @@ extern unsigned long rxrpc_conn_idle_client_fast_expiry; extern struct idr rxrpc_client_conn_ids; void rxrpc_destroy_client_conn_ids(void); -int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *, - struct sockaddr_rxrpc *, gfp_t); +int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_call *, + struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, + gfp_t); void rxrpc_expose_client_call(struct rxrpc_call *); void rxrpc_disconnect_client_call(struct rxrpc_call *); void rxrpc_put_client_conn(struct rxrpc_connection *); @@ -908,7 +915,8 @@ extern unsigned int rxrpc_closed_conn_expiry; struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, - struct sk_buff *); + struct sk_buff *, + struct rxrpc_peer **); void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_kill_connection(struct rxrpc_connection *); @@ -960,7 +968,7 @@ void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* * input.c */ -void rxrpc_data_ready(struct sock *); +int rxrpc_input_packet(struct sock *, struct sk_buff *); /* * insecure.c @@ -1031,7 +1039,6 @@ void rxrpc_send_keepalive(struct rxrpc_peer *); * peer_event.c */ void rxrpc_error_report(struct sock *); -void rxrpc_peer_error_distributor(struct work_struct *); void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); void rxrpc_peer_keepalive_worker(struct work_struct *); @@ -1041,22 +1048,22 @@ void rxrpc_peer_keepalive_worker(struct work_struct *); */ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, const struct sockaddr_rxrpc *); -struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *, +struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *, struct rxrpc_local *, struct sockaddr_rxrpc *, gfp_t); struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); -struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *, - struct rxrpc_peer *); +void rxrpc_new_incoming_peer(struct rxrpc_sock *, struct rxrpc_local *, + struct rxrpc_peer *); void rxrpc_destroy_all_peers(struct rxrpc_net *); struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *); struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *); void rxrpc_put_peer(struct rxrpc_peer *); -void __rxrpc_queue_peer_error(struct rxrpc_peer *); /* * proc.c */ extern const struct seq_operations rxrpc_call_seq_ops; extern const struct seq_operations rxrpc_connection_seq_ops; +extern const struct seq_operations rxrpc_peer_seq_ops; /* * recvmsg.c @@ -1093,7 +1100,6 @@ void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace); -void rxrpc_lose_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_purge_queue(struct sk_buff_head *); /* @@ -1110,8 +1116,7 @@ static inline void rxrpc_sysctl_exit(void) {} /* * utils.c */ -int rxrpc_extract_addr_from_skb(struct rxrpc_local *, struct sockaddr_rxrpc *, - struct sk_buff *); +int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); static inline bool before(u32 seq1, u32 seq2) { diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 9d1e298b784c..44860505246d 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -249,11 +249,11 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) */ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_local *local, + struct rxrpc_peer *peer, struct rxrpc_connection *conn, struct sk_buff *skb) { struct rxrpc_backlog *b = rx->backlog; - struct rxrpc_peer *peer, *xpeer; struct rxrpc_call *call; unsigned short call_head, conn_head, peer_head; unsigned short call_tail, conn_tail, peer_tail; @@ -276,21 +276,18 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, return NULL; if (!conn) { - /* No connection. We're going to need a peer to start off - * with. If one doesn't yet exist, use a spare from the - * preallocation set. We dump the address into the spare in - * anticipation - and to save on stack space. - */ - xpeer = b->peer_backlog[peer_tail]; - if (rxrpc_extract_addr_from_skb(local, &xpeer->srx, skb) < 0) - return NULL; - - peer = rxrpc_lookup_incoming_peer(local, xpeer); - if (peer == xpeer) { + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) { + peer = b->peer_backlog[peer_tail]; + if (rxrpc_extract_addr_from_skb(&peer->srx, skb) < 0) + return NULL; b->peer_backlog[peer_tail] = NULL; smp_store_release(&b->peer_backlog_tail, (peer_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); + + rxrpc_new_incoming_peer(rx, local, peer); } /* Now allocate and set up the connection */ @@ -335,45 +332,38 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, * The call is returned with the user access mutex held. */ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, - struct rxrpc_connection *conn, + struct rxrpc_sock *rx, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_sock *rx; + struct rxrpc_connection *conn; + struct rxrpc_peer *peer = NULL; struct rxrpc_call *call; - u16 service_id = sp->hdr.serviceId; _enter(""); - /* Get the socket providing the service */ - rx = rcu_dereference(local->service); - if (rx && (service_id == rx->srx.srx_service || - service_id == rx->second_service)) - goto found_service; - - trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_INVALID_OPERATION, EOPNOTSUPP); - skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; - skb->priority = RX_INVALID_OPERATION; - _leave(" = NULL [service]"); - return NULL; - -found_service: spin_lock(&rx->incoming_lock); if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED || rx->sk.sk_state == RXRPC_CLOSE) { trace_rxrpc_abort(0, "CLS", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); - skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; skb->priority = RX_INVALID_OPERATION; _leave(" = NULL [close]"); call = NULL; goto out; } - call = rxrpc_alloc_incoming_call(rx, local, conn, skb); + /* The peer, connection and call may all have sprung into existence due + * to a duplicate packet being handled on another CPU in parallel, so + * we have to recheck the routing. However, we're now holding + * rx->incoming_lock, so the values should remain stable. + */ + conn = rxrpc_find_connection_rcu(local, skb, &peer); + + call = rxrpc_alloc_incoming_call(rx, local, peer, conn, skb); if (!call) { - skb->mark = RXRPC_SKB_MARK_BUSY; + skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; _leave(" = NULL [busy]"); call = NULL; goto out; @@ -413,20 +403,22 @@ found_service: case RXRPC_CONN_SERVICE: write_lock(&call->state_lock); - if (rx->discard_new_call) - call->state = RXRPC_CALL_SERVER_RECV_REQUEST; - else - call->state = RXRPC_CALL_SERVER_ACCEPTING; + if (call->state < RXRPC_CALL_COMPLETE) { + if (rx->discard_new_call) + call->state = RXRPC_CALL_SERVER_RECV_REQUEST; + else + call->state = RXRPC_CALL_SERVER_ACCEPTING; + } write_unlock(&call->state_lock); break; case RXRPC_CONN_REMOTELY_ABORTED: rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, - conn->remote_abort, -ECONNABORTED); + conn->abort_code, conn->error); break; case RXRPC_CONN_LOCALLY_ABORTED: rxrpc_abort_call("CON", call, sp->hdr.seq, - conn->local_abort, -ECONNABORTED); + conn->abort_code, conn->error); break; default: BUG(); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 8e7434e92097..468efc3660c0 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -123,6 +123,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, else ack_at = expiry; + ack_at += READ_ONCE(call->tx_backoff); ack_at += now; if (time_before(ack_at, call->ack_at)) { WRITE_ONCE(call->ack_at, ack_at); @@ -311,6 +312,7 @@ void rxrpc_process_call(struct work_struct *work) container_of(work, struct rxrpc_call, processor); rxrpc_serial_t *send_ack; unsigned long now, next, t; + unsigned int iterations = 0; rxrpc_see_call(call); @@ -319,6 +321,11 @@ void rxrpc_process_call(struct work_struct *work) call->debug_id, rxrpc_call_states[call->state], call->events); recheck_state: + /* Limit the number of times we do this before returning to the manager */ + iterations++; + if (iterations > 5) + goto requeue; + if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) { rxrpc_send_abort_packet(call); goto recheck_state; @@ -447,13 +454,16 @@ recheck_state: rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart); /* other events may have been raised since we started checking */ - if (call->events && call->state < RXRPC_CALL_COMPLETE) { - __rxrpc_queue_call(call); - goto out; - } + if (call->events && call->state < RXRPC_CALL_COMPLETE) + goto requeue; out_put: rxrpc_put_call(call, rxrpc_call_put); out: _leave(""); + return; + +requeue: + __rxrpc_queue_call(call); + goto out; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 9486293fef5c..8f1a8f85b1f9 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -138,6 +138,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, init_waitqueue_head(&call->waitq); spin_lock_init(&call->lock); spin_lock_init(&call->notify_lock); + spin_lock_init(&call->input_lock); rwlock_init(&call->state_lock); atomic_set(&call->usage, 1); call->debug_id = debug_id; @@ -287,7 +288,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, /* Set up or get a connection record and set the protocol parameters, * including channel number and call ID. */ - ret = rxrpc_connect_call(call, cp, srx, gfp); + ret = rxrpc_connect_call(rx, call, cp, srx, gfp); if (ret < 0) goto error; @@ -339,7 +340,7 @@ int rxrpc_retry_client_call(struct rxrpc_sock *rx, /* Set up or get a connection record and set the protocol parameters, * including channel number and call ID. */ - ret = rxrpc_connect_call(call, cp, srx, gfp); + ret = rxrpc_connect_call(rx, call, cp, srx, gfp); if (ret < 0) goto error; @@ -400,7 +401,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, rcu_assign_pointer(conn->channels[chan].call, call); spin_lock(&conn->params.peer->lock); - hlist_add_head(&call->error_link, &conn->params.peer->error_targets); + hlist_add_head_rcu(&call->error_link, &conn->params.peer->error_targets); spin_unlock(&conn->params.peer->lock); _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index f8f37188a932..521189f4b666 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -276,7 +276,8 @@ dont_reuse: * If we return with a connection, the call will be on its waiting list. It's * left to the caller to assign a channel and wake up the call. */ -static int rxrpc_get_client_conn(struct rxrpc_call *call, +static int rxrpc_get_client_conn(struct rxrpc_sock *rx, + struct rxrpc_call *call, struct rxrpc_conn_parameters *cp, struct sockaddr_rxrpc *srx, gfp_t gfp) @@ -289,7 +290,7 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call, _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp); + cp->peer = rxrpc_lookup_peer(rx, cp->local, srx, gfp); if (!cp->peer) goto error; @@ -683,7 +684,8 @@ out: * find a connection for a call * - called in process context with IRQs enabled */ -int rxrpc_connect_call(struct rxrpc_call *call, +int rxrpc_connect_call(struct rxrpc_sock *rx, + struct rxrpc_call *call, struct rxrpc_conn_parameters *cp, struct sockaddr_rxrpc *srx, gfp_t gfp) @@ -696,7 +698,7 @@ int rxrpc_connect_call(struct rxrpc_call *call, rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper); rxrpc_cull_active_client_conns(rxnet); - ret = rxrpc_get_client_conn(call, cp, srx, gfp); + ret = rxrpc_get_client_conn(rx, call, cp, srx, gfp); if (ret < 0) goto out; @@ -710,8 +712,8 @@ int rxrpc_connect_call(struct rxrpc_call *call, } spin_lock_bh(&call->conn->params.peer->lock); - hlist_add_head(&call->error_link, - &call->conn->params.peer->error_targets); + hlist_add_head_rcu(&call->error_link, + &call->conn->params.peer->error_targets); spin_unlock_bh(&call->conn->params.peer->lock); out: diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 6df56ce68861..b6fca8ebb117 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -126,7 +126,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, switch (chan->last_type) { case RXRPC_PACKET_TYPE_ABORT: - _proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort); + _proto("Tx ABORT %%%u { %d } [re]", serial, conn->abort_code); break; case RXRPC_PACKET_TYPE_ACK: trace_rxrpc_tx_ack(chan->call_debug_id, serial, @@ -153,13 +153,12 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, * pass a connection-level abort onto all calls on that connection */ static void rxrpc_abort_calls(struct rxrpc_connection *conn, - enum rxrpc_call_completion compl, - u32 abort_code, int error) + enum rxrpc_call_completion compl) { struct rxrpc_call *call; int i; - _enter("{%d},%x", conn->debug_id, abort_code); + _enter("{%d},%x", conn->debug_id, conn->abort_code); spin_lock(&conn->channel_lock); @@ -172,9 +171,11 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, trace_rxrpc_abort(call->debug_id, "CON", call->cid, call->call_id, 0, - abort_code, error); + conn->abort_code, + conn->error); if (rxrpc_set_call_completion(call, compl, - abort_code, error)) + conn->abort_code, + conn->error)) rxrpc_notify_socket(call); } } @@ -207,10 +208,12 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, return 0; } + conn->error = error; + conn->abort_code = abort_code; conn->state = RXRPC_CONN_LOCALLY_ABORTED; spin_unlock_bh(&conn->state_lock); - rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code, error); + rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED); msg.msg_name = &conn->params.peer->srx.transport; msg.msg_namelen = conn->params.peer->srx.transport_len; @@ -229,7 +232,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, whdr._rsvd = 0; whdr.serviceId = htons(conn->service_id); - word = htonl(conn->local_abort); + word = htonl(conn->abort_code); iov[0].iov_base = &whdr; iov[0].iov_len = sizeof(whdr); @@ -240,7 +243,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, serial = atomic_inc_return(&conn->serial); whdr.serial = htonl(serial); - _proto("Tx CONN ABORT %%%u { %d }", serial, conn->local_abort); + _proto("Tx CONN ABORT %%%u { %d }", serial, conn->abort_code); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); if (ret < 0) { @@ -315,9 +318,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, abort_code = ntohl(wtmp); _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code); + conn->error = -ECONNABORTED; + conn->abort_code = abort_code; conn->state = RXRPC_CONN_REMOTELY_ABORTED; - rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, - abort_code, -ECONNABORTED); + rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED); return -ECONNABORTED; case RXRPC_PACKET_TYPE_CHALLENGE: diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 77440a356b14..c332722820c2 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -69,10 +69,14 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) * If successful, a pointer to the connection is returned, but no ref is taken. * NULL is returned if there is no match. * + * When searching for a service call, if we find a peer but no connection, we + * return that through *_peer in case we need to create a new service call. + * * The caller must be holding the RCU read lock. */ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, - struct sk_buff *skb) + struct sk_buff *skb, + struct rxrpc_peer **_peer) { struct rxrpc_connection *conn; struct rxrpc_conn_proto k; @@ -82,14 +86,12 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, _enter(",%x", sp->hdr.cid & RXRPC_CIDMASK); - if (rxrpc_extract_addr_from_skb(local, &srx, skb) < 0) + if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) goto not_found; - k.epoch = sp->hdr.epoch; - k.cid = sp->hdr.cid & RXRPC_CIDMASK; - - /* We may have to handle mixing IPv4 and IPv6 */ - if (srx.transport.family != local->srx.transport.family) { + if (srx.transport.family != local->srx.transport.family && + (srx.transport.family == AF_INET && + local->srx.transport.family != AF_INET6)) { pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", srx.transport.family, local->srx.transport.family); @@ -99,7 +101,7 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, k.epoch = sp->hdr.epoch; k.cid = sp->hdr.cid & RXRPC_CIDMASK; - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) { + if (rxrpc_to_server(sp)) { /* We need to look up service connections by the full protocol * parameter set. We look up the peer first as an intermediate * step and then the connection from the peer's tree. @@ -107,6 +109,7 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, peer = rxrpc_lookup_peer_rcu(local, &srx); if (!peer) goto not_found; + *_peer = peer; conn = rxrpc_find_service_conn_rcu(peer, skb); if (!conn || atomic_read(&conn->usage) == 0) goto not_found; @@ -214,7 +217,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) call->peer->cong_cwnd = call->cong_cwnd; spin_lock_bh(&conn->params.peer->lock); - hlist_del_init(&call->error_link); + hlist_del_rcu(&call->error_link); spin_unlock_bh(&conn->params.peer->lock); if (rxrpc_is_client_call(call)) diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index cfdc199c6351..9128aa0e40aa 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -216,10 +216,11 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb, /* * Apply a hard ACK by advancing the Tx window. */ -static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, +static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, struct rxrpc_ack_summary *summary) { struct sk_buff *skb, *list = NULL; + bool rot_last = false; int ix; u8 annotation; @@ -243,15 +244,17 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, skb->next = list; list = skb; - if (annotation & RXRPC_TX_ANNO_LAST) + if (annotation & RXRPC_TX_ANNO_LAST) { set_bit(RXRPC_CALL_TX_LAST, &call->flags); + rot_last = true; + } if ((annotation & RXRPC_TX_ANNO_MASK) != RXRPC_TX_ANNO_ACK) summary->nr_rot_new_acks++; } spin_unlock(&call->lock); - trace_rxrpc_transmit(call, (test_bit(RXRPC_CALL_TX_LAST, &call->flags) ? + trace_rxrpc_transmit(call, (rot_last ? rxrpc_transmit_rotate_last : rxrpc_transmit_rotate)); wake_up(&call->waitq); @@ -259,9 +262,11 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, while (list) { skb = list; list = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); rxrpc_free_skb(skb, rxrpc_skb_tx_freed); } + + return rot_last; } /* @@ -273,23 +278,26 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, const char *abort_why) { + unsigned int state; ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags)); write_lock(&call->state_lock); - switch (call->state) { + state = call->state; + switch (state) { case RXRPC_CALL_CLIENT_SEND_REQUEST: case RXRPC_CALL_CLIENT_AWAIT_REPLY: if (reply_begun) - call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + call->state = state = RXRPC_CALL_CLIENT_RECV_REPLY; else - call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; + call->state = state = RXRPC_CALL_CLIENT_AWAIT_REPLY; break; case RXRPC_CALL_SERVER_AWAIT_ACK: __rxrpc_call_completed(call); rxrpc_notify_socket(call); + state = call->state; break; default: @@ -297,11 +305,10 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, } write_unlock(&call->state_lock); - if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) { + if (state == RXRPC_CALL_CLIENT_AWAIT_REPLY) trace_rxrpc_transmit(call, rxrpc_transmit_await_reply); - } else { + else trace_rxrpc_transmit(call, rxrpc_transmit_end); - } _leave(" = ok"); return true; @@ -332,11 +339,11 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) trace_rxrpc_timer(call, rxrpc_timer_init_for_reply, now); } - if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) - rxrpc_rotate_tx_window(call, top, &summary); if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { - rxrpc_proto_abort("TXL", call, top); - return false; + if (!rxrpc_rotate_tx_window(call, top, &summary)) { + rxrpc_proto_abort("TXL", call, top); + return false; + } } if (!rxrpc_end_tx_phase(call, true, "ETD")) return false; @@ -452,13 +459,15 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, } } + spin_lock(&call->input_lock); + /* Received data implicitly ACKs all of the request packets we sent * when we're acting as a client. */ if ((state == RXRPC_CALL_CLIENT_SEND_REQUEST || state == RXRPC_CALL_CLIENT_AWAIT_REPLY) && !rxrpc_receiving_reply(call)) - return; + goto unlock; call->ackr_prev_seq = seq; @@ -488,12 +497,16 @@ next_subpacket: if (flags & RXRPC_LAST_PACKET) { if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && - seq != call->rx_top) - return rxrpc_proto_abort("LSN", call, seq); + seq != call->rx_top) { + rxrpc_proto_abort("LSN", call, seq); + goto unlock; + } } else { if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && - after_eq(seq, call->rx_top)) - return rxrpc_proto_abort("LSA", call, seq); + after_eq(seq, call->rx_top)) { + rxrpc_proto_abort("LSA", call, seq); + goto unlock; + } } trace_rxrpc_rx_data(call->debug_id, seq, serial, flags, annotation); @@ -560,8 +573,10 @@ next_subpacket: skip: offset += len; if (flags & RXRPC_JUMBO_PACKET) { - if (skb_copy_bits(skb, offset, &flags, 1) < 0) - return rxrpc_proto_abort("XJF", call, seq); + if (skb_copy_bits(skb, offset, &flags, 1) < 0) { + rxrpc_proto_abort("XJF", call, seq); + goto unlock; + } offset += sizeof(struct rxrpc_jumbo_header); seq++; serial++; @@ -601,6 +616,9 @@ ack: trace_rxrpc_notify_socket(call->debug_id, serial); rxrpc_notify_socket(call); } + +unlock: + spin_unlock(&call->input_lock); _leave(" [queued]"); } @@ -622,13 +640,14 @@ static void rxrpc_input_requested_ack(struct rxrpc_call *call, if (!skb) continue; + sent_at = skb->tstamp; + smp_rmb(); /* Read timestamp before serial. */ sp = rxrpc_skb(skb); if (sp->hdr.serial != orig_serial) continue; - smp_rmb(); - sent_at = skb->tstamp; goto found; } + return; found: @@ -686,15 +705,14 @@ static void rxrpc_input_ping_response(struct rxrpc_call *call, ping_time = call->ping_time; smp_rmb(); - ping_serial = call->ping_serial; + ping_serial = READ_ONCE(call->ping_serial); if (orig_serial == call->acks_lost_ping) rxrpc_input_check_for_lost_ack(call); - if (!test_bit(RXRPC_CALL_PINGING, &call->flags) || - before(orig_serial, ping_serial)) + if (before(orig_serial, ping_serial) || + !test_and_clear_bit(RXRPC_CALL_PINGING, &call->flags)) return; - clear_bit(RXRPC_CALL_PINGING, &call->flags); if (after(orig_serial, ping_serial)) return; @@ -860,15 +878,32 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_propose_ack_respond_to_ack); } + /* Discard any out-of-order or duplicate ACKs. */ + if (before_eq(sp->hdr.serial, call->acks_latest)) + return; + + buf.info.rxMTU = 0; ioffset = offset + nr_acks + 3; - if (skb->len >= ioffset + sizeof(buf.info)) { - if (skb_copy_bits(skb, ioffset, &buf.info, sizeof(buf.info)) < 0) - return rxrpc_proto_abort("XAI", call, 0); + if (skb->len >= ioffset + sizeof(buf.info) && + skb_copy_bits(skb, ioffset, &buf.info, sizeof(buf.info)) < 0) + return rxrpc_proto_abort("XAI", call, 0); + + spin_lock(&call->input_lock); + + /* Discard any out-of-order or duplicate ACKs. */ + if (before_eq(sp->hdr.serial, call->acks_latest)) + goto out; + call->acks_latest_ts = skb->tstamp; + call->acks_latest = sp->hdr.serial; + + /* Parse rwind and mtu sizes if provided. */ + if (buf.info.rxMTU) rxrpc_input_ackinfo(call, skb, &buf.info); - } - if (first_soft_ack == 0) - return rxrpc_proto_abort("AK0", call, 0); + if (first_soft_ack == 0) { + rxrpc_proto_abort("AK0", call, 0); + goto out; + } /* Ignore ACKs unless we are or have just been transmitting. */ switch (READ_ONCE(call->state)) { @@ -878,39 +913,35 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, case RXRPC_CALL_SERVER_AWAIT_ACK: break; default: - return; - } - - /* Discard any out-of-order or duplicate ACKs. */ - if (before_eq(sp->hdr.serial, call->acks_latest)) { - _debug("discard ACK %d <= %d", - sp->hdr.serial, call->acks_latest); - return; + goto out; } - call->acks_latest_ts = skb->tstamp; - call->acks_latest = sp->hdr.serial; if (before(hard_ack, call->tx_hard_ack) || - after(hard_ack, call->tx_top)) - return rxrpc_proto_abort("AKW", call, 0); - if (nr_acks > call->tx_top - hard_ack) - return rxrpc_proto_abort("AKN", call, 0); + after(hard_ack, call->tx_top)) { + rxrpc_proto_abort("AKW", call, 0); + goto out; + } + if (nr_acks > call->tx_top - hard_ack) { + rxrpc_proto_abort("AKN", call, 0); + goto out; + } - if (after(hard_ack, call->tx_hard_ack)) - rxrpc_rotate_tx_window(call, hard_ack, &summary); + if (after(hard_ack, call->tx_hard_ack)) { + if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { + rxrpc_end_tx_phase(call, false, "ETA"); + goto out; + } + } if (nr_acks > 0) { - if (skb_copy_bits(skb, offset, buf.acks, nr_acks) < 0) - return rxrpc_proto_abort("XSA", call, 0); + if (skb_copy_bits(skb, offset, buf.acks, nr_acks) < 0) { + rxrpc_proto_abort("XSA", call, 0); + goto out; + } rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks, &summary); } - if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { - rxrpc_end_tx_phase(call, false, "ETA"); - return; - } - if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] & RXRPC_TX_ANNO_LAST && summary.nr_acks == call->tx_top - hard_ack && @@ -919,7 +950,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, false, true, rxrpc_propose_ack_ping_for_lost_reply); - return rxrpc_congestion_management(call, skb, &summary, acked_serial); + rxrpc_congestion_management(call, skb, &summary, acked_serial); +out: + spin_unlock(&call->input_lock); } /* @@ -932,9 +965,12 @@ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) _proto("Rx ACKALL %%%u", sp->hdr.serial); - rxrpc_rotate_tx_window(call, call->tx_top, &summary); - if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) + spin_lock(&call->input_lock); + + if (rxrpc_rotate_tx_window(call, call->tx_top, &summary)) rxrpc_end_tx_phase(call, false, "ETL"); + + spin_unlock(&call->input_lock); } /* @@ -1017,18 +1053,19 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call, } /* - * Handle a new call on a channel implicitly completing the preceding call on - * that channel. + * Handle a new service call on a channel implicitly completing the preceding + * call on that channel. This does not apply to client conns. * * TODO: If callNumber > call_id + 1, renegotiate security. */ -static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn, +static void rxrpc_input_implicit_end_call(struct rxrpc_sock *rx, + struct rxrpc_connection *conn, struct rxrpc_call *call) { switch (READ_ONCE(call->state)) { case RXRPC_CALL_SERVER_AWAIT_ACK: rxrpc_call_completed(call); - break; + /* Fall through */ case RXRPC_CALL_COMPLETE: break; default: @@ -1036,11 +1073,13 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn, set_bit(RXRPC_CALL_EV_ABORT, &call->events); rxrpc_queue_call(call); } + trace_rxrpc_improper_term(call); break; } - trace_rxrpc_improper_term(call); + spin_lock(&rx->incoming_lock); __rxrpc_disconnect_call(conn, call); + spin_unlock(&rx->incoming_lock); rxrpc_notify_socket(call); } @@ -1119,43 +1158,29 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) * The socket is locked by the caller and this prevents the socket from being * shut down and the local endpoint from going away, thus sk_user_data will not * be cleared until this function returns. + * + * Called with the RCU read lock held from the IP layer via UDP. */ -void rxrpc_data_ready(struct sock *udp_sk) +int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) { struct rxrpc_connection *conn; struct rxrpc_channel *chan; - struct rxrpc_call *call; + struct rxrpc_call *call = NULL; struct rxrpc_skb_priv *sp; struct rxrpc_local *local = udp_sk->sk_user_data; - struct sk_buff *skb; + struct rxrpc_peer *peer = NULL; + struct rxrpc_sock *rx = NULL; unsigned int channel; - int ret, skew; + int skew = 0; _enter("%p", udp_sk); - ASSERT(!irqs_disabled()); - - skb = skb_recv_udp(udp_sk, 0, 1, &ret); - if (!skb) { - if (ret == -EAGAIN) - return; - _debug("UDP socket error %d", ret); - return; - } + if (skb->tstamp == 0) + skb->tstamp = ktime_get_real(); rxrpc_new_skb(skb, rxrpc_skb_rx_received); - _net("recv skb %p", skb); - - /* we'll probably need to checksum it (didn't call sock_recvmsg) */ - if (skb_checksum_complete(skb)) { - rxrpc_free_skb(skb, rxrpc_skb_rx_freed); - __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0); - _leave(" [CSUM failed]"); - return; - } - - __UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0); + skb_pull(skb, sizeof(struct udphdr)); /* The UDP protocol already released all skb resources; * we are free to add our own data there. @@ -1170,69 +1195,104 @@ void rxrpc_data_ready(struct sock *udp_sk) static int lose; if ((lose++ & 7) == 7) { trace_rxrpc_rx_lose(sp); - rxrpc_lose_skb(skb, rxrpc_skb_rx_lost); - return; + rxrpc_free_skb(skb, rxrpc_skb_rx_lost); + return 0; } } + if (skb->tstamp == 0) + skb->tstamp = ktime_get_real(); trace_rxrpc_rx_packet(sp); - _net("Rx RxRPC %s ep=%x call=%x:%x", - sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient", - sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber); - - if (sp->hdr.type >= RXRPC_N_PACKET_TYPES || - !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) { - _proto("Rx Bad Packet Type %u", sp->hdr.type); - goto bad_message; - } - switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: - if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED)) + if (rxrpc_to_client(sp)) goto discard; rxrpc_post_packet_to_local(local, skb); goto out; case RXRPC_PACKET_TYPE_BUSY: - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) + if (rxrpc_to_server(sp)) goto discard; /* Fall through */ + case RXRPC_PACKET_TYPE_ACK: + case RXRPC_PACKET_TYPE_ACKALL: + if (sp->hdr.callNumber == 0) + goto bad_message; + /* Fall through */ + case RXRPC_PACKET_TYPE_ABORT: + break; case RXRPC_PACKET_TYPE_DATA: - if (sp->hdr.callNumber == 0) + if (sp->hdr.callNumber == 0 || + sp->hdr.seq == 0) goto bad_message; if (sp->hdr.flags & RXRPC_JUMBO_PACKET && !rxrpc_validate_jumbo(skb)) goto bad_message; break; + case RXRPC_PACKET_TYPE_CHALLENGE: + if (rxrpc_to_server(sp)) + goto discard; + break; + case RXRPC_PACKET_TYPE_RESPONSE: + if (rxrpc_to_client(sp)) + goto discard; + break; + /* Packet types 9-11 should just be ignored. */ case RXRPC_PACKET_TYPE_PARAMS: case RXRPC_PACKET_TYPE_10: case RXRPC_PACKET_TYPE_11: goto discard; + + default: + _proto("Rx Bad Packet Type %u", sp->hdr.type); + goto bad_message; } - rcu_read_lock(); + if (sp->hdr.serviceId == 0) + goto bad_message; + + if (rxrpc_to_server(sp)) { + /* Weed out packets to services we're not offering. Packets + * that would begin a call are explicitly rejected and the rest + * are just discarded. + */ + rx = rcu_dereference(local->service); + if (!rx || (sp->hdr.serviceId != rx->srx.srx_service && + sp->hdr.serviceId != rx->second_service)) { + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && + sp->hdr.seq == 1) + goto unsupported_service; + goto discard; + } + } - conn = rxrpc_find_connection_rcu(local, skb); + conn = rxrpc_find_connection_rcu(local, skb, &peer); if (conn) { if (sp->hdr.securityIndex != conn->security_ix) goto wrong_security; if (sp->hdr.serviceId != conn->service_id) { - if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags) || - conn->service_id != conn->params.service_id) + int old_id; + + if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) + goto reupgrade; + old_id = cmpxchg(&conn->service_id, conn->params.service_id, + sp->hdr.serviceId); + + if (old_id != conn->params.service_id && + old_id != sp->hdr.serviceId) goto reupgrade; - conn->service_id = sp->hdr.serviceId; } if (sp->hdr.callNumber == 0) { /* Connection-level packet */ _debug("CONN %p {%d}", conn, conn->debug_id); rxrpc_post_packet_to_conn(conn, skb); - goto out_unlock; + goto out; } /* Note the serial number skew here */ @@ -1251,19 +1311,19 @@ void rxrpc_data_ready(struct sock *udp_sk) /* Ignore really old calls */ if (sp->hdr.callNumber < chan->last_call) - goto discard_unlock; + goto discard; if (sp->hdr.callNumber == chan->last_call) { if (chan->call || sp->hdr.type == RXRPC_PACKET_TYPE_ABORT) - goto discard_unlock; + goto discard; /* For the previous service call, if completed * successfully, we discard all further packets. */ if (rxrpc_conn_is_service(conn) && chan->last_type == RXRPC_PACKET_TYPE_ACK) - goto discard_unlock; + goto discard; /* But otherwise we need to retransmit the final packet * from data cached in the connection record. @@ -1274,18 +1334,16 @@ void rxrpc_data_ready(struct sock *udp_sk) sp->hdr.serial, sp->hdr.flags, 0); rxrpc_post_packet_to_conn(conn, skb); - goto out_unlock; + goto out; } call = rcu_dereference(chan->call); if (sp->hdr.callNumber > chan->call_id) { - if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED)) { - rcu_read_unlock(); + if (rxrpc_to_client(sp)) goto reject_packet; - } if (call) - rxrpc_input_implicit_end_call(conn, call); + rxrpc_input_implicit_end_call(rx, conn, call); call = NULL; } @@ -1297,66 +1355,57 @@ void rxrpc_data_ready(struct sock *udp_sk) if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags)) set_bit(RXRPC_CALL_RX_HEARD, &call->flags); } - } else { - skew = 0; - call = NULL; } if (!call || atomic_read(&call->usage) == 0) { - if (!(sp->hdr.type & RXRPC_CLIENT_INITIATED) || - sp->hdr.callNumber == 0 || + if (rxrpc_to_client(sp) || sp->hdr.type != RXRPC_PACKET_TYPE_DATA) - goto bad_message_unlock; + goto bad_message; if (sp->hdr.seq != 1) - goto discard_unlock; - call = rxrpc_new_incoming_call(local, conn, skb); - if (!call) { - rcu_read_unlock(); + goto discard; + call = rxrpc_new_incoming_call(local, rx, skb); + if (!call) goto reject_packet; - } rxrpc_send_ping(call, skb, skew); mutex_unlock(&call->user_mutex); } rxrpc_input_call_packet(call, skb, skew); - goto discard_unlock; + goto discard; -discard_unlock: - rcu_read_unlock(); discard: rxrpc_free_skb(skb, rxrpc_skb_rx_freed); out: trace_rxrpc_rx_done(0, 0); - return; - -out_unlock: - rcu_read_unlock(); - goto out; + return 0; wrong_security: - rcu_read_unlock(); trace_rxrpc_abort(0, "SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RXKADINCONSISTENCY, EBADMSG); skb->priority = RXKADINCONSISTENCY; goto post_abort; +unsupported_service: + trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EOPNOTSUPP); + skb->priority = RX_INVALID_OPERATION; + goto post_abort; + reupgrade: - rcu_read_unlock(); trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_PROTOCOL_ERROR, EBADMSG); goto protocol_error; -bad_message_unlock: - rcu_read_unlock(); bad_message: trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_PROTOCOL_ERROR, EBADMSG); protocol_error: skb->priority = RX_PROTOCOL_ERROR; post_abort: - skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; reject_packet: trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_reject_packet(local, skb); _leave(" [badmsg]"); + return 0; } diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index 13bd8a4dfac7..927ead43df42 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -39,7 +39,7 @@ static void rxrpc_send_version_request(struct rxrpc_local *local, _enter(""); - if (rxrpc_extract_addr_from_skb(local, &srx, skb) < 0) + if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) return; msg.msg_name = &srx.transport; diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 777c3ed4cfc0..0906e51d3cfb 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "ar-internal.h" @@ -108,7 +109,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, */ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) { - struct sock *sock; + struct sock *usk; int ret, opt; _enter("%p{%d,%d}", @@ -122,6 +123,28 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) return ret; } + /* set the socket up */ + usk = local->socket->sk; + inet_sk(usk)->mc_loop = 0; + + /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ + inet_inc_convert_csum(usk); + + rcu_assign_sk_user_data(usk, local); + + udp_sk(usk)->encap_type = UDP_ENCAP_RXRPC; + udp_sk(usk)->encap_rcv = rxrpc_input_packet; + udp_sk(usk)->encap_destroy = NULL; + udp_sk(usk)->gro_receive = NULL; + udp_sk(usk)->gro_complete = NULL; + + udp_encap_enable(); +#if IS_ENABLED(CONFIG_AF_RXRPC_IPV6) + if (local->srx.transport.family == AF_INET6) + udpv6_encap_enable(); +#endif + usk->sk_error_report = rxrpc_error_report; + /* if a local address was supplied then bind it */ if (local->srx.transport_len > sizeof(sa_family_t)) { _debug("bind"); @@ -135,10 +158,10 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } switch (local->srx.transport.family) { - case AF_INET: - /* we want to receive ICMP errors */ + case AF_INET6: + /* we want to receive ICMPv6 errors */ opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, + ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR, (char *) &opt, sizeof(opt)); if (ret < 0) { _debug("setsockopt failed"); @@ -146,19 +169,22 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } /* we want to set the don't fragment bit */ - opt = IP_PMTUDISC_DO; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, + opt = IPV6_PMTUDISC_DO; + ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_MTU_DISCOVER, (char *) &opt, sizeof(opt)); if (ret < 0) { _debug("setsockopt failed"); goto error; } - break; - case AF_INET6: + /* Fall through and set IPv4 options too otherwise we don't get + * errors from IPv4 packets sent through the IPv6 socket. + */ + + case AF_INET: /* we want to receive ICMP errors */ opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR, + ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, (char *) &opt, sizeof(opt)); if (ret < 0) { _debug("setsockopt failed"); @@ -166,24 +192,28 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } /* we want to set the don't fragment bit */ - opt = IPV6_PMTUDISC_DO; - ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_MTU_DISCOVER, + opt = IP_PMTUDISC_DO; + ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, (char *) &opt, sizeof(opt)); if (ret < 0) { _debug("setsockopt failed"); goto error; } + + /* We want receive timestamps. */ + opt = 1; + ret = kernel_setsockopt(local->socket, SOL_SOCKET, SO_TIMESTAMPNS, + (char *)&opt, sizeof(opt)); + if (ret < 0) { + _debug("setsockopt failed"); + goto error; + } break; default: BUG(); } - /* set the socket up */ - sock = local->socket->sk; - sock->sk_user_data = local; - sock->sk_data_ready = rxrpc_data_ready; - sock->sk_error_report = rxrpc_error_report; _leave(" = 0"); return 0; diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 417d80867c4f..fd7eba8467fa 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -102,6 +102,9 @@ static __net_init int rxrpc_init_net(struct net *net) proc_create_net("conns", 0444, rxnet->proc_net, &rxrpc_connection_seq_ops, sizeof(struct seq_net_private)); + proc_create_net("peers", 0444, rxnet->proc_net, + &rxrpc_peer_seq_ops, + sizeof(struct seq_net_private)); return 0; err_proc: diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index ccf5de160444..736aa9281100 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -34,6 +34,21 @@ struct rxrpc_abort_buffer { static const char rxrpc_keepalive_string[] = ""; +/* + * Increase Tx backoff on transmission failure and clear it on success. + */ +static void rxrpc_tx_backoff(struct rxrpc_call *call, int ret) +{ + if (ret < 0) { + u16 tx_backoff = READ_ONCE(call->tx_backoff); + + if (tx_backoff < HZ) + WRITE_ONCE(call->tx_backoff, tx_backoff + 1); + } else { + WRITE_ONCE(call->tx_backoff, 0); + } +} + /* * Arrange for a keepalive ping a certain time after we last transmitted. This * lets the far side know we're still interested in this call and helps keep @@ -124,7 +139,6 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, struct kvec iov[2]; rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; - ktime_t now; size_t len, n; int ret; u8 reason; @@ -196,9 +210,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, /* We need to stick a time in before we send the packet in case * the reply gets back before kernel_sendmsg() completes - but * asking UDP to send the packet can take a relatively long - * time, so we update the time after, on the assumption that - * the packet transmission is more likely to happen towards the - * end of the kernel_sendmsg() call. + * time. */ call->ping_time = ktime_get_real(); set_bit(RXRPC_CALL_PINGING, &call->flags); @@ -206,9 +218,6 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, } ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); - now = ktime_get_real(); - if (ping) - call->ping_time = now; conn->params.peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, @@ -216,6 +225,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, else trace_rxrpc_tx_packet(call->debug_id, &pkt->whdr, rxrpc_tx_point_call_ack); + rxrpc_tx_backoff(call, ret); if (call->state < RXRPC_CALL_COMPLETE) { if (ret < 0) { @@ -224,7 +234,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), ntohl(pkt->ack.serial), - true, true, + false, true, rxrpc_propose_ack_retry_tx); } else { spin_lock_bh(&call->lock); @@ -306,7 +316,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) else trace_rxrpc_tx_packet(call->debug_id, &pkt.whdr, rxrpc_tx_point_call_abort); - + rxrpc_tx_backoff(call, ret); rxrpc_put_connection(conn); return ret; @@ -363,8 +373,14 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. + * + * However, we mustn't request an ACK on the last reply packet of a + * service call, lest OpenAFS incorrectly send us an ACK with some + * soft-ACKs in it and then never follow up with a proper hard ACK. */ - if (!(sp->hdr.flags & RXRPC_LAST_PACKET) && + if ((!(sp->hdr.flags & RXRPC_LAST_PACKET) || + rxrpc_to_server(sp) + ) && (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) || retrans || call->cong_mode == RXRPC_CALL_SLOW_START || @@ -378,11 +394,13 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, if ((lose++ & 7) == 7) { ret = 0; lost = true; - goto done; } } - _proto("Tx DATA %%%u { #%u }", serial, sp->hdr.seq); + trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, + retrans, lost); + if (lost) + goto done; /* send the packet with the don't fragment bit set if we currently * think it's small enough */ @@ -390,6 +408,11 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, goto send_fragmentable; down_read(&conn->params.local->defrag_sem); + + sp->hdr.serial = serial; + smp_wmb(); /* Set serial before timestamp */ + skb->tstamp = ktime_get_real(); + /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet * to go out of the interface @@ -406,19 +429,14 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, else trace_rxrpc_tx_packet(call->debug_id, &whdr, rxrpc_tx_point_call_data_nofrag); + rxrpc_tx_backoff(call, ret); if (ret == -EMSGSIZE) goto send_fragmentable; done: - trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, - retrans, lost); if (ret >= 0) { - ktime_t now = ktime_get_real(); - skb->tstamp = now; - smp_wmb(); - sp->hdr.serial = serial; if (whdr.flags & RXRPC_REQUEST_ACK) { - call->peer->rtt_last_req = now; + call->peer->rtt_last_req = skb->tstamp; trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); if (call->peer->rtt_usage > 1) { unsigned long nowj = jiffies, ack_lost_at; @@ -444,9 +462,18 @@ done: rxrpc_reduce_call_timer(call, expect_rx_by, nowj, rxrpc_timer_set_for_normal); } - } - rxrpc_set_keepalive(call); + rxrpc_set_keepalive(call); + } else { + /* Cancel the call if the initial transmission fails, + * particularly if that's due to network routing issues that + * aren't going away anytime soon. The layer above can arrange + * the retransmission. + */ + if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) + rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, + RX_USER_ABORT, ret); + } _leave(" = %d [%u]", ret, call->peer->maxdata); return ret; @@ -457,6 +484,10 @@ send_fragmentable: down_write(&conn->params.local->defrag_sem); + sp->hdr.serial = serial; + smp_wmb(); /* Set serial before timestamp */ + skb->tstamp = ktime_get_real(); + switch (conn->params.local->srx.transport.family) { case AF_INET: opt = IP_PMTUDISC_DONT; @@ -501,6 +532,7 @@ send_fragmentable: else trace_rxrpc_tx_packet(call->debug_id, &whdr, rxrpc_tx_point_call_data_frag); + rxrpc_tx_backoff(call, ret); up_write(&conn->params.local->defrag_sem); goto done; @@ -519,7 +551,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) struct kvec iov[2]; size_t size; __be32 code; - int ret; + int ret, ioc; _enter("%d", local->debug_id); @@ -527,7 +559,6 @@ void rxrpc_reject_packets(struct rxrpc_local *local) iov[0].iov_len = sizeof(whdr); iov[1].iov_base = &code; iov[1].iov_len = sizeof(code); - size = sizeof(whdr) + sizeof(code); msg.msg_name = &srx.transport; msg.msg_control = NULL; @@ -535,16 +566,30 @@ void rxrpc_reject_packets(struct rxrpc_local *local) msg.msg_flags = 0; memset(&whdr, 0, sizeof(whdr)); - whdr.type = RXRPC_PACKET_TYPE_ABORT; while ((skb = skb_dequeue(&local->reject_queue))) { rxrpc_see_skb(skb, rxrpc_skb_rx_seen); sp = rxrpc_skb(skb); - if (rxrpc_extract_addr_from_skb(local, &srx, skb) == 0) { - msg.msg_namelen = srx.transport_len; - + switch (skb->mark) { + case RXRPC_SKB_MARK_REJECT_BUSY: + whdr.type = RXRPC_PACKET_TYPE_BUSY; + size = sizeof(whdr); + ioc = 1; + break; + case RXRPC_SKB_MARK_REJECT_ABORT: + whdr.type = RXRPC_PACKET_TYPE_ABORT; code = htonl(skb->priority); + size = sizeof(whdr) + sizeof(code); + ioc = 2; + break; + default: + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + continue; + } + + if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) { + msg.msg_namelen = srx.transport_len; whdr.epoch = htonl(sp->hdr.epoch); whdr.cid = htonl(sp->hdr.cid); @@ -554,7 +599,8 @@ void rxrpc_reject_packets(struct rxrpc_local *local) whdr.flags ^= RXRPC_CLIENT_INITIATED; whdr.flags &= RXRPC_CLIENT_INITIATED; - ret = kernel_sendmsg(local->socket, &msg, iov, 2, size); + ret = kernel_sendmsg(local->socket, &msg, + iov, ioc, size); if (ret < 0) trace_rxrpc_tx_fail(local->debug_id, 0, ret, rxrpc_tx_point_reject); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 4f9da2f51c69..bc05af89fc38 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -23,6 +23,8 @@ #include "ar-internal.h" static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *); +static void rxrpc_distribute_error(struct rxrpc_peer *, int, + enum rxrpc_call_completion); /* * Find the peer associated with an ICMP packet. @@ -45,6 +47,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, */ switch (srx->transport.family) { case AF_INET: + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; srx->transport.sin.sin_port = serr->port; switch (serr->ee.ee_origin) { case SO_EE_ORIGIN_ICMP: @@ -68,20 +72,20 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: - srx->transport.sin6.sin6_port = serr->port; switch (serr->ee.ee_origin) { case SO_EE_ORIGIN_ICMP6: _net("Rx ICMP6"); + srx->transport.sin6.sin6_port = serr->port; memcpy(&srx->transport.sin6.sin6_addr, skb_network_header(skb) + serr->addr_offset, sizeof(struct in6_addr)); break; case SO_EE_ORIGIN_ICMP: _net("Rx ICMP on v6 sock"); - srx->transport.sin6.sin6_addr.s6_addr32[0] = 0; - srx->transport.sin6.sin6_addr.s6_addr32[1] = 0; - srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff); - memcpy(srx->transport.sin6.sin6_addr.s6_addr + 12, + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; + srx->transport.sin.sin_port = serr->port; + memcpy(&srx->transport.sin.sin_addr, skb_network_header(skb) + serr->addr_offset, sizeof(struct in_addr)); break; @@ -193,9 +197,8 @@ void rxrpc_error_report(struct sock *sk) rxrpc_store_error(peer, serr); rcu_read_unlock(); rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + rxrpc_put_peer(peer); - /* The ref we obtained is passed off to the work item */ - __rxrpc_queue_peer_error(peer); _leave(""); } @@ -205,6 +208,7 @@ void rxrpc_error_report(struct sock *sk) static void rxrpc_store_error(struct rxrpc_peer *peer, struct sock_exterr_skb *serr) { + enum rxrpc_call_completion compl = RXRPC_CALL_NETWORK_ERROR; struct sock_extended_err *ee; int err; @@ -255,7 +259,7 @@ static void rxrpc_store_error(struct rxrpc_peer *peer, case SO_EE_ORIGIN_NONE: case SO_EE_ORIGIN_LOCAL: _proto("Rx Received local error { error=%d }", err); - err += RXRPC_LOCAL_ERROR_OFFSET; + compl = RXRPC_CALL_LOCAL_ERROR; break; case SO_EE_ORIGIN_ICMP6: @@ -264,48 +268,23 @@ static void rxrpc_store_error(struct rxrpc_peer *peer, break; } - peer->error_report = err; + rxrpc_distribute_error(peer, err, compl); } /* - * Distribute an error that occurred on a peer + * Distribute an error that occurred on a peer. */ -void rxrpc_peer_error_distributor(struct work_struct *work) +static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error, + enum rxrpc_call_completion compl) { - struct rxrpc_peer *peer = - container_of(work, struct rxrpc_peer, error_distributor); struct rxrpc_call *call; - enum rxrpc_call_completion compl; - int error; - - _enter(""); - - error = READ_ONCE(peer->error_report); - if (error < RXRPC_LOCAL_ERROR_OFFSET) { - compl = RXRPC_CALL_NETWORK_ERROR; - } else { - compl = RXRPC_CALL_LOCAL_ERROR; - error -= RXRPC_LOCAL_ERROR_OFFSET; - } - _debug("ISSUE ERROR %s %d", rxrpc_call_completions[compl], error); - - spin_lock_bh(&peer->lock); - - while (!hlist_empty(&peer->error_targets)) { - call = hlist_entry(peer->error_targets.first, - struct rxrpc_call, error_link); - hlist_del_init(&call->error_link); + hlist_for_each_entry_rcu(call, &peer->error_targets, error_link) { rxrpc_see_call(call); - - if (rxrpc_set_call_completion(call, compl, 0, -error)) + if (call->state < RXRPC_CALL_COMPLETE && + rxrpc_set_call_completion(call, compl, 0, -error)) rxrpc_notify_socket(call); } - - spin_unlock_bh(&peer->lock); - - rxrpc_put_peer(peer); - _leave(""); } /* @@ -325,6 +304,8 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, if (rtt < 0) return; + spin_lock(&peer->rtt_input_lock); + /* Replace the oldest datum in the RTT buffer */ sum -= peer->rtt_cache[cursor]; sum += rtt; @@ -336,6 +317,8 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, peer->rtt_usage = usage; } + spin_unlock(&peer->rtt_input_lock); + /* Now recalculate the average */ if (usage == RXRPC_RTT_CACHE_SIZE) { avg = sum / RXRPC_RTT_CACHE_SIZE; @@ -344,6 +327,7 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, do_div(avg, usage); } + /* Don't need to update this under lock */ peer->rtt = avg; trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt, usage, avg); diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 1dc7648e3eff..5691b7d266ca 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -124,11 +124,9 @@ static struct rxrpc_peer *__rxrpc_lookup_peer_rcu( struct rxrpc_net *rxnet = local->rxnet; hash_for_each_possible_rcu(rxnet->peer_hash, peer, hash_link, hash_key) { - if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0) { - if (atomic_read(&peer->usage) == 0) - return NULL; + if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0 && + atomic_read(&peer->usage) > 0) return peer; - } } return NULL; @@ -155,8 +153,10 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local, * assess the MTU size for the network interface through which this peer is * reached */ -static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) +static void rxrpc_assess_MTU_size(struct rxrpc_sock *rx, + struct rxrpc_peer *peer) { + struct net *net = sock_net(&rx->sk); struct dst_entry *dst; struct rtable *rt; struct flowi fl; @@ -171,7 +171,7 @@ static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) switch (peer->srx.transport.family) { case AF_INET: rt = ip_route_output_ports( - &init_net, fl4, NULL, + net, fl4, NULL, peer->srx.transport.sin.sin_addr.s_addr, 0, htons(7000), htons(7001), IPPROTO_UDP, 0, 0); if (IS_ERR(rt)) { @@ -190,7 +190,7 @@ static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) sizeof(struct in6_addr)); fl6->fl6_dport = htons(7001); fl6->fl6_sport = htons(7000); - dst = ip6_route_output(&init_net, NULL, fl6); + dst = ip6_route_output(net, NULL, fl6); if (dst->error) { _leave(" [route err %d]", dst->error); return; @@ -222,11 +222,10 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) atomic_set(&peer->usage, 1); peer->local = local; INIT_HLIST_HEAD(&peer->error_targets); - INIT_WORK(&peer->error_distributor, - &rxrpc_peer_error_distributor); peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); + spin_lock_init(&peer->rtt_input_lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); if (RXRPC_TX_SMSS > 2190) @@ -244,10 +243,11 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) /* * Initialise peer record. */ -static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key) +static void rxrpc_init_peer(struct rxrpc_sock *rx, struct rxrpc_peer *peer, + unsigned long hash_key) { peer->hash_key = hash_key; - rxrpc_assess_MTU_size(peer); + rxrpc_assess_MTU_size(rx, peer); peer->mtu = peer->if_mtu; peer->rtt_last_req = ktime_get_real(); @@ -279,7 +279,8 @@ static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key) /* * Set up a new peer. */ -static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, +static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_sock *rx, + struct rxrpc_local *local, struct sockaddr_rxrpc *srx, unsigned long hash_key, gfp_t gfp) @@ -291,7 +292,7 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, peer = rxrpc_alloc_peer(local, gfp); if (peer) { memcpy(&peer->srx, srx, sizeof(*srx)); - rxrpc_init_peer(peer, hash_key); + rxrpc_init_peer(rx, peer, hash_key); } _leave(" = %p", peer); @@ -299,40 +300,31 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, } /* - * Set up a new incoming peer. The address is prestored in the preallocated - * peer. + * Set up a new incoming peer. There shouldn't be any other matching peers + * since we've already done a search in the list from the non-reentrant context + * (the data_ready handler) that is the only place we can add new peers. */ -struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local, - struct rxrpc_peer *prealloc) +void rxrpc_new_incoming_peer(struct rxrpc_sock *rx, struct rxrpc_local *local, + struct rxrpc_peer *peer) { - struct rxrpc_peer *peer; struct rxrpc_net *rxnet = local->rxnet; unsigned long hash_key; - hash_key = rxrpc_peer_hash_key(local, &prealloc->srx); - prealloc->local = local; - rxrpc_init_peer(prealloc, hash_key); + hash_key = rxrpc_peer_hash_key(local, &peer->srx); + peer->local = local; + rxrpc_init_peer(rx, peer, hash_key); spin_lock(&rxnet->peer_hash_lock); - - /* Need to check that we aren't racing with someone else */ - peer = __rxrpc_lookup_peer_rcu(local, &prealloc->srx, hash_key); - if (peer && !rxrpc_get_peer_maybe(peer)) - peer = NULL; - if (!peer) { - peer = prealloc; - hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); - list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); - } - + hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); + list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); spin_unlock(&rxnet->peer_hash_lock); - return peer; } /* * obtain a remote transport endpoint for the specified address */ -struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, +struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx, + struct rxrpc_local *local, struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_peer *peer, *candidate; @@ -352,7 +344,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, /* The peer is not yet present in hash - create a candidate * for a new record and then redo the search. */ - candidate = rxrpc_create_peer(local, srx, hash_key, gfp); + candidate = rxrpc_create_peer(rx, local, srx, hash_key, gfp); if (!candidate) { _leave(" = NULL [nomem]"); return NULL; @@ -415,21 +407,6 @@ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) return peer; } -/* - * Queue a peer record. This passes the caller's ref to the workqueue. - */ -void __rxrpc_queue_peer_error(struct rxrpc_peer *peer) -{ - const void *here = __builtin_return_address(0); - int n; - - n = atomic_read(&peer->usage); - if (rxrpc_queue_work(&peer->error_distributor)) - trace_rxrpc_peer(peer, rxrpc_peer_queued_error, n, here); - else - rxrpc_put_peer(peer); -} - /* * Discard a peer record. */ diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 9805e3b85c36..c7d976859d40 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -212,3 +212,129 @@ const struct seq_operations rxrpc_connection_seq_ops = { .stop = rxrpc_connection_seq_stop, .show = rxrpc_connection_seq_show, }; + +/* + * generate a list of extant virtual peers in /proc/net/rxrpc/peers + */ +static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) +{ + struct rxrpc_peer *peer; + time64_t now; + char lbuff[50], rbuff[50]; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + "Proto Local " + " Remote " + " Use CW MTU LastUse RTT Rc\n" + ); + return 0; + } + + peer = list_entry(v, struct rxrpc_peer, hash_link); + + sprintf(lbuff, "%pISpc", &peer->local->srx.transport); + + sprintf(rbuff, "%pISpc", &peer->srx.transport); + + now = ktime_get_seconds(); + seq_printf(seq, + "UDP %-47.47s %-47.47s %3u" + " %3u %5u %6llus %12llu %2u\n", + lbuff, + rbuff, + atomic_read(&peer->usage), + peer->cong_cwnd, + peer->mtu, + now - peer->last_tx_at, + peer->rtt, + peer->rtt_cursor); + + return 0; +} + +static void *rxrpc_peer_seq_start(struct seq_file *seq, loff_t *_pos) + __acquires(rcu) +{ + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + unsigned int bucket, n; + unsigned int shift = 32 - HASH_BITS(rxnet->peer_hash); + void *p; + + rcu_read_lock(); + + if (*_pos >= UINT_MAX) + return NULL; + + n = *_pos & ((1U << shift) - 1); + bucket = *_pos >> shift; + for (;;) { + if (bucket >= HASH_SIZE(rxnet->peer_hash)) { + *_pos = UINT_MAX; + return NULL; + } + if (n == 0) { + if (bucket == 0) + return SEQ_START_TOKEN; + *_pos += 1; + n++; + } + + p = seq_hlist_start_rcu(&rxnet->peer_hash[bucket], n - 1); + if (p) + return p; + bucket++; + n = 1; + *_pos = (bucket << shift) | n; + } +} + +static void *rxrpc_peer_seq_next(struct seq_file *seq, void *v, loff_t *_pos) +{ + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + unsigned int bucket, n; + unsigned int shift = 32 - HASH_BITS(rxnet->peer_hash); + void *p; + + if (*_pos >= UINT_MAX) + return NULL; + + bucket = *_pos >> shift; + + p = seq_hlist_next_rcu(v, &rxnet->peer_hash[bucket], _pos); + if (p) + return p; + + for (;;) { + bucket++; + n = 1; + *_pos = (bucket << shift) | n; + + if (bucket >= HASH_SIZE(rxnet->peer_hash)) { + *_pos = UINT_MAX; + return NULL; + } + if (n == 0) { + *_pos += 1; + n++; + } + + p = seq_hlist_start_rcu(&rxnet->peer_hash[bucket], n - 1); + if (p) + return p; + } +} + +static void rxrpc_peer_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) +{ + rcu_read_unlock(); +} + + +const struct seq_operations rxrpc_peer_seq_ops = { + .start = rxrpc_peer_seq_start, + .next = rxrpc_peer_seq_next, + .stop = rxrpc_peer_seq_stop, + .show = rxrpc_peer_seq_show, +}; diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index 93da73bf7098..f9cb83c938f3 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -50,7 +50,6 @@ struct rxrpc_wire_header { #define RXRPC_PACKET_TYPE_10 10 /* Ignored */ #define RXRPC_PACKET_TYPE_11 11 /* Ignored */ #define RXRPC_PACKET_TYPE_VERSION 13 /* version string request */ -#define RXRPC_N_PACKET_TYPES 14 /* number of packet types (incl type 0) */ uint8_t flags; /* packet flags */ #define RXRPC_CLIENT_INITIATED 0x01 /* signifies a packet generated by a client */ @@ -72,20 +71,6 @@ struct rxrpc_wire_header { } __packed; -#define RXRPC_SUPPORTED_PACKET_TYPES ( \ - (1 << RXRPC_PACKET_TYPE_DATA) | \ - (1 << RXRPC_PACKET_TYPE_ACK) | \ - (1 << RXRPC_PACKET_TYPE_BUSY) | \ - (1 << RXRPC_PACKET_TYPE_ABORT) | \ - (1 << RXRPC_PACKET_TYPE_ACKALL) | \ - (1 << RXRPC_PACKET_TYPE_CHALLENGE) | \ - (1 << RXRPC_PACKET_TYPE_RESPONSE) | \ - /*(1 << RXRPC_PACKET_TYPE_DEBUG) | */ \ - (1 << RXRPC_PACKET_TYPE_PARAMS) | \ - (1 << RXRPC_PACKET_TYPE_10) | \ - (1 << RXRPC_PACKET_TYPE_11) | \ - (1 << RXRPC_PACKET_TYPE_VERSION)) - /*****************************************************************************/ /* * jumbo packet secondary header diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 816b19a78809..eaf19ebaa964 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -715,3 +715,46 @@ call_complete: goto out; } EXPORT_SYMBOL(rxrpc_kernel_recv_data); + +/** + * rxrpc_kernel_get_reply_time - Get timestamp on first reply packet + * @sock: The socket that the call exists on + * @call: The call to query + * @_ts: Where to put the timestamp + * + * Retrieve the timestamp from the first DATA packet of the reply if it is + * in the ring. Returns true if successful, false if not. + */ +bool rxrpc_kernel_get_reply_time(struct socket *sock, struct rxrpc_call *call, + ktime_t *_ts) +{ + struct sk_buff *skb; + rxrpc_seq_t hard_ack, top, seq; + bool success = false; + + mutex_lock(&call->user_mutex); + + if (READ_ONCE(call->state) != RXRPC_CALL_CLIENT_RECV_REPLY) + goto out; + + hard_ack = call->rx_hard_ack; + if (hard_ack != 0) + goto out; + + seq = hard_ack + 1; + top = smp_load_acquire(&call->rx_top); + if (after(seq, top)) + goto out; + + skb = call->rxtx_buffer[seq & RXRPC_RXTX_BUFF_MASK]; + if (!skb) + goto out; + + *_ts = skb_get_ktime(skb); + success = true; + +out: + mutex_unlock(&call->user_mutex); + return success; +} +EXPORT_SYMBOL(rxrpc_kernel_get_reply_time); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index cea16838d588..cbef9ea43dec 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -46,7 +46,7 @@ struct rxkad_level2_hdr { * alloc routine, but since we have it to hand, we use it to decrypt RESPONSE * packets */ -static struct crypto_skcipher *rxkad_ci; +static struct crypto_sync_skcipher *rxkad_ci; static DEFINE_MUTEX(rxkad_ci_mutex); /* @@ -54,7 +54,7 @@ static DEFINE_MUTEX(rxkad_ci_mutex); */ static int rxkad_init_connection_security(struct rxrpc_connection *conn) { - struct crypto_skcipher *ci; + struct crypto_sync_skcipher *ci; struct rxrpc_key_token *token; int ret; @@ -63,14 +63,14 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn) token = conn->params.key->payload.data[0]; conn->security_ix = token->security_index; - ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); + ci = crypto_alloc_sync_skcipher("pcbc(fcrypt)", 0, 0); if (IS_ERR(ci)) { _debug("no cipher"); ret = PTR_ERR(ci); goto error; } - if (crypto_skcipher_setkey(ci, token->kad->session_key, + if (crypto_sync_skcipher_setkey(ci, token->kad->session_key, sizeof(token->kad->session_key)) < 0) BUG(); @@ -104,7 +104,7 @@ error: static int rxkad_prime_packet_security(struct rxrpc_connection *conn) { struct rxrpc_key_token *token; - SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); struct scatterlist sg; struct rxrpc_crypt iv; __be32 *tmpbuf; @@ -128,7 +128,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn) tmpbuf[3] = htonl(conn->security_ix); sg_init_one(&sg, tmpbuf, tmpsize); - skcipher_request_set_tfm(req, conn->cipher); + skcipher_request_set_sync_tfm(req, conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, tmpsize, iv.x); crypto_skcipher_encrypt(req); @@ -167,7 +167,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, memset(&iv, 0, sizeof(iv)); sg_init_one(&sg, sechdr, 8); - skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); @@ -212,7 +212,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, memcpy(&iv, token->kad->session_key, sizeof(iv)); sg_init_one(&sg[0], sechdr, sizeof(rxkhdr)); - skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg[0], &sg[0], sizeof(rxkhdr), iv.x); crypto_skcipher_encrypt(req); @@ -250,7 +250,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, void *sechdr) { struct rxrpc_skb_priv *sp; - SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg; u32 x, y; @@ -279,7 +279,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, call->crypto_buf[1] = htonl(x); sg_init_one(&sg, call->crypto_buf, 8); - skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); @@ -352,7 +352,7 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, /* start the decryption afresh */ memset(&iv, 0, sizeof(iv)); - skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, 8, iv.x); crypto_skcipher_decrypt(req); @@ -450,7 +450,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, token = call->conn->params.key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, len, iv.x); crypto_skcipher_decrypt(req); @@ -506,7 +506,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, unsigned int offset, unsigned int len, rxrpc_seq_t seq, u16 expected_cksum) { - SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg; bool aborted; @@ -529,7 +529,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, call->crypto_buf[1] = htonl(x); sg_init_one(&sg, call->crypto_buf, 8); - skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); @@ -755,7 +755,7 @@ static void rxkad_encrypt_response(struct rxrpc_connection *conn, struct rxkad_response *resp, const struct rxkad_key *s2) { - SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg[1]; @@ -764,7 +764,7 @@ static void rxkad_encrypt_response(struct rxrpc_connection *conn, sg_init_table(sg, 1); sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); - skcipher_request_set_tfm(req, conn->cipher); + skcipher_request_set_sync_tfm(req, conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); crypto_skcipher_encrypt(req); @@ -1021,7 +1021,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, struct rxkad_response *resp, const struct rxrpc_crypt *session_key) { - SKCIPHER_REQUEST_ON_STACK(req, rxkad_ci); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, rxkad_ci); struct scatterlist sg[1]; struct rxrpc_crypt iv; @@ -1031,7 +1031,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, ASSERT(rxkad_ci != NULL); mutex_lock(&rxkad_ci_mutex); - if (crypto_skcipher_setkey(rxkad_ci, session_key->x, + if (crypto_sync_skcipher_setkey(rxkad_ci, session_key->x, sizeof(*session_key)) < 0) BUG(); @@ -1039,7 +1039,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, sg_init_table(sg, 1); sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); - skcipher_request_set_tfm(req, rxkad_ci); + skcipher_request_set_sync_tfm(req, rxkad_ci); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); crypto_skcipher_decrypt(req); @@ -1218,7 +1218,7 @@ static void rxkad_clear(struct rxrpc_connection *conn) _enter(""); if (conn->cipher) - crypto_free_skcipher(conn->cipher); + crypto_free_sync_skcipher(conn->cipher); } /* @@ -1228,7 +1228,7 @@ static int rxkad_init(void) { /* pin the cipher we need so that the crypto layer doesn't invoke * keventd to go get it */ - rxkad_ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); + rxkad_ci = crypto_alloc_sync_skcipher("pcbc(fcrypt)", 0, 0); return PTR_ERR_OR_ZERO(rxkad_ci); } @@ -1238,7 +1238,7 @@ static int rxkad_init(void) static void rxkad_exit(void) { if (rxkad_ci) - crypto_free_skcipher(rxkad_ci); + crypto_free_sync_skcipher(rxkad_ci); } /* diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index b8985d01876a..913dca65cc65 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -68,21 +68,6 @@ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) } } -/* - * Note the injected loss of a socket buffer. - */ -void rxrpc_lose_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) -{ - const void *here = __builtin_return_address(0); - if (skb) { - int n; - CHECK_SLAB_OKAY(&skb->users); - n = atomic_dec_return(select_skb_count(op)); - trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here); - kfree_skb(skb); - } -} - /* * Clear a queue of socket buffers. */ diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c index e801171fa351..ff7af71c4b49 100644 --- a/net/rxrpc/utils.c +++ b/net/rxrpc/utils.c @@ -17,28 +17,17 @@ /* * Fill out a peer address from a socket buffer containing a packet. */ -int rxrpc_extract_addr_from_skb(struct rxrpc_local *local, - struct sockaddr_rxrpc *srx, - struct sk_buff *skb) +int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb) { memset(srx, 0, sizeof(*srx)); switch (ntohs(skb->protocol)) { case ETH_P_IP: - if (local->srx.transport.family == AF_INET6) { - srx->transport_type = SOCK_DGRAM; - srx->transport_len = sizeof(srx->transport.sin6); - srx->transport.sin6.sin6_family = AF_INET6; - srx->transport.sin6.sin6_port = udp_hdr(skb)->source; - srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff); - srx->transport.sin6.sin6_addr.s6_addr32[3] = ip_hdr(skb)->saddr; - } else { - srx->transport_type = SOCK_DGRAM; - srx->transport_len = sizeof(srx->transport.sin); - srx->transport.sin.sin_family = AF_INET; - srx->transport.sin.sin_port = udp_hdr(skb)->source; - srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; - } + srx->transport_type = SOCK_DGRAM; + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.sin.sin_family = AF_INET; + srx->transport.sin.sin_port = udp_hdr(skb)->source; + srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; return 0; #ifdef CONFIG_AF_RXRPC_IPV6 diff --git a/net/sched/Kconfig b/net/sched/Kconfig index e95741388311..1b9afdee5ba9 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -194,6 +194,17 @@ config NET_SCH_ETF To compile this code as a module, choose M here: the module will be called sch_etf. +config NET_SCH_TAPRIO + tristate "Time Aware Priority (taprio) Scheduler" + help + Say Y here if you want to use the Time Aware Priority (taprio) packet + scheduling algorithm. + + See the top of for more details. + + To compile this code as a module, choose M here: the + module will be called sch_taprio. + config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index f0403f49edcb..8a40431d7b5c 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -57,6 +57,7 @@ obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o +obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index e12f8ef7baa4..9c1b0729aebf 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -81,6 +81,7 @@ static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, static void free_tcf(struct tc_action *p) { free_percpu(p->cpu_bstats); + free_percpu(p->cpu_bstats_hw); free_percpu(p->cpu_qstats); tcf_set_action_cookie(&p->act_cookie, NULL); @@ -103,11 +104,11 @@ static int __tcf_action_put(struct tc_action *p, bool bind) { struct tcf_idrinfo *idrinfo = p->idrinfo; - if (refcount_dec_and_lock(&p->tcfa_refcnt, &idrinfo->lock)) { + if (refcount_dec_and_mutex_lock(&p->tcfa_refcnt, &idrinfo->lock)) { if (bind) atomic_dec(&p->tcfa_bindcnt); idr_remove(&idrinfo->action_idr, p->tcfa_index); - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); tcf_action_cleanup(p); return 1; @@ -199,7 +200,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct tc_action *p; unsigned long id = 1; - spin_lock(&idrinfo->lock); + mutex_lock(&idrinfo->lock); s_i = cb->args[0]; @@ -234,7 +235,7 @@ done: if (index >= 0) cb->args[0] = index + 1; - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); if (n_i) { if (act_flags & TCA_FLAG_LARGE_DUMP_ON) cb->args[1] = n_i; @@ -246,6 +247,20 @@ nla_put_failure: goto done; } +static int tcf_idr_release_unsafe(struct tc_action *p) +{ + if (atomic_read(&p->tcfa_bindcnt) > 0) + return -EPERM; + + if (refcount_dec_and_test(&p->tcfa_refcnt)) { + idr_remove(&p->idrinfo->action_idr, p->tcfa_index); + tcf_action_cleanup(p); + return ACT_P_DELETED; + } + + return 0; +} + static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, const struct tc_action_ops *ops) { @@ -262,15 +277,19 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, if (nla_put_string(skb, TCA_KIND, ops->kind)) goto nla_put_failure; + mutex_lock(&idrinfo->lock); idr_for_each_entry_ul(idr, p, id) { - ret = __tcf_idr_release(p, false, true); + ret = tcf_idr_release_unsafe(p); if (ret == ACT_P_DELETED) { module_put(ops->owner); n_i++; } else if (ret < 0) { + mutex_unlock(&idrinfo->lock); goto nla_put_failure; } } + mutex_unlock(&idrinfo->lock); + if (nla_put_u32(skb, TCA_FCNT, n_i)) goto nla_put_failure; nla_nest_end(skb, nest); @@ -305,13 +324,13 @@ int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index) struct tcf_idrinfo *idrinfo = tn->idrinfo; struct tc_action *p; - spin_lock(&idrinfo->lock); + mutex_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); if (IS_ERR(p)) p = NULL; else if (p) refcount_inc(&p->tcfa_refcnt); - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); if (p) { *a = p; @@ -326,10 +345,10 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) struct tc_action *p; int ret = 0; - spin_lock(&idrinfo->lock); + mutex_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); if (!p) { - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); return -ENOENT; } @@ -339,7 +358,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) WARN_ON(p != idr_remove(&idrinfo->action_idr, p->tcfa_index)); - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); tcf_action_cleanup(p); module_put(owner); @@ -350,7 +369,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) ret = -EPERM; } - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); return ret; } @@ -372,9 +391,12 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); if (!p->cpu_bstats) goto err1; + p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + if (!p->cpu_bstats_hw) + goto err2; p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); if (!p->cpu_qstats) - goto err2; + goto err3; } spin_lock_init(&p->tcfa_lock); p->tcfa_index = index; @@ -386,15 +408,17 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, &p->tcfa_rate_est, &p->tcfa_lock, NULL, est); if (err) - goto err3; + goto err4; } p->idrinfo = idrinfo; p->ops = ops; *a = p; return 0; -err3: +err4: free_percpu(p->cpu_qstats); +err3: + free_percpu(p->cpu_bstats_hw); err2: free_percpu(p->cpu_bstats); err1: @@ -407,10 +431,10 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a) { struct tcf_idrinfo *idrinfo = tn->idrinfo; - spin_lock(&idrinfo->lock); + mutex_lock(&idrinfo->lock); /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index))); - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); } EXPORT_SYMBOL(tcf_idr_insert); @@ -420,10 +444,10 @@ void tcf_idr_cleanup(struct tc_action_net *tn, u32 index) { struct tcf_idrinfo *idrinfo = tn->idrinfo; - spin_lock(&idrinfo->lock); + mutex_lock(&idrinfo->lock); /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index))); - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); } EXPORT_SYMBOL(tcf_idr_cleanup); @@ -441,14 +465,14 @@ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, int ret; again: - spin_lock(&idrinfo->lock); + mutex_lock(&idrinfo->lock); if (*index) { p = idr_find(&idrinfo->action_idr, *index); if (IS_ERR(p)) { /* This means that another process allocated * index but did not assign the pointer yet. */ - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); goto again; } @@ -461,7 +485,7 @@ again: } else { *a = NULL; ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, - *index, GFP_ATOMIC); + *index, GFP_KERNEL); if (!ret) idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY), *index); @@ -470,12 +494,12 @@ again: *index = 1; *a = NULL; ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, - UINT_MAX, GFP_ATOMIC); + UINT_MAX, GFP_KERNEL); if (!ret) idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY), *index); } - spin_unlock(&idrinfo->lock); + mutex_unlock(&idrinfo->lock); return ret; } EXPORT_SYMBOL(tcf_idr_check_alloc); @@ -979,6 +1003,8 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, goto errout; if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || + gnet_stats_copy_basic_hw(NULL, &d, p->cpu_bstats_hw, + &p->tcfa_bstats_hw) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, &p->tcfa_qstats, @@ -1073,12 +1099,14 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, err = -EINVAL; ops = tc_lookup_action(tb[TCA_ACT_KIND]); if (!ops) { /* could happen in batch of actions */ - NL_SET_ERR_MSG(extack, "Specified TC action not found"); + NL_SET_ERR_MSG(extack, "Specified TC action kind not found"); goto err_out; } err = -ENOENT; - if (ops->lookup(net, &a, index, extack) == 0) + if (ops->lookup(net, &a, index) == 0) { + NL_SET_ERR_MSG(extack, "TC action with specified index not found"); goto err_mod; + } module_put(ops->owner); return a; @@ -1424,7 +1452,7 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) u32 act_count = 0; ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX, - tcaa_policy, NULL); + tcaa_policy, cb->extack); if (ret < 0) return ret; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 0c68bc9cf0b4..c7633843e223 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -387,8 +387,7 @@ static int tcf_bpf_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, bpf_net_id); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 6f0f273f1139..8475913f2070 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -143,8 +143,10 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, return -EEXIST; } /* replacing action and zone */ + spin_lock_bh(&ci->tcf_lock); ci->tcf_action = parm->action; ci->zone = parm->zone; + spin_unlock_bh(&ci->tcf_lock); ret = 0; } @@ -156,16 +158,16 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_connmark_info *ci = to_connmark(a); - struct tc_connmark opt = { .index = ci->tcf_index, .refcnt = refcount_read(&ci->tcf_refcnt) - ref, .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, - .action = ci->tcf_action, - .zone = ci->zone, }; struct tcf_t t; + spin_lock_bh(&ci->tcf_lock); + opt.action = ci->tcf_action; + opt.zone = ci->zone; if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -173,9 +175,12 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t, TCA_CONNMARK_PAD)) goto nla_put_failure; + spin_unlock_bh(&ci->tcf_lock); return skb->len; + nla_put_failure: + spin_unlock_bh(&ci->tcf_lock); nlmsg_trim(skb, b); return -1; } @@ -190,8 +195,7 @@ static int tcf_connmark_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, connmark_net_id); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index b8a67ae3105a..3dc25b7806d7 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -646,8 +646,7 @@ static int tcf_csum_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, csum_net_id); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index cd1d9bd32ef9..b61c20ebb314 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -88,6 +88,11 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, p_parm = nla_data(tb[TCA_GACT_PROB]); if (p_parm->ptype >= MAX_RAND) return -EINVAL; + if (TC_ACT_EXT_CMP(p_parm->paction, TC_ACT_GOTO_CHAIN)) { + NL_SET_ERR_MSG(extack, + "goto chain not allowed on fallback"); + return -EINVAL; + } } #endif @@ -157,7 +162,7 @@ static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a, } static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse) + u64 lastuse, bool hw) { struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); @@ -168,6 +173,10 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, if (action == TC_ACT_SHOT) this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; + if (hw) + _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw), + bytes, packets); + tm->lastuse = max_t(u64, tm->lastuse, lastuse); } @@ -222,8 +231,7 @@ static int tcf_gact_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, gact_net_id); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 06a3d4801878..30b63fa23ee2 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -855,8 +855,7 @@ static int tcf_ife_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, ife_net_id); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 23273b5303fd..8af6c11d2482 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -135,7 +135,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, } td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); - if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) { + if (nla_len(tb[TCA_IPT_TARG]) != td->u.target_size) { if (exists) tcf_idr_release(*a, bind); else @@ -329,8 +329,7 @@ static int tcf_ipt_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, ipt_net_id); @@ -379,8 +378,7 @@ static int tcf_xt_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, xt_net_id); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 8bf66d0a6800..1dae5f2b358f 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -283,12 +283,15 @@ out: } static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse) + u64 lastuse, bool hw) { struct tcf_mirred *m = to_mirred(a); struct tcf_t *tm = &m->tcf_tm; _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); + if (hw) + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), + bytes, packets); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } @@ -338,8 +341,7 @@ static int tcf_mirred_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, mirred_net_id); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 4313aa102440..c5c1e23add77 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -256,28 +256,31 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, unsigned char *b = skb_tail_pointer(skb); struct tcf_nat *p = to_tcf_nat(a); struct tc_nat opt = { - .old_addr = p->old_addr, - .new_addr = p->new_addr, - .mask = p->mask, - .flags = p->flags, - .index = p->tcf_index, - .action = p->tcf_action, .refcnt = refcount_read(&p->tcf_refcnt) - ref, .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; struct tcf_t t; + spin_lock_bh(&p->tcf_lock); + opt.old_addr = p->old_addr; + opt.new_addr = p->new_addr; + opt.mask = p->mask; + opt.flags = p->flags; + opt.action = p->tcf_action; + if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD)) goto nla_put_failure; + spin_unlock_bh(&p->tcf_lock); return skb->len; nla_put_failure: + spin_unlock_bh(&p->tcf_lock); nlmsg_trim(skb, b); return -1; } @@ -292,8 +295,7 @@ static int tcf_nat_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, nat_net_id); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index ad99a99f11f6..da3dd0f68cc2 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -460,8 +460,7 @@ static int tcf_pedit_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, pedit_net_id); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 5d8bfa878477..052855d47354 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -22,8 +22,7 @@ #include #include -struct tcf_police { - struct tc_action common; +struct tcf_police_params { int tcfp_result; u32 tcfp_ewma_rate; s64 tcfp_burst; @@ -36,6 +35,12 @@ struct tcf_police { bool rate_present; struct psched_ratecfg peak; bool peak_present; + struct rcu_head rcu; +}; + +struct tcf_police { + struct tc_action common; + struct tcf_police_params __rcu *params; }; #define to_police(pc) ((struct tcf_police *)pc) @@ -84,6 +89,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; struct tc_action_net *tn = net_generic(net, police_net_id); + struct tcf_police_params *new; bool exists = false; int size; @@ -110,7 +116,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, NULL, a, - &act_police_ops, bind, false); + &act_police_ops, bind, true); if (ret) { tcf_idr_cleanup(tn, parm->index); return ret; @@ -137,7 +143,8 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } if (est) { - err = gen_replace_estimator(&police->tcf_bstats, NULL, + err = gen_replace_estimator(&police->tcf_bstats, + police->common.cpu_bstats, &police->tcf_rate_est, &police->tcf_lock, NULL, est); @@ -150,50 +157,68 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, goto failure; } - spin_lock_bh(&police->tcf_lock); + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (unlikely(!new)) { + err = -ENOMEM; + goto failure; + } + /* No failure allowed after this point */ - police->tcfp_mtu = parm->mtu; - if (police->tcfp_mtu == 0) { - police->tcfp_mtu = ~0; + new->tcfp_mtu = parm->mtu; + if (!new->tcfp_mtu) { + new->tcfp_mtu = ~0; if (R_tab) - police->tcfp_mtu = 255 << R_tab->rate.cell_log; + new->tcfp_mtu = 255 << R_tab->rate.cell_log; } if (R_tab) { - police->rate_present = true; - psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0); + new->rate_present = true; + psched_ratecfg_precompute(&new->rate, &R_tab->rate, 0); qdisc_put_rtab(R_tab); } else { - police->rate_present = false; + new->rate_present = false; } if (P_tab) { - police->peak_present = true; - psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0); + new->peak_present = true; + psched_ratecfg_precompute(&new->peak, &P_tab->rate, 0); qdisc_put_rtab(P_tab); } else { - police->peak_present = false; + new->peak_present = false; } - if (tb[TCA_POLICE_RESULT]) - police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); - police->tcfp_burst = PSCHED_TICKS2NS(parm->burst); - police->tcfp_toks = police->tcfp_burst; - if (police->peak_present) { - police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak, - police->tcfp_mtu); - police->tcfp_ptoks = police->tcfp_mtu_ptoks; + new->tcfp_burst = PSCHED_TICKS2NS(parm->burst); + new->tcfp_toks = new->tcfp_burst; + if (new->peak_present) { + new->tcfp_mtu_ptoks = (s64)psched_l2t_ns(&new->peak, + new->tcfp_mtu); + new->tcfp_ptoks = new->tcfp_mtu_ptoks; } - police->tcf_action = parm->action; if (tb[TCA_POLICE_AVRATE]) - police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); + new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); + + if (tb[TCA_POLICE_RESULT]) { + new->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); + if (TC_ACT_EXT_CMP(new->tcfp_result, TC_ACT_GOTO_CHAIN)) { + NL_SET_ERR_MSG(extack, + "goto chain not allowed on fallback"); + err = -EINVAL; + goto failure; + } + } + spin_lock_bh(&police->tcf_lock); + new->tcfp_t_c = ktime_get_ns(); + police->tcf_action = parm->action; + rcu_swap_protected(police->params, + new, + lockdep_is_held(&police->tcf_lock)); spin_unlock_bh(&police->tcf_lock); - if (ret != ACT_P_CREATED) - return ret; - police->tcfp_t_c = ktime_get_ns(); - tcf_idr_insert(tn, *a); + if (new) + kfree_rcu(new, rcu); + if (ret == ACT_P_CREATED) + tcf_idr_insert(tn, *a); return ret; failure: @@ -207,64 +232,69 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_police *police = to_police(a); - s64 now; - s64 toks; - s64 ptoks = 0; + struct tcf_police_params *p; + s64 now, toks, ptoks = 0; + int ret; - spin_lock(&police->tcf_lock); - - bstats_update(&police->tcf_bstats, skb); tcf_lastuse_update(&police->tcf_tm); + bstats_cpu_update(this_cpu_ptr(police->common.cpu_bstats), skb); + + ret = READ_ONCE(police->tcf_action); + p = rcu_dereference_bh(police->params); - if (police->tcfp_ewma_rate) { + if (p->tcfp_ewma_rate) { struct gnet_stats_rate_est64 sample; if (!gen_estimator_read(&police->tcf_rate_est, &sample) || - sample.bps >= police->tcfp_ewma_rate) { - police->tcf_qstats.overlimits++; - if (police->tcf_action == TC_ACT_SHOT) - police->tcf_qstats.drops++; - spin_unlock(&police->tcf_lock); - return police->tcf_action; - } + sample.bps >= p->tcfp_ewma_rate) + goto inc_overlimits; } - if (qdisc_pkt_len(skb) <= police->tcfp_mtu) { - if (!police->rate_present) { - spin_unlock(&police->tcf_lock); - return police->tcfp_result; + if (qdisc_pkt_len(skb) <= p->tcfp_mtu) { + if (!p->rate_present) { + ret = p->tcfp_result; + goto end; } now = ktime_get_ns(); - toks = min_t(s64, now - police->tcfp_t_c, - police->tcfp_burst); - if (police->peak_present) { - ptoks = toks + police->tcfp_ptoks; - if (ptoks > police->tcfp_mtu_ptoks) - ptoks = police->tcfp_mtu_ptoks; - ptoks -= (s64) psched_l2t_ns(&police->peak, - qdisc_pkt_len(skb)); + toks = min_t(s64, now - p->tcfp_t_c, p->tcfp_burst); + if (p->peak_present) { + ptoks = toks + p->tcfp_ptoks; + if (ptoks > p->tcfp_mtu_ptoks) + ptoks = p->tcfp_mtu_ptoks; + ptoks -= (s64)psched_l2t_ns(&p->peak, + qdisc_pkt_len(skb)); } - toks += police->tcfp_toks; - if (toks > police->tcfp_burst) - toks = police->tcfp_burst; - toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb)); + toks += p->tcfp_toks; + if (toks > p->tcfp_burst) + toks = p->tcfp_burst; + toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb)); if ((toks|ptoks) >= 0) { - police->tcfp_t_c = now; - police->tcfp_toks = toks; - police->tcfp_ptoks = ptoks; - if (police->tcfp_result == TC_ACT_SHOT) - police->tcf_qstats.drops++; - spin_unlock(&police->tcf_lock); - return police->tcfp_result; + p->tcfp_t_c = now; + p->tcfp_toks = toks; + p->tcfp_ptoks = ptoks; + ret = p->tcfp_result; + goto inc_drops; } } - police->tcf_qstats.overlimits++; - if (police->tcf_action == TC_ACT_SHOT) - police->tcf_qstats.drops++; - spin_unlock(&police->tcf_lock); - return police->tcf_action; +inc_overlimits: + qstats_overlimit_inc(this_cpu_ptr(police->common.cpu_qstats)); +inc_drops: + if (ret == TC_ACT_SHOT) + qstats_drop_inc(this_cpu_ptr(police->common.cpu_qstats)); +end: + return ret; +} + +static void tcf_police_cleanup(struct tc_action *a) +{ + struct tcf_police *police = to_police(a); + struct tcf_police_params *p; + + p = rcu_dereference_protected(police->params, 1); + if (p) + kfree_rcu(p, rcu); } static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, @@ -272,6 +302,7 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_police *police = to_police(a); + struct tcf_police_params *p; struct tc_police opt = { .index = police->tcf_index, .refcnt = refcount_read(&police->tcf_refcnt) - ref, @@ -281,19 +312,21 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, spin_lock_bh(&police->tcf_lock); opt.action = police->tcf_action; - opt.mtu = police->tcfp_mtu; - opt.burst = PSCHED_NS2TICKS(police->tcfp_burst); - if (police->rate_present) - psched_ratecfg_getrate(&opt.rate, &police->rate); - if (police->peak_present) - psched_ratecfg_getrate(&opt.peakrate, &police->peak); + p = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + opt.mtu = p->tcfp_mtu; + opt.burst = PSCHED_NS2TICKS(p->tcfp_burst); + if (p->rate_present) + psched_ratecfg_getrate(&opt.rate, &p->rate); + if (p->peak_present) + psched_ratecfg_getrate(&opt.peakrate, &p->peak); if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) goto nla_put_failure; - if (police->tcfp_result && - nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result)) + if (p->tcfp_result && + nla_put_u32(skb, TCA_POLICE_RESULT, p->tcfp_result)) goto nla_put_failure; - if (police->tcfp_ewma_rate && - nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate)) + if (p->tcfp_ewma_rate && + nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate)) goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install); @@ -312,8 +345,7 @@ nla_put_failure: return -1; } -static int tcf_police_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_police_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, police_net_id); @@ -333,6 +365,7 @@ static struct tc_action_ops act_police_ops = { .init = tcf_police_init, .walk = tcf_police_walker, .lookup = tcf_police_search, + .cleanup = tcf_police_cleanup, .size = sizeof(struct tcf_police), }; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 6b67aa13d2dd..1a0c682fd734 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -224,8 +224,7 @@ static int tcf_sample_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, sample_net_id); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 52400d49f81f..902957beceb3 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -188,8 +188,7 @@ static int tcf_simp_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, simp_net_id); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 73e44ce2a883..64dba3708fce 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -99,7 +99,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); - struct tcf_skbedit_params *params_old, *params_new; + struct tcf_skbedit_params *params_new; struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tc_skbedit *parm; struct tcf_skbedit *d; @@ -187,8 +187,6 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, } } - ASSERT_RTNL(); - params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { if (ret == ACT_P_CREATED) @@ -210,11 +208,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (flags & SKBEDIT_F_MASK) params_new->mask = *mask; + spin_lock_bh(&d->tcf_lock); d->tcf_action = parm->action; - params_old = rtnl_dereference(d->params); - rcu_assign_pointer(d->params, params_new); - if (params_old) - kfree_rcu(params_old, rcu); + rcu_swap_protected(d->params, params_new, + lockdep_is_held(&d->tcf_lock)); + spin_unlock_bh(&d->tcf_lock); + if (params_new) + kfree_rcu(params_new, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -231,12 +231,14 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, - .action = d->tcf_action, }; u64 pure_flags = 0; struct tcf_t t; - params = rtnl_dereference(d->params); + spin_lock_bh(&d->tcf_lock); + params = rcu_dereference_protected(d->params, + lockdep_is_held(&d->tcf_lock)); + opt.action = d->tcf_action; if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -264,9 +266,12 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; + spin_unlock_bh(&d->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } @@ -291,8 +296,7 @@ static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 588077fafd6c..59710a183bd3 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -251,8 +251,7 @@ static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 681f6f04e7da..4cca8f274662 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -548,8 +548,7 @@ static int tunnel_key_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 033d273afe50..ba677d54a7af 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -288,8 +288,7 @@ static int tcf_vlan_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index, - struct netlink_ext_ack *extack) +static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, vlan_net_id); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 0a75cb2e5e7b..f427a1e00e7e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -31,6 +31,8 @@ #include #include +extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; + /* The list of all installed classifier types */ static LIST_HEAD(tcf_proto_base); @@ -240,8 +242,8 @@ static void tcf_chain_destroy(struct tcf_chain *chain) if (!chain->index) block->chain0.chain = NULL; kfree(chain); - if (list_empty(&block->chain_list) && block->refcnt == 0) - kfree(block); + if (list_empty(&block->chain_list) && !refcount_read(&block->refcnt)) + kfree_rcu(block, rcu); } static void tcf_chain_hold(struct tcf_chain *chain) @@ -473,6 +475,7 @@ tcf_chain0_head_change_cb_del(struct tcf_block *block, } struct tcf_net { + spinlock_t idr_lock; /* Protects idr */ struct idr idr; }; @@ -482,16 +485,25 @@ static int tcf_block_insert(struct tcf_block *block, struct net *net, struct netlink_ext_ack *extack) { struct tcf_net *tn = net_generic(net, tcf_net_id); + int err; + + idr_preload(GFP_KERNEL); + spin_lock(&tn->idr_lock); + err = idr_alloc_u32(&tn->idr, block, &block->index, block->index, + GFP_NOWAIT); + spin_unlock(&tn->idr_lock); + idr_preload_end(); - return idr_alloc_u32(&tn->idr, block, &block->index, block->index, - GFP_KERNEL); + return err; } static void tcf_block_remove(struct tcf_block *block, struct net *net) { struct tcf_net *tn = net_generic(net, tcf_net_id); + spin_lock(&tn->idr_lock); idr_remove(&tn->idr, block->index); + spin_unlock(&tn->idr_lock); } static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, @@ -510,7 +522,7 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, INIT_LIST_HEAD(&block->owner_list); INIT_LIST_HEAD(&block->chain0.filter_chain_list); - block->refcnt = 1; + refcount_set(&block->refcnt, 1); block->net = net; block->index = block_index; @@ -527,6 +539,78 @@ static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index) return idr_find(&tn->idr, block_index); } +static struct tcf_block *tcf_block_refcnt_get(struct net *net, u32 block_index) +{ + struct tcf_block *block; + + rcu_read_lock(); + block = tcf_block_lookup(net, block_index); + if (block && !refcount_inc_not_zero(&block->refcnt)) + block = NULL; + rcu_read_unlock(); + + return block; +} + +static void tcf_block_flush_all_chains(struct tcf_block *block) +{ + struct tcf_chain *chain; + + /* Hold a refcnt for all chains, so that they don't disappear + * while we are iterating. + */ + list_for_each_entry(chain, &block->chain_list, list) + tcf_chain_hold(chain); + + list_for_each_entry(chain, &block->chain_list, list) + tcf_chain_flush(chain); +} + +static void tcf_block_put_all_chains(struct tcf_block *block) +{ + struct tcf_chain *chain, *tmp; + + /* At this point, all the chains should have refcnt >= 1. */ + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) { + tcf_chain_put_explicitly_created(chain); + tcf_chain_put(chain); + } +} + +static void __tcf_block_put(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ + if (refcount_dec_and_test(&block->refcnt)) { + /* Flushing/putting all chains will cause the block to be + * deallocated when last chain is freed. However, if chain_list + * is empty, block has to be manually deallocated. After block + * reference counter reached 0, it is no longer possible to + * increment it or add new chains to block. + */ + bool free_block = list_empty(&block->chain_list); + + if (tcf_block_shared(block)) + tcf_block_remove(block, block->net); + if (!free_block) + tcf_block_flush_all_chains(block); + + if (q) + tcf_block_offload_unbind(block, q, ei); + + if (free_block) + kfree_rcu(block, rcu); + else + tcf_block_put_all_chains(block); + } else if (q) { + tcf_block_offload_unbind(block, q, ei); + } +} + +static void tcf_block_refcnt_put(struct tcf_block *block) +{ + __tcf_block_put(block, NULL, NULL); +} + /* Find tcf block. * Set q, parent, cl when appropriate. */ @@ -537,9 +621,10 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, struct netlink_ext_ack *extack) { struct tcf_block *block; + int err = 0; if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) { - block = tcf_block_lookup(net, block_index); + block = tcf_block_refcnt_get(net, block_index); if (!block) { NL_SET_ERR_MSG(extack, "Block of given index was not found"); return ERR_PTR(-EINVAL); @@ -548,55 +633,106 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, const struct Qdisc_class_ops *cops; struct net_device *dev; + rcu_read_lock(); + /* Find link */ - dev = __dev_get_by_index(net, ifindex); - if (!dev) + dev = dev_get_by_index_rcu(net, ifindex); + if (!dev) { + rcu_read_unlock(); return ERR_PTR(-ENODEV); + } /* Find qdisc */ if (!*parent) { *q = dev->qdisc; *parent = (*q)->handle; } else { - *q = qdisc_lookup(dev, TC_H_MAJ(*parent)); + *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent)); if (!*q) { NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); - return ERR_PTR(-EINVAL); + err = -EINVAL; + goto errout_rcu; } } + *q = qdisc_refcount_inc_nz(*q); + if (!*q) { + NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); + err = -EINVAL; + goto errout_rcu; + } + /* Is it classful? */ cops = (*q)->ops->cl_ops; if (!cops) { NL_SET_ERR_MSG(extack, "Qdisc not classful"); - return ERR_PTR(-EINVAL); + err = -EINVAL; + goto errout_rcu; } if (!cops->tcf_block) { NL_SET_ERR_MSG(extack, "Class doesn't support blocks"); - return ERR_PTR(-EOPNOTSUPP); + err = -EOPNOTSUPP; + goto errout_rcu; } + /* At this point we know that qdisc is not noop_qdisc, + * which means that qdisc holds a reference to net_device + * and we hold a reference to qdisc, so it is safe to release + * rcu read lock. + */ + rcu_read_unlock(); + /* Do we search for filter, attached to class? */ if (TC_H_MIN(*parent)) { *cl = cops->find(*q, *parent); if (*cl == 0) { NL_SET_ERR_MSG(extack, "Specified class doesn't exist"); - return ERR_PTR(-ENOENT); + err = -ENOENT; + goto errout_qdisc; } } /* And the last stroke */ block = cops->tcf_block(*q, *cl, extack); - if (!block) - return ERR_PTR(-EINVAL); + if (!block) { + err = -EINVAL; + goto errout_qdisc; + } if (tcf_block_shared(block)) { NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters"); - return ERR_PTR(-EOPNOTSUPP); + err = -EOPNOTSUPP; + goto errout_qdisc; } + + /* Always take reference to block in order to support execution + * of rules update path of cls API without rtnl lock. Caller + * must release block when it is finished using it. 'if' block + * of this conditional obtain reference to block by calling + * tcf_block_refcnt_get(). + */ + refcount_inc(&block->refcnt); } return block; + +errout_rcu: + rcu_read_unlock(); +errout_qdisc: + if (*q) { + qdisc_put(*q); + *q = NULL; + } + return ERR_PTR(err); +} + +static void tcf_block_release(struct Qdisc *q, struct tcf_block *block) +{ + if (!IS_ERR_OR_NULL(block)) + tcf_block_refcnt_put(block); + + if (q) + qdisc_put(q); } struct tcf_block_owner_item { @@ -664,21 +800,16 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, { struct net *net = qdisc_net(q); struct tcf_block *block = NULL; - bool created = false; int err; - if (ei->block_index) { + if (ei->block_index) /* block_index not 0 means the shared block is requested */ - block = tcf_block_lookup(net, ei->block_index); - if (block) - block->refcnt++; - } + block = tcf_block_refcnt_get(net, ei->block_index); if (!block) { block = tcf_block_create(net, q, ei->block_index, extack); if (IS_ERR(block)) return PTR_ERR(block); - created = true; if (tcf_block_shared(block)) { err = tcf_block_insert(block, net, extack); if (err) @@ -708,14 +839,8 @@ err_block_offload_bind: err_chain0_head_change_cb_add: tcf_block_owner_del(block, q, ei->binder_type); err_block_owner_add: - if (created) { - if (tcf_block_shared(block)) - tcf_block_remove(block, net); err_block_insert: - kfree(block); - } else { - block->refcnt--; - } + tcf_block_refcnt_put(block); return err; } EXPORT_SYMBOL(tcf_block_get_ext); @@ -747,42 +872,12 @@ EXPORT_SYMBOL(tcf_block_get); void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { - struct tcf_chain *chain, *tmp; - if (!block) return; tcf_chain0_head_change_cb_del(block, ei); tcf_block_owner_del(block, q, ei->binder_type); - if (block->refcnt == 1) { - if (tcf_block_shared(block)) - tcf_block_remove(block, block->net); - - /* Hold a refcnt for all chains, so that they don't disappear - * while we are iterating. - */ - list_for_each_entry(chain, &block->chain_list, list) - tcf_chain_hold(chain); - - list_for_each_entry(chain, &block->chain_list, list) - tcf_chain_flush(chain); - } - - tcf_block_offload_unbind(block, q, ei); - - if (block->refcnt == 1) { - /* At this point, all the chains should have refcnt >= 1. */ - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) { - tcf_chain_put_explicitly_created(chain); - tcf_chain_put(chain); - } - - block->refcnt--; - if (list_empty(&block->chain_list)) - kfree(block); - } else { - block->refcnt--; - } + __tcf_block_put(block, q, ei); } EXPORT_SYMBOL(tcf_block_put_ext); @@ -1211,7 +1306,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, replay: tp_created = 0; - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; @@ -1332,6 +1427,7 @@ replay: errout: if (chain) tcf_chain_put(chain); + tcf_block_release(q, block); if (err == -EAGAIN) /* Replay the request. */ goto replay; @@ -1360,7 +1456,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; @@ -1453,6 +1549,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, errout: if (chain) tcf_chain_put(chain); + tcf_block_release(q, block); return err; } @@ -1475,7 +1572,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *fh = NULL; int err; - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; @@ -1538,6 +1635,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, errout: if (chain) tcf_chain_put(chain); + tcf_block_release(q, block); return err; } @@ -1631,12 +1729,13 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; - err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, + cb->extack); if (err) return err; if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { - block = tcf_block_lookup(net, tcm->tcm_block_index); + block = tcf_block_refcnt_get(net, tcm->tcm_block_index); if (!block) goto out; /* If we work with block index, q is NULL and parent value @@ -1695,6 +1794,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) } } + if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) + tcf_block_refcnt_put(block); cb->args[0] = index; out: @@ -1838,7 +1939,7 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, return -EPERM; replay: - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; @@ -1854,7 +1955,8 @@ replay: chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); - return -EINVAL; + err = -EINVAL; + goto errout_block; } chain = tcf_chain_lookup(block, chain_index); if (n->nlmsg_type == RTM_NEWCHAIN) { @@ -1866,23 +1968,27 @@ replay: tcf_chain_hold(chain); } else { NL_SET_ERR_MSG(extack, "Filter chain already exists"); - return -EEXIST; + err = -EEXIST; + goto errout_block; } } else { if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); - return -ENOENT; + err = -ENOENT; + goto errout_block; } chain = tcf_chain_create(block, chain_index); if (!chain) { NL_SET_ERR_MSG(extack, "Failed to create filter chain"); - return -ENOMEM; + err = -ENOMEM; + goto errout_block; } } } else { if (!chain || tcf_chain_held_by_acts_only(chain)) { NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); - return -EINVAL; + err = -EINVAL; + goto errout_block; } tcf_chain_hold(chain); } @@ -1926,6 +2032,8 @@ replay: errout: tcf_chain_put(chain); +errout_block: + tcf_block_release(q, block); if (err == -EAGAIN) /* Replay the request. */ goto replay; @@ -1949,12 +2057,13 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; - err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, + cb->extack); if (err) return err; if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { - block = tcf_block_lookup(net, tcm->tcm_block_index); + block = tcf_block_refcnt_get(net, tcm->tcm_block_index); if (!block) goto out; /* If we work with block index, q is NULL and parent value @@ -2021,6 +2130,8 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) index++; } + if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) + tcf_block_refcnt_put(block); cb->args[0] = index; out: @@ -2213,6 +2324,7 @@ static __net_init int tcf_net_init(struct net *net) { struct tcf_net *tn = net_generic(net, tcf_net_id); + spin_lock_init(&tn->idr_lock); idr_init(&tn->idr); return 0; } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 6fd9bdd93796..9aada2d0ef06 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -98,7 +98,7 @@ struct cls_fl_filter { struct list_head list; u32 handle; u32 flags; - unsigned int in_hw_count; + u32 in_hw_count; struct rcu_work rwork; struct net_device *hw_dev; }; @@ -993,7 +993,7 @@ static int fl_init_mask_hashtable(struct fl_flow_mask *mask) } #define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) -#define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member)) +#define FL_KEY_MEMBER_SIZE(member) FIELD_SIZEOF(struct fl_flow_key, member) #define FL_KEY_IS_MASKED(mask, member) \ memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \ @@ -1880,6 +1880,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count)) + goto nla_put_failure; + if (tcf_exts_dump(skb, &f->exts)) goto nla_put_failure; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index f218ccf1e2d9..4b28fd44576d 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -68,7 +68,6 @@ struct tc_u_knode { u32 mask; u32 __percpu *pcpu_success; #endif - struct tcf_proto *tp; struct rcu_work rwork; /* The 'sel' field MUST be the last field in structure to allow for * tc_u32_keys allocated at end of structure. @@ -80,10 +79,10 @@ struct tc_u_hnode { struct tc_u_hnode __rcu *next; u32 handle; u32 prio; - struct tc_u_common *tp_c; int refcnt; unsigned int divisor; struct idr handle_idr; + bool is_root; struct rcu_head rcu; u32 flags; /* The 'ht' field MUST be the last field in structure to allow for @@ -98,7 +97,7 @@ struct tc_u_common { int refcnt; struct idr handle_idr; struct hlist_node hnode; - struct rcu_head rcu; + long knodes; }; static inline unsigned int u32_hash_fold(__be32 key, @@ -344,19 +343,16 @@ static void *tc_u_common_ptr(const struct tcf_proto *tp) return block->q; } -static unsigned int tc_u_hash(const struct tcf_proto *tp) +static struct hlist_head *tc_u_hash(void *key) { - return hash_ptr(tc_u_common_ptr(tp), U32_HASH_SHIFT); + return tc_u_common_hash + hash_ptr(key, U32_HASH_SHIFT); } -static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp) +static struct tc_u_common *tc_u_common_find(void *key) { struct tc_u_common *tc; - unsigned int h; - - h = tc_u_hash(tp); - hlist_for_each_entry(tc, &tc_u_common_hash[h], hnode) { - if (tc->ptr == tc_u_common_ptr(tp)) + hlist_for_each_entry(tc, tc_u_hash(key), hnode) { + if (tc->ptr == key) return tc; } return NULL; @@ -365,10 +361,8 @@ static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp) static int u32_init(struct tcf_proto *tp) { struct tc_u_hnode *root_ht; - struct tc_u_common *tp_c; - unsigned int h; - - tp_c = tc_u_common_find(tp); + void *key = tc_u_common_ptr(tp); + struct tc_u_common *tp_c = tc_u_common_find(key); root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL); if (root_ht == NULL) @@ -377,6 +371,7 @@ static int u32_init(struct tcf_proto *tp) root_ht->refcnt++; root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000; root_ht->prio = tp->prio; + root_ht->is_root = true; idr_init(&root_ht->handle_idr); if (tp_c == NULL) { @@ -385,26 +380,24 @@ static int u32_init(struct tcf_proto *tp) kfree(root_ht); return -ENOBUFS; } - tp_c->ptr = tc_u_common_ptr(tp); + tp_c->ptr = key; INIT_HLIST_NODE(&tp_c->hnode); idr_init(&tp_c->handle_idr); - h = tc_u_hash(tp); - hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]); + hlist_add_head(&tp_c->hnode, tc_u_hash(key)); } tp_c->refcnt++; RCU_INIT_POINTER(root_ht->next, tp_c->hlist); rcu_assign_pointer(tp_c->hlist, root_ht); - root_ht->tp_c = tp_c; + root_ht->refcnt++; rcu_assign_pointer(tp->root, root_ht); tp->data = tp_c; return 0; } -static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n, - bool free_pf) +static int u32_destroy_key(struct tc_u_knode *n, bool free_pf) { struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); @@ -438,7 +431,7 @@ static void u32_delete_key_work(struct work_struct *work) struct tc_u_knode, rwork); rtnl_lock(); - u32_destroy_key(key->tp, key, false); + u32_destroy_key(key, false); rtnl_unlock(); } @@ -455,12 +448,13 @@ static void u32_delete_key_freepf_work(struct work_struct *work) struct tc_u_knode, rwork); rtnl_lock(); - u32_destroy_key(key->tp, key, true); + u32_destroy_key(key, true); rtnl_unlock(); } static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) { + struct tc_u_common *tp_c = tp->data; struct tc_u_knode __rcu **kp; struct tc_u_knode *pkp; struct tc_u_hnode *ht = rtnl_dereference(key->ht_up); @@ -471,6 +465,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) kp = &pkp->next, pkp = rtnl_dereference(*kp)) { if (pkp == key) { RCU_INIT_POINTER(*kp, key->next); + tp_c->knodes--; tcf_unbind_filter(tp, &key->res); idr_remove(&ht->handle_idr, key->handle); @@ -585,6 +580,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, struct netlink_ext_ack *extack) { + struct tc_u_common *tp_c = tp->data; struct tc_u_knode *n; unsigned int h; @@ -592,13 +588,14 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, while ((n = rtnl_dereference(ht->ht[h])) != NULL) { RCU_INIT_POINTER(ht->ht[h], rtnl_dereference(n->next)); + tp_c->knodes--; tcf_unbind_filter(tp, &n->res); u32_remove_hw_knode(tp, n, extack); idr_remove(&ht->handle_idr, n->handle); if (tcf_exts_get_net(&n->exts)) tcf_queue_work(&n->rwork, u32_delete_key_freepf_work); else - u32_destroy_key(n->tp, n, true); + u32_destroy_key(n, true); } } } @@ -610,7 +607,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, struct tc_u_hnode __rcu **hn; struct tc_u_hnode *phn; - WARN_ON(ht->refcnt); + WARN_ON(--ht->refcnt); u32_clear_hnode(tp, ht, extack); @@ -631,17 +628,6 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, return -ENOENT; } -static bool ht_empty(struct tc_u_hnode *ht) -{ - unsigned int h; - - for (h = 0; h <= ht->divisor; h++) - if (rcu_access_pointer(ht->ht[h])) - return false; - - return true; -} - static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; @@ -649,7 +635,7 @@ static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) WARN_ON(root_ht == NULL); - if (root_ht && --root_ht->refcnt == 0) + if (root_ht && --root_ht->refcnt == 1) u32_destroy_hnode(tp, root_ht, extack); if (--tp_c->refcnt == 0) { @@ -679,26 +665,21 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last, struct netlink_ext_ack *extack) { struct tc_u_hnode *ht = arg; - struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); struct tc_u_common *tp_c = tp->data; int ret = 0; - if (ht == NULL) - goto out; - if (TC_U32_KEY(ht->handle)) { u32_remove_hw_knode(tp, (struct tc_u_knode *)ht, extack); ret = u32_delete_key(tp, (struct tc_u_knode *)ht); goto out; } - if (root_ht == ht) { + if (ht->is_root) { NL_SET_ERR_MSG_MOD(extack, "Not allowed to delete root node"); return -EINVAL; } if (ht->refcnt == 1) { - ht->refcnt--; u32_destroy_hnode(tp, ht, extack); } else { NL_SET_ERR_MSG_MOD(extack, "Can not delete in-use filter"); @@ -706,38 +687,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last, } out: - *last = true; - if (root_ht) { - if (root_ht->refcnt > 1) { - *last = false; - goto ret; - } - if (root_ht->refcnt == 1) { - if (!ht_empty(root_ht)) { - *last = false; - goto ret; - } - } - } - - if (tp_c->refcnt > 1) { - *last = false; - goto ret; - } - - if (tp_c->refcnt == 1) { - struct tc_u_hnode *ht; - - for (ht = rtnl_dereference(tp_c->hlist); - ht; - ht = rtnl_dereference(ht->next)) - if (!ht_empty(ht)) { - *last = false; - break; - } - } - -ret: + *last = tp_c->refcnt == 1 && tp_c->knodes == 0; return ret; } @@ -768,7 +718,7 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { }; static int u32_set_parms(struct net *net, struct tcf_proto *tp, - unsigned long base, struct tc_u_hnode *ht, + unsigned long base, struct tc_u_knode *n, struct nlattr **tb, struct nlattr *est, bool ovr, struct netlink_ext_ack *extack) @@ -789,12 +739,16 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, } if (handle) { - ht_down = u32_lookup_ht(ht->tp_c, handle); + ht_down = u32_lookup_ht(tp->data, handle); if (!ht_down) { NL_SET_ERR_MSG_MOD(extack, "Link hash table not found"); return -EINVAL; } + if (ht_down->is_root) { + NL_SET_ERR_MSG_MOD(extack, "Not linking to root node"); + return -EINVAL; + } ht_down->refcnt++; } @@ -891,7 +845,6 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, /* Similarly success statistics must be moved as pointers */ new->pcpu_success = n->pcpu_success; #endif - new->tp = tp; memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) { @@ -960,18 +913,17 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (!new) return -ENOMEM; - err = u32_set_parms(net, tp, base, - rtnl_dereference(n->ht_up), new, tb, + err = u32_set_parms(net, tp, base, new, tb, tca[TCA_RATE], ovr, extack); if (err) { - u32_destroy_key(tp, new, false); + u32_destroy_key(new, false); return err; } err = u32_replace_hw_knode(tp, new, flags, extack); if (err) { - u32_destroy_key(tp, new, false); + u32_destroy_key(new, false); return err; } @@ -988,7 +940,11 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (tb[TCA_U32_DIVISOR]) { unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]); - if (--divisor > 0x100) { + if (!is_power_of_2(divisor)) { + NL_SET_ERR_MSG_MOD(extack, "Divisor is not a power of 2"); + return -EINVAL; + } + if (divisor-- > 0x100) { NL_SET_ERR_MSG_MOD(extack, "Exceeded maximum 256 hash buckets"); return -EINVAL; } @@ -1013,7 +969,6 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return err; } } - ht->tp_c = tp_c; ht->refcnt = 1; ht->divisor = divisor; ht->handle = handle; @@ -1103,7 +1058,6 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; n->flags = flags; - n->tp = tp; err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); if (err < 0) @@ -1125,7 +1079,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, } #endif - err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr, + err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], ovr, extack); if (err == 0) { struct tc_u_knode __rcu **ins; @@ -1146,6 +1100,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(n->next, pins); rcu_assign_pointer(*ins, n); + tp_c->knodes++; *arg = n; return 0; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 98541c6399db..ca3b0f46de53 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -315,6 +314,24 @@ out: return q; } +struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle) +{ + struct netdev_queue *nq; + struct Qdisc *q; + + if (!handle) + return NULL; + q = qdisc_match_from_root(dev->qdisc, handle); + if (q) + goto out; + + nq = dev_ingress_queue_rcu(dev); + if (nq) + q = qdisc_match_from_root(nq->qdisc_sleeping, handle); +out: + return q; +} + static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) { unsigned long cl; @@ -921,7 +938,7 @@ static void notify_and_destroy(struct net *net, struct sk_buff *skb, qdisc_notify(net, skb, n, clid, old, new); if (old) - qdisc_destroy(old); + qdisc_put(old); } /* Graft qdisc "new" to class "classid" of qdisc "parent" or @@ -974,7 +991,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, qdisc_refcount_inc(new); if (!ingress) - qdisc_destroy(old); + qdisc_put(old); } skip: @@ -1053,10 +1070,6 @@ static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca, return 0; } -/* lockdep annotation is needed for ingress; egress gets it only for name */ -static struct lock_class_key qdisc_tx_lock; -static struct lock_class_key qdisc_rx_lock; - /* Allocate and initialize new qdisc. @@ -1121,7 +1134,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev, if (handle == TC_H_INGRESS) { sch->flags |= TCQ_F_INGRESS; handle = TC_H_MAKE(TC_H_INGRESS, 0); - lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock); } else { if (handle == 0) { handle = qdisc_alloc_handle(dev); @@ -1129,7 +1141,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev, if (handle == 0) goto err_out3; } - lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock); if (!netif_is_multiqueue(dev)) sch->flags |= TCQ_F_ONETXQUEUE; } @@ -1307,6 +1318,17 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) return 0; } +const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { + [TCA_KIND] = { .type = NLA_STRING }, + [TCA_RATE] = { .type = NLA_BINARY, + .len = sizeof(struct tc_estimator) }, + [TCA_STAB] = { .type = NLA_NESTED }, + [TCA_DUMP_INVISIBLE] = { .type = NLA_FLAG }, + [TCA_CHAIN] = { .type = NLA_U32 }, + [TCA_INGRESS_BLOCK] = { .type = NLA_U32 }, + [TCA_EGRESS_BLOCK] = { .type = NLA_U32 }, +}; + /* * Delete/get qdisc. */ @@ -1327,7 +1349,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, + extack); if (err < 0) return err; @@ -1411,7 +1434,8 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, replay: /* Reinit, just in case something touches this. */ - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, + extack); if (err < 0) return err; @@ -1568,7 +1592,7 @@ graft: err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack); if (err) { if (q) - qdisc_destroy(q); + qdisc_put(q); return err; } @@ -1645,7 +1669,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) idx = 0; ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL); + err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, + rtm_tca_policy, cb->extack); if (err < 0) return err; @@ -1864,7 +1889,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, + extack); if (err < 0) return err; @@ -2043,7 +2069,8 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, if (tcm->tcm_parent) { q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent)); - if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) + if (q && q != root && + tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) return -1; return 0; } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index cd49afca9617..d714d3747bcb 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -150,7 +150,7 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl) pr_debug("atm_tc_put: destroying\n"); list_del_init(&flow->list); pr_debug("atm_tc_put: qdisc %p\n", flow->q); - qdisc_destroy(flow->q); + qdisc_put(flow->q); tcf_block_put(flow->block); if (flow->sock) { pr_debug("atm_tc_put: f_count %ld\n", diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index c07c30b916d5..b910cd5c56f7 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -812,7 +812,7 @@ static struct sk_buff *dequeue_head(struct cake_flow *flow) if (skb) { flow->head = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); } return skb; @@ -1252,7 +1252,7 @@ found: else flow->head = elig_ack->next; - elig_ack->next = NULL; + skb_mark_not_on_list(elig_ack); return elig_ack; } @@ -1675,7 +1675,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, while (segs) { nskb = segs->next; - segs->next = NULL; + skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; cobalt_set_enqueue_time(segs, now); get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, @@ -2644,7 +2644,7 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, for (i = 1; i <= CAKE_QUEUES; i++) quantum_div[i] = 65535 / i; - q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data), + q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data), GFP_KERNEL); if (!q->tins) goto nomem; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index f42025d53cfe..4dc05409e3fb 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1418,7 +1418,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) WARN_ON(cl->filters); tcf_block_put(cl->block); - qdisc_destroy(cl->q); + qdisc_put(cl->q); qdisc_put_rtab(cl->R_tab); gen_kill_estimator(&cl->rate_est); if (cl != &q->link) diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index e26a24017faa..e689e11b6d0f 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -379,7 +379,7 @@ static void cbs_destroy(struct Qdisc *sch) cbs_disable_offload(dev, q); if (q->qdisc) - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); } static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index e0b0cf8a9939..cdebaed0f8cf 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -134,7 +134,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); - qdisc_destroy(cl->qdisc); + qdisc_put(cl->qdisc); kfree(cl); return err; } @@ -153,7 +153,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl) { gen_kill_estimator(&cl->rate_est); - qdisc_destroy(cl->qdisc); + qdisc_put(cl->qdisc); kfree(cl); } diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 049714c57075..f6f480784bc6 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -412,7 +412,7 @@ static void dsmark_destroy(struct Qdisc *sch) pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); tcf_block_put(p->block); - qdisc_destroy(p->q); + qdisc_put(p->q); if (p->mv != p->embedded) kfree(p->mv); } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 24893d3b5d22..3809c9bf8896 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -177,7 +177,7 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, if (q) { err = fifo_set_limit(q, limit); if (err < 0) { - qdisc_destroy(q); + qdisc_put(q); q = NULL; } } diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 4808713c73b9..4b1af706896c 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -92,8 +92,8 @@ struct fq_sched_data { u32 quantum; u32 initial_quantum; u32 flow_refill_delay; - u32 flow_max_rate; /* optional max rate per flow */ u32 flow_plimit; /* max packets per flow */ + unsigned long flow_max_rate; /* optional max rate per flow */ u32 orphan_mask; /* mask for orphaned skb */ u32 low_rate_threshold; struct rb_root *fq_root; @@ -106,7 +106,6 @@ struct fq_sched_data { u64 stat_gc_flows; u64 stat_internal_packets; - u64 stat_tcp_retrans; u64 stat_throttled; u64 stat_flows_plimit; u64 stat_pkts_too_long; @@ -319,7 +318,7 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) if (skb) { flow->head = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); flow->qlen--; qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; @@ -327,62 +326,17 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) return skb; } -/* We might add in the future detection of retransmits - * For the time being, just return false - */ -static bool skb_is_retransmit(struct sk_buff *skb) -{ - return false; -} - -/* add skb to flow queue - * flow queue is a linked list, kind of FIFO, except for TCP retransmits - * We special case tcp retransmits to be transmitted before other packets. - * We rely on fact that TCP retransmits are unlikely, so we do not waste - * a separate queue or a pointer. - * head-> [retrans pkt 1] - * [retrans pkt 2] - * [ normal pkt 1] - * [ normal pkt 2] - * [ normal pkt 3] - * tail-> [ normal pkt 4] - */ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) { - struct sk_buff *prev, *head = flow->head; + struct sk_buff *head = flow->head; skb->next = NULL; - if (!head) { + if (!head) flow->head = skb; - flow->tail = skb; - return; - } - if (likely(!skb_is_retransmit(skb))) { + else flow->tail->next = skb; - flow->tail = skb; - return; - } - /* This skb is a tcp retransmit, - * find the last retrans packet in the queue - */ - prev = NULL; - while (skb_is_retransmit(head)) { - prev = head; - head = head->next; - if (!head) - break; - } - if (!prev) { /* no rtx packet in queue, become the new head */ - skb->next = flow->head; - flow->head = skb; - } else { - if (prev == flow->tail) - flow->tail = skb; - else - skb->next = prev->next; - prev->next = skb; - } + flow->tail = skb; } static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -401,8 +355,6 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } f->qlen++; - if (skb_is_retransmit(skb)) - q->stat_tcp_retrans++; qdisc_qstats_backlog_inc(sch, skb); if (fq_flow_is_detached(f)) { struct sock *sk = skb->sk; @@ -464,7 +416,8 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; - u32 rate, plen; + unsigned long rate; + u32 plen; skb = fq_dequeue_head(sch, &q->internal); if (skb) @@ -491,11 +444,16 @@ begin: } skb = f->head; - if (unlikely(skb && now < f->time_next_packet && - !skb_is_tcp_pure_ack(skb))) { - head->first = f->next; - fq_flow_set_throttled(q, f); - goto begin; + if (skb) { + u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp), + f->time_next_packet); + + if (now < time_next_packet) { + head->first = f->next; + f->time_next_packet = time_next_packet; + fq_flow_set_throttled(q, f); + goto begin; + } } skb = fq_dequeue_head(sch, f); @@ -513,11 +471,7 @@ begin: prefetch(&skb->end); f->credit -= qdisc_pkt_len(skb); - if (!q->rate_enable) - goto out; - - /* Do not pace locally generated ack packets */ - if (skb_is_tcp_pure_ack(skb)) + if (ktime_to_ns(skb->tstamp) || !q->rate_enable) goto out; rate = q->flow_max_rate; @@ -532,11 +486,11 @@ begin: if (f->credit > 0) goto out; } - if (rate != ~0U) { + if (rate != ~0UL) { u64 len = (u64)plen * NSEC_PER_SEC; if (likely(rate)) - do_div(len, rate); + len = div64_ul(len, rate); /* Since socket rate can change later, * clamp the delay to 1 second. * Really, providers of too big packets should be fixed ! @@ -748,9 +702,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE])); - if (tb[TCA_FQ_FLOW_MAX_RATE]) - q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + if (tb[TCA_FQ_FLOW_MAX_RATE]) { + u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + q->flow_max_rate = (rate == ~0U) ? ~0UL : rate; + } if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) q->low_rate_threshold = nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); @@ -813,7 +769,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, q->quantum = 2 * psched_mtu(qdisc_dev(sch)); q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); q->flow_refill_delay = msecs_to_jiffies(40); - q->flow_max_rate = ~0U; + q->flow_max_rate = ~0UL; q->time_next_delayed_flow = ~0ULL; q->rate_enable = 1; q->new_flows.first = NULL; @@ -823,7 +779,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, q->fq_trees_log = ilog2(1024); q->orphan_mask = 1024 - 1; q->low_rate_threshold = 550000 / 8; - qdisc_watchdog_init(&q->watchdog, sch); + qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC); if (opt) err = fq_change(sch, opt, extack); @@ -849,7 +805,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || - nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || + nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, + min_t(unsigned long, q->flow_max_rate, ~0U)) || nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, jiffies_to_usecs(q->flow_refill_delay)) || nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || @@ -873,7 +830,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.gc_flows = q->stat_gc_flows; st.highprio_packets = q->stat_internal_packets; - st.tcp_retrans = q->stat_tcp_retrans; + st.tcp_retrans = 0; st.throttled = q->stat_throttled; st.flows_plimit = q->stat_flows_plimit; st.pkts_too_long = q->stat_pkts_too_long; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 6c0a9d5dbf94..cd04d40c30b6 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -124,7 +124,7 @@ static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow) struct sk_buff *skb = flow->head; flow->head = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); return skb; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 69078c82963e..de1663f7d3ad 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -184,7 +184,7 @@ static void try_bulk_dequeue_skb(struct Qdisc *q, skb = nskb; (*packets)++; /* GSO counts as one pkt */ } - skb->next = NULL; + skb_mark_not_on_list(skb); } /* This variant of try_bulk_dequeue_skb() makes sure @@ -210,7 +210,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, skb = nskb; } while (++cnt < 8); (*packets) += cnt; - skb->next = NULL; + skb_mark_not_on_list(skb); } /* Note that dequeue_skb can possibly return a SKB list (via skb->next). @@ -572,6 +572,18 @@ struct Qdisc noop_qdisc = { .dev_queue = &noop_netdev_queue, .running = SEQCNT_ZERO(noop_qdisc.running), .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), + .gso_skb = { + .next = (struct sk_buff *)&noop_qdisc.gso_skb, + .prev = (struct sk_buff *)&noop_qdisc.gso_skb, + .qlen = 0, + .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock), + }, + .skb_bad_txq = { + .next = (struct sk_buff *)&noop_qdisc.skb_bad_txq, + .prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq, + .qlen = 0, + .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock), + }, }; EXPORT_SYMBOL(noop_qdisc); @@ -901,7 +913,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, if (!ops->init || ops->init(sch, NULL, extack) == 0) return sch; - qdisc_destroy(sch); + qdisc_put(sch); return NULL; } EXPORT_SYMBOL(qdisc_create_dflt); @@ -941,15 +953,18 @@ void qdisc_free(struct Qdisc *qdisc) kfree((char *) qdisc - qdisc->padded); } -void qdisc_destroy(struct Qdisc *qdisc) +static void qdisc_free_cb(struct rcu_head *head) +{ + struct Qdisc *q = container_of(head, struct Qdisc, rcu); + + qdisc_free(q); +} + +static void qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; struct sk_buff *skb, *tmp; - if (qdisc->flags & TCQ_F_BUILTIN || - !refcount_dec_and_test(&qdisc->refcnt)) - return; - #ifdef CONFIG_NET_SCHED qdisc_hash_del(qdisc); @@ -974,9 +989,34 @@ void qdisc_destroy(struct Qdisc *qdisc) kfree_skb_list(skb); } - qdisc_free(qdisc); + call_rcu(&qdisc->rcu, qdisc_free_cb); +} + +void qdisc_put(struct Qdisc *qdisc) +{ + if (qdisc->flags & TCQ_F_BUILTIN || + !refcount_dec_and_test(&qdisc->refcnt)) + return; + + qdisc_destroy(qdisc); +} +EXPORT_SYMBOL(qdisc_put); + +/* Version of qdisc_put() that is called with rtnl mutex unlocked. + * Intended to be used as optimization, this function only takes rtnl lock if + * qdisc reference counter reached zero. + */ + +void qdisc_put_unlocked(struct Qdisc *qdisc) +{ + if (qdisc->flags & TCQ_F_BUILTIN || + !refcount_dec_and_rtnl_lock(&qdisc->refcnt)) + return; + + qdisc_destroy(qdisc); + rtnl_unlock(); } -EXPORT_SYMBOL(qdisc_destroy); +EXPORT_SYMBOL(qdisc_put_unlocked); /* Attach toplevel qdisc to device queue. */ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, @@ -1245,8 +1285,6 @@ static void dev_init_scheduler_queue(struct net_device *dev, rcu_assign_pointer(dev_queue->qdisc, qdisc); dev_queue->qdisc_sleeping = qdisc; - __skb_queue_head_init(&qdisc->gso_skb); - __skb_queue_head_init(&qdisc->skb_bad_txq); } void dev_init_scheduler(struct net_device *dev) @@ -1270,7 +1308,7 @@ static void shutdown_scheduler_queue(struct net_device *dev, rcu_assign_pointer(dev_queue->qdisc, qdisc_default); dev_queue->qdisc_sleeping = qdisc_default; - qdisc_destroy(qdisc); + qdisc_put(qdisc); } } @@ -1279,7 +1317,7 @@ void dev_shutdown(struct net_device *dev) netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); - qdisc_destroy(dev->qdisc); + qdisc_put(dev->qdisc); dev->qdisc = &noop_qdisc; WARN_ON(timer_pending(&dev->watchdog_timer)); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index cbe4831f46f4..4a042abf844c 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -413,7 +413,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) { if (tb[TCA_GRED_LIMIT] != NULL) sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); - return gred_change_table_def(sch, opt); + return gred_change_table_def(sch, tb[TCA_GRED_DPS]); } if (tb[TCA_GRED_PARMS] == NULL || diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 3278a76f6861..b18ec1f6de60 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1092,7 +1092,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) struct hfsc_sched *q = qdisc_priv(sch); tcf_block_put(cl->block); - qdisc_destroy(cl->qdisc); + qdisc_put(cl->qdisc); gen_kill_estimator(&cl->rate_est); if (cl != &q->root) kfree(cl); diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index c3a8388dcdf6..9d6a47697406 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -330,7 +330,7 @@ static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) struct sk_buff *skb = bucket->head; bucket->head = skb->next; - skb->next = NULL; + skb_mark_not_on_list(skb); return skb; } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 43c4bfe625a9..58b449490757 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -132,7 +132,7 @@ struct htb_class { struct htb_class_inner { struct htb_prio clprio[TC_HTB_NUMPRIO]; } inner; - } un; + }; s64 pq_key; int prio_activity; /* for which prios are we active */ @@ -411,13 +411,13 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) int prio = ffz(~m); m &= ~(1 << prio); - if (p->un.inner.clprio[prio].feed.rb_node) + if (p->inner.clprio[prio].feed.rb_node) /* parent already has its feed in use so that * reset bit in mask as parent is already ok */ mask &= ~(1 << prio); - htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio); + htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio); } p->prio_activity |= mask; cl = p; @@ -447,19 +447,19 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) int prio = ffz(~m); m &= ~(1 << prio); - if (p->un.inner.clprio[prio].ptr == cl->node + prio) { + if (p->inner.clprio[prio].ptr == cl->node + prio) { /* we are removing child which is pointed to from * parent feed - forget the pointer but remember * classid */ - p->un.inner.clprio[prio].last_ptr_id = cl->common.classid; - p->un.inner.clprio[prio].ptr = NULL; + p->inner.clprio[prio].last_ptr_id = cl->common.classid; + p->inner.clprio[prio].ptr = NULL; } htb_safe_rb_erase(cl->node + prio, - &p->un.inner.clprio[prio].feed); + &p->inner.clprio[prio].feed); - if (!p->un.inner.clprio[prio].feed.rb_node) + if (!p->inner.clprio[prio].feed.rb_node) mask |= 1 << prio; } @@ -555,7 +555,7 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) */ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) { - WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen); + WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen); if (!cl->prio_activity) { cl->prio_activity = 1 << cl->prio; @@ -577,22 +577,6 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) cl->prio_activity = 0; } -static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, - struct qdisc_skb_head *qh) -{ - struct sk_buff *last = qh->tail; - - if (last) { - skb->next = NULL; - last->next = skb; - qh->tail = skb; - } else { - qh->tail = skb; - qh->head = skb; - } - qh->qlen++; -} - static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -603,7 +587,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (cl == HTB_DIRECT) { /* enqueue to helper queue */ if (q->direct_queue.qlen < q->direct_qlen) { - htb_enqueue_tail(skb, sch, &q->direct_queue); + __qdisc_enqueue_tail(skb, &q->direct_queue); q->direct_pkts++; } else { return qdisc_drop(skb, sch, to_free); @@ -615,7 +599,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, __qdisc_drop(skb, to_free); return ret; #endif - } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q, + } else if ((ret = qdisc_enqueue(skb, cl->leaf.q, to_free)) != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) { qdisc_qstats_drop(sch); @@ -823,7 +807,7 @@ static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio) cl = rb_entry(*sp->pptr, struct htb_class, node[prio]); if (!cl->level) return cl; - clp = &cl->un.inner.clprio[prio]; + clp = &cl->inner.clprio[prio]; (++sp)->root = clp->feed.rb_node; sp->pptr = &clp->ptr; sp->pid = &clp->last_ptr_id; @@ -857,7 +841,7 @@ next: * graft operation on the leaf since last dequeue; * simply deactivate and skip such class */ - if (unlikely(cl->un.leaf.q->q.qlen == 0)) { + if (unlikely(cl->leaf.q->q.qlen == 0)) { struct htb_class *next; htb_deactivate(q, cl); @@ -873,12 +857,12 @@ next: goto next; } - skb = cl->un.leaf.q->dequeue(cl->un.leaf.q); + skb = cl->leaf.q->dequeue(cl->leaf.q); if (likely(skb != NULL)) break; - qdisc_warn_nonwc("htb", cl->un.leaf.q); - htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr: + qdisc_warn_nonwc("htb", cl->leaf.q); + htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr: &q->hlevel[0].hprio[prio].ptr); cl = htb_lookup_leaf(hprio, prio); @@ -886,16 +870,16 @@ next: if (likely(skb != NULL)) { bstats_update(&cl->bstats, skb); - cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb); - if (cl->un.leaf.deficit[level] < 0) { - cl->un.leaf.deficit[level] += cl->quantum; - htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr : + cl->leaf.deficit[level] -= qdisc_pkt_len(skb); + if (cl->leaf.deficit[level] < 0) { + cl->leaf.deficit[level] += cl->quantum; + htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr : &q->hlevel[0].hprio[prio].ptr); } /* this used to be after charge_class but this constelation * gives us slightly better performance */ - if (!cl->un.leaf.q->q.qlen) + if (!cl->leaf.q->q.qlen) htb_deactivate(q, cl); htb_charge_class(q, cl, level, skb); } @@ -972,10 +956,10 @@ static void htb_reset(struct Qdisc *sch) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->level) - memset(&cl->un.inner, 0, sizeof(cl->un.inner)); + memset(&cl->inner, 0, sizeof(cl->inner)); else { - if (cl->un.leaf.q) - qdisc_reset(cl->un.leaf.q); + if (cl->leaf.q) + qdisc_reset(cl->leaf.q); } cl->prio_activity = 0; cl->cmode = HTB_CAN_SEND; @@ -1098,8 +1082,8 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, */ tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT; tcm->tcm_handle = cl->common.classid; - if (!cl->level && cl->un.leaf.q) - tcm->tcm_info = cl->un.leaf.q->handle; + if (!cl->level && cl->leaf.q) + tcm->tcm_info = cl->leaf.q->handle; nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) @@ -1142,9 +1126,9 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) }; __u32 qlen = 0; - if (!cl->level && cl->un.leaf.q) { - qlen = cl->un.leaf.q->q.qlen; - qs.backlog = cl->un.leaf.q->qstats.backlog; + if (!cl->level && cl->leaf.q) { + qlen = cl->leaf.q->q.qlen; + qs.backlog = cl->leaf.q->qstats.backlog; } cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens), INT_MIN, INT_MAX); @@ -1172,14 +1156,14 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, cl->common.classid, extack)) == NULL) return -ENOBUFS; - *old = qdisc_replace(sch, new, &cl->un.leaf.q); + *old = qdisc_replace(sch, new, &cl->leaf.q); return 0; } static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg) { struct htb_class *cl = (struct htb_class *)arg; - return !cl->level ? cl->un.leaf.q : NULL; + return !cl->level ? cl->leaf.q : NULL; } static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg) @@ -1205,15 +1189,15 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, { struct htb_class *parent = cl->parent; - WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity); + WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity); if (parent->cmode != HTB_CAN_SEND) htb_safe_rb_erase(&parent->pq_node, &q->hlevel[parent->level].wait_pq); parent->level = 0; - memset(&parent->un.inner, 0, sizeof(parent->un.inner)); - parent->un.leaf.q = new_q ? new_q : &noop_qdisc; + memset(&parent->inner, 0, sizeof(parent->inner)); + parent->leaf.q = new_q ? new_q : &noop_qdisc; parent->tokens = parent->buffer; parent->ctokens = parent->cbuffer; parent->t_c = ktime_get_ns(); @@ -1223,8 +1207,8 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) { if (!cl->level) { - WARN_ON(!cl->un.leaf.q); - qdisc_destroy(cl->un.leaf.q); + WARN_ON(!cl->leaf.q); + qdisc_put(cl->leaf.q); } gen_kill_estimator(&cl->rate_est); tcf_block_put(cl->block); @@ -1286,11 +1270,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) sch_tree_lock(sch); if (!cl->level) { - unsigned int qlen = cl->un.leaf.q->q.qlen; - unsigned int backlog = cl->un.leaf.q->qstats.backlog; + unsigned int qlen = cl->leaf.q->q.qlen; + unsigned int backlog = cl->leaf.q->qstats.backlog; - qdisc_reset(cl->un.leaf.q); - qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog); + qdisc_reset(cl->leaf.q); + qdisc_tree_reduce_backlog(cl->leaf.q, qlen, backlog); } /* delete from hash and active; remainder in destroy_class */ @@ -1419,13 +1403,13 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, classid, NULL); sch_tree_lock(sch); if (parent && !parent->level) { - unsigned int qlen = parent->un.leaf.q->q.qlen; - unsigned int backlog = parent->un.leaf.q->qstats.backlog; + unsigned int qlen = parent->leaf.q->q.qlen; + unsigned int backlog = parent->leaf.q->qstats.backlog; /* turn parent into inner node */ - qdisc_reset(parent->un.leaf.q); - qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog); - qdisc_destroy(parent->un.leaf.q); + qdisc_reset(parent->leaf.q); + qdisc_tree_reduce_backlog(parent->leaf.q, qlen, backlog); + qdisc_put(parent->leaf.q); if (parent->prio_activity) htb_deactivate(q, parent); @@ -1436,10 +1420,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, } parent->level = (parent->parent ? parent->parent->level : TC_HTB_MAXDEPTH) - 1; - memset(&parent->un.inner, 0, sizeof(parent->un.inner)); + memset(&parent->inner, 0, sizeof(parent->inner)); } /* leaf (we) needs elementary qdisc */ - cl->un.leaf.q = new_q ? new_q : &noop_qdisc; + cl->leaf.q = new_q ? new_q : &noop_qdisc; cl->common.classid = classid; cl->parent = parent; @@ -1455,8 +1439,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, qdisc_class_hash_insert(&q->clhash, &cl->common); if (parent) parent->children++; - if (cl->un.leaf.q != &noop_qdisc) - qdisc_hash_add(cl->un.leaf.q, true); + if (cl->leaf.q != &noop_qdisc) + qdisc_hash_add(cl->leaf.q, true); } else { if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, @@ -1478,7 +1462,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); /* it used to be a nasty bug here, we have to check that node - * is really leaf before changing cl->un.leaf ! + * is really leaf before changing cl->leaf ! */ if (!cl->level) { u64 quantum = cl->rate.rate_bytes_ps; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index d6b8ae4ed7a3..f20f3a0f8424 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -65,7 +65,7 @@ static void mq_destroy(struct Qdisc *sch) if (!priv->qdiscs) return; for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) - qdisc_destroy(priv->qdiscs[ntx]); + qdisc_put(priv->qdiscs[ntx]); kfree(priv->qdiscs); } @@ -119,7 +119,7 @@ static void mq_attach(struct Qdisc *sch) qdisc = priv->qdiscs[ntx]; old = dev_graft_qdisc(qdisc->dev_queue, qdisc); if (old) - qdisc_destroy(old); + qdisc_put(old); #ifdef CONFIG_NET_SCHED if (ntx < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 0e9d761cdd80..d364e63c396d 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -40,7 +40,7 @@ static void mqprio_destroy(struct Qdisc *sch) for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) - qdisc_destroy(priv->qdiscs[ntx]); + qdisc_put(priv->qdiscs[ntx]); kfree(priv->qdiscs); } @@ -300,7 +300,7 @@ static void mqprio_attach(struct Qdisc *sch) qdisc = priv->qdiscs[ntx]; old = dev_graft_qdisc(qdisc->dev_queue, qdisc); if (old) - qdisc_destroy(old); + qdisc_put(old); if (ntx < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); } diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 1da7ea8de0ad..7410ce4d0321 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -175,7 +175,7 @@ multiq_destroy(struct Qdisc *sch) tcf_block_put(q->block); for (band = 0; band < q->bands; band++) - qdisc_destroy(q->queues[band]); + qdisc_put(q->queues[band]); kfree(q->queues); } @@ -204,7 +204,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, q->queues[i] = &noop_qdisc; qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog); - qdisc_destroy(child); + qdisc_put(child); } } @@ -228,7 +228,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog); - qdisc_destroy(old); + qdisc_put(old); } sch_tree_unlock(sch); } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index ad18a2052416..57b3ad9394ad 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -412,16 +412,6 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, return segs; } -static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb) -{ - skb->next = qh->head; - - if (!qh->head) - qh->tail = skb; - qh->head = skb; - qh->qlen++; -} - /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. @@ -570,7 +560,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, cb->time_to_send = ktime_get_ns(); q->counter = 0; - netem_enqueue_skb_head(&sch->q, skb); + __qdisc_enqueue_head(skb, &sch->q); sch->qstats.requeues++; } @@ -578,7 +568,7 @@ finish_segs: if (segs) { while (segs) { skb2 = segs->next; - segs->next = NULL; + skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; last_len = segs->len; rc = qdisc_enqueue(segs, sch, to_free); @@ -1032,7 +1022,7 @@ static void netem_destroy(struct Qdisc *sch) qdisc_watchdog_cancel(&q->watchdog); if (q->qdisc) - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); dist_free(q->delay_dist); dist_free(q->slot_dist); } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 18d30bb86881..d1429371592f 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -110,8 +110,8 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size) /* If current delay is less than half of target, and * if drop prob is low already, disable early_drop */ - if ((q->vars.qdelay < q->params.target / 2) - && (q->vars.prob < MAX_PROB / 5)) + if ((q->vars.qdelay < q->params.target / 2) && + (q->vars.prob < MAX_PROB / 5)) return false; /* If we have fewer than 2 mtu-sized packets, disable drop_early, @@ -209,7 +209,8 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, /* tupdate is in jiffies */ if (tb[TCA_PIE_TUPDATE]) - q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); + q->params.tupdate = + usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); if (tb[TCA_PIE_LIMIT]) { u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); @@ -247,7 +248,6 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) { - struct pie_sched_data *q = qdisc_priv(sch); int qlen = sch->qstats.backlog; /* current queue size in bytes */ @@ -294,9 +294,9 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) * dq_count to 0 to re-enter the if block when the next * packet is dequeued */ - if (qlen < QUEUE_THRESHOLD) + if (qlen < QUEUE_THRESHOLD) { q->vars.dq_count = DQCOUNT_INVALID; - else { + } else { q->vars.dq_count = 0; q->vars.dq_tstamp = psched_get_time(); } @@ -370,7 +370,7 @@ static void calculate_probability(struct Qdisc *sch) oldprob = q->vars.prob; /* to ensure we increase probability in steps of no more than 2% */ - if (delta > (s32) (MAX_PROB / (100 / 2)) && + if (delta > (s32)(MAX_PROB / (100 / 2)) && q->vars.prob >= MAX_PROB / 10) delta = (MAX_PROB / 100) * 2; @@ -405,7 +405,7 @@ static void calculate_probability(struct Qdisc *sch) * delay is 0 for 2 consecutive Tupdate periods. */ - if ((qdelay == 0) && (qdelay_old == 0) && update_prob) + if (qdelay == 0 && qdelay_old == 0 && update_prob) q->vars.prob = (q->vars.prob * 98) / 100; q->vars.qdelay = qdelay; @@ -419,8 +419,8 @@ static void calculate_probability(struct Qdisc *sch) */ if ((q->vars.qdelay < q->params.target / 2) && (q->vars.qdelay_old < q->params.target / 2) && - (q->vars.prob == 0) && - (q->vars.avg_dq_rate > 0)) + q->vars.prob == 0 && + q->vars.avg_dq_rate > 0) pie_vars_init(&q->vars); } @@ -437,7 +437,6 @@ static void pie_timer(struct timer_list *t) if (q->params.tupdate) mod_timer(&q->adapt_timer, jiffies + q->params.tupdate); spin_unlock(root_lock); - } static int pie_init(struct Qdisc *sch, struct nlattr *opt, @@ -469,15 +468,16 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *opts; opts = nla_nest_start(skb, TCA_OPTIONS); - if (opts == NULL) + if (!opts) goto nla_put_failure; /* convert target from pschedtime to us */ if (nla_put_u32(skb, TCA_PIE_TARGET, - ((u32) PSCHED_TICKS2NS(q->params.target)) / + ((u32)PSCHED_TICKS2NS(q->params.target)) / NSEC_PER_USEC) || nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || - nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) || + nla_put_u32(skb, TCA_PIE_TUPDATE, + jiffies_to_usecs(q->params.tupdate)) || nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || @@ -489,7 +489,6 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_failure: nla_nest_cancel(skb, opts); return -1; - } static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) @@ -497,7 +496,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) struct pie_sched_data *q = qdisc_priv(sch); struct tc_pie_xstats st = { .prob = q->vars.prob, - .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) / + .delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) / NSEC_PER_USEC, /* unscale and return dq_rate in bytes per sec */ .avg_dq_rate = q->vars.avg_dq_rate * @@ -514,8 +513,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) { - struct sk_buff *skb; - skb = qdisc_dequeue_head(sch); + struct sk_buff *skb = qdisc_dequeue_head(sch); if (!skb) return NULL; @@ -527,6 +525,7 @@ static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) static void pie_reset(struct Qdisc *sch) { struct pie_sched_data *q = qdisc_priv(sch); + qdisc_reset_queue(sch); pie_vars_init(&q->vars); } @@ -534,6 +533,7 @@ static void pie_reset(struct Qdisc *sch) static void pie_destroy(struct Qdisc *sch) { struct pie_sched_data *q = qdisc_priv(sch); + q->params.tupdate = 0; del_timer_sync(&q->adapt_timer); } diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 222e53d3d27a..f8af98621179 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -175,7 +175,7 @@ prio_destroy(struct Qdisc *sch) tcf_block_put(q->block); prio_offload(sch, NULL); for (prio = 0; prio < q->bands; prio++) - qdisc_destroy(q->queues[prio]); + qdisc_put(q->queues[prio]); } static int prio_tune(struct Qdisc *sch, struct nlattr *opt, @@ -205,7 +205,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, extack); if (!queues[i]) { while (i > oldbands) - qdisc_destroy(queues[--i]); + qdisc_put(queues[--i]); return -ENOMEM; } } @@ -220,7 +220,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog); - qdisc_destroy(child); + qdisc_put(child); } for (i = oldbands; i < q->bands; i++) { diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index bb1a9c11fc54..dc37c4ead439 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -526,7 +526,7 @@ set_change_agg: return 0; destroy_class: - qdisc_destroy(cl->qdisc); + qdisc_put(cl->qdisc); kfree(cl); return err; } @@ -537,7 +537,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) qfq_rm_from_agg(q, cl); gen_kill_estimator(&cl->rate_est); - qdisc_destroy(cl->qdisc); + qdisc_put(cl->qdisc); kfree(cl); } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 56c181c3feeb..3ce6c0a2c493 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -181,7 +181,7 @@ static void red_destroy(struct Qdisc *sch) del_timer_sync(&q->adapt_timer); red_offload(sch, false); - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); } static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { @@ -233,7 +233,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, if (child) { qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, q->qdisc->qstats.backlog); - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); q->qdisc = child; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 7cbdad8419b7..bab506b01a32 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -469,7 +469,7 @@ static void sfb_destroy(struct Qdisc *sch) struct sfb_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); } static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = { @@ -523,7 +523,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, q->qdisc->qstats.backlog); - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); q->qdisc = child; q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c new file mode 100644 index 000000000000..206e4dbed12f --- /dev/null +++ b/net/sched/sch_taprio.c @@ -0,0 +1,962 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* net/sched/sch_taprio.c Time Aware Priority Scheduler + * + * Authors: Vinicius Costa Gomes + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TAPRIO_ALL_GATES_OPEN -1 + +struct sched_entry { + struct list_head list; + + /* The instant that this entry "closes" and the next one + * should open, the qdisc will make some effort so that no + * packet leaves after this time. + */ + ktime_t close_time; + atomic_t budget; + int index; + u32 gate_mask; + u32 interval; + u8 command; +}; + +struct taprio_sched { + struct Qdisc **qdiscs; + struct Qdisc *root; + s64 base_time; + int clockid; + int picos_per_byte; /* Using picoseconds because for 10Gbps+ + * speeds it's sub-nanoseconds per byte + */ + size_t num_entries; + + /* Protects the update side of the RCU protected current_entry */ + spinlock_t current_entry_lock; + struct sched_entry __rcu *current_entry; + struct list_head entries; + ktime_t (*get_time)(void); + struct hrtimer advance_timer; +}; + +static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct Qdisc *child; + int queue; + + queue = skb_get_queue_mapping(skb); + + child = q->qdiscs[queue]; + if (unlikely(!child)) + return qdisc_drop(skb, sch, to_free); + + qdisc_qstats_backlog_inc(sch, skb); + sch->q.qlen++; + + return qdisc_enqueue(skb, child, to_free); +} + +static struct sk_buff *taprio_peek(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct sched_entry *entry; + struct sk_buff *skb; + u32 gate_mask; + int i; + + rcu_read_lock(); + entry = rcu_dereference(q->current_entry); + gate_mask = entry ? entry->gate_mask : -1; + rcu_read_unlock(); + + if (!gate_mask) + return NULL; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct Qdisc *child = q->qdiscs[i]; + int prio; + u8 tc; + + if (unlikely(!child)) + continue; + + skb = child->ops->peek(child); + if (!skb) + continue; + + prio = skb->priority; + tc = netdev_get_prio_tc_map(dev, prio); + + if (!(gate_mask & BIT(tc))) + return NULL; + + return skb; + } + + return NULL; +} + +static inline int length_to_duration(struct taprio_sched *q, int len) +{ + return (len * q->picos_per_byte) / 1000; +} + +static struct sk_buff *taprio_dequeue(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct sched_entry *entry; + struct sk_buff *skb; + u32 gate_mask; + int i; + + rcu_read_lock(); + entry = rcu_dereference(q->current_entry); + /* if there's no entry, it means that the schedule didn't + * start yet, so force all gates to be open, this is in + * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5 + * "AdminGateSates" + */ + gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; + rcu_read_unlock(); + + if (!gate_mask) + return NULL; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct Qdisc *child = q->qdiscs[i]; + ktime_t guard; + int prio; + int len; + u8 tc; + + if (unlikely(!child)) + continue; + + skb = child->ops->peek(child); + if (!skb) + continue; + + prio = skb->priority; + tc = netdev_get_prio_tc_map(dev, prio); + + if (!(gate_mask & BIT(tc))) + continue; + + len = qdisc_pkt_len(skb); + guard = ktime_add_ns(q->get_time(), + length_to_duration(q, len)); + + /* In the case that there's no gate entry, there's no + * guard band ... + */ + if (gate_mask != TAPRIO_ALL_GATES_OPEN && + ktime_after(guard, entry->close_time)) + return NULL; + + /* ... and no budget. */ + if (gate_mask != TAPRIO_ALL_GATES_OPEN && + atomic_sub_return(len, &entry->budget) < 0) + return NULL; + + skb = child->ops->dequeue(child); + if (unlikely(!skb)) + return NULL; + + qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; + + return skb; + } + + return NULL; +} + +static bool should_restart_cycle(const struct taprio_sched *q, + const struct sched_entry *entry) +{ + WARN_ON(!entry); + + return list_is_last(&entry->list, &q->entries); +} + +static enum hrtimer_restart advance_sched(struct hrtimer *timer) +{ + struct taprio_sched *q = container_of(timer, struct taprio_sched, + advance_timer); + struct sched_entry *entry, *next; + struct Qdisc *sch = q->root; + ktime_t close_time; + + spin_lock(&q->current_entry_lock); + entry = rcu_dereference_protected(q->current_entry, + lockdep_is_held(&q->current_entry_lock)); + + /* This is the case that it's the first time that the schedule + * runs, so it only happens once per schedule. The first entry + * is pre-calculated during the schedule initialization. + */ + if (unlikely(!entry)) { + next = list_first_entry(&q->entries, struct sched_entry, + list); + close_time = next->close_time; + goto first_run; + } + + if (should_restart_cycle(q, entry)) + next = list_first_entry(&q->entries, struct sched_entry, + list); + else + next = list_next_entry(entry, list); + + close_time = ktime_add_ns(entry->close_time, next->interval); + + next->close_time = close_time; + atomic_set(&next->budget, + (next->interval * 1000) / q->picos_per_byte); + +first_run: + rcu_assign_pointer(q->current_entry, next); + spin_unlock(&q->current_entry_lock); + + hrtimer_set_expires(&q->advance_timer, close_time); + + rcu_read_lock(); + __netif_schedule(sch); + rcu_read_unlock(); + + return HRTIMER_RESTART; +} + +static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { + [TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 }, + [TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 }, + [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 }, + [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 }, +}; + +static const struct nla_policy entry_list_policy[TCA_TAPRIO_SCHED_MAX + 1] = { + [TCA_TAPRIO_SCHED_ENTRY] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { + [TCA_TAPRIO_ATTR_PRIOMAP] = { + .len = sizeof(struct tc_mqprio_qopt) + }, + [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED }, + [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, + [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, + [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, +}; + +static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry, + struct netlink_ext_ack *extack) +{ + u32 interval = 0; + + if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD]) + entry->command = nla_get_u8( + tb[TCA_TAPRIO_SCHED_ENTRY_CMD]); + + if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]) + entry->gate_mask = nla_get_u32( + tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]); + + if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]) + interval = nla_get_u32( + tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]); + + if (interval == 0) { + NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); + return -EINVAL; + } + + entry->interval = interval; + + return 0; +} + +static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry, + int index, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; + int err; + + err = nla_parse_nested(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n, + entry_policy, NULL); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Could not parse nested entry"); + return -EINVAL; + } + + entry->index = index; + + return fill_sched_entry(tb, entry, extack); +} + +/* Returns the number of entries in case of success */ +static int parse_sched_single_entry(struct nlattr *n, + struct taprio_sched *q, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb_entry[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; + struct nlattr *tb_list[TCA_TAPRIO_SCHED_MAX + 1] = { }; + struct sched_entry *entry; + bool found = false; + u32 index; + int err; + + err = nla_parse_nested(tb_list, TCA_TAPRIO_SCHED_MAX, + n, entry_list_policy, NULL); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Could not parse nested entry"); + return -EINVAL; + } + + if (!tb_list[TCA_TAPRIO_SCHED_ENTRY]) { + NL_SET_ERR_MSG(extack, "Single-entry must include an entry"); + return -EINVAL; + } + + err = nla_parse_nested(tb_entry, TCA_TAPRIO_SCHED_ENTRY_MAX, + tb_list[TCA_TAPRIO_SCHED_ENTRY], + entry_policy, NULL); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Could not parse nested entry"); + return -EINVAL; + } + + if (!tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]) { + NL_SET_ERR_MSG(extack, "Entry must specify an index\n"); + return -EINVAL; + } + + index = nla_get_u32(tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]); + if (index >= q->num_entries) { + NL_SET_ERR_MSG(extack, "Index for single entry exceeds number of entries in schedule"); + return -EINVAL; + } + + list_for_each_entry(entry, &q->entries, list) { + if (entry->index == index) { + found = true; + break; + } + } + + if (!found) { + NL_SET_ERR_MSG(extack, "Could not find entry"); + return -ENOENT; + } + + err = fill_sched_entry(tb_entry, entry, extack); + if (err < 0) + return err; + + return q->num_entries; +} + +static int parse_sched_list(struct nlattr *list, + struct taprio_sched *q, + struct netlink_ext_ack *extack) +{ + struct nlattr *n; + int err, rem; + int i = 0; + + if (!list) + return -EINVAL; + + nla_for_each_nested(n, list, rem) { + struct sched_entry *entry; + + if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) { + NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'"); + continue; + } + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + NL_SET_ERR_MSG(extack, "Not enough memory for entry"); + return -ENOMEM; + } + + err = parse_sched_entry(n, entry, i, extack); + if (err < 0) { + kfree(entry); + return err; + } + + list_add_tail(&entry->list, &q->entries); + i++; + } + + q->num_entries = i; + + return i; +} + +/* Returns the number of entries in case of success */ +static int parse_taprio_opt(struct nlattr **tb, struct taprio_sched *q, + struct netlink_ext_ack *extack) +{ + int err = 0; + int clockid; + + if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] && + tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) + return -EINVAL; + + if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] && q->num_entries == 0) + return -EINVAL; + + if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) + return -EINVAL; + + if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) + q->base_time = nla_get_s64( + tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); + + if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { + clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); + + /* We only support static clockids and we don't allow + * for it to be modified after the first init. + */ + if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid)) + return -EINVAL; + + q->clockid = clockid; + } + + if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]) + err = parse_sched_list( + tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], q, extack); + else if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) + err = parse_sched_single_entry( + tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY], q, extack); + + /* parse_sched_* return the number of entries in the schedule, + * a schedule with zero entries is an error. + */ + if (err == 0) { + NL_SET_ERR_MSG(extack, "The schedule should contain at least one entry"); + return -EINVAL; + } + + return err; +} + +static int taprio_parse_mqprio_opt(struct net_device *dev, + struct tc_mqprio_qopt *qopt, + struct netlink_ext_ack *extack) +{ + int i, j; + + if (!qopt) { + NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); + return -EINVAL; + } + + /* Verify num_tc is not out of max range */ + if (qopt->num_tc > TC_MAX_QUEUE) { + NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range"); + return -EINVAL; + } + + /* taprio imposes that traffic classes map 1:n to tx queues */ + if (qopt->num_tc > dev->num_tx_queues) { + NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues"); + return -EINVAL; + } + + /* Verify priority mapping uses valid tcs */ + for (i = 0; i < TC_BITMASK + 1; i++) { + if (qopt->prio_tc_map[i] >= qopt->num_tc) { + NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping"); + return -EINVAL; + } + } + + for (i = 0; i < qopt->num_tc; i++) { + unsigned int last = qopt->offset[i] + qopt->count[i]; + + /* Verify the queue count is in tx range being equal to the + * real_num_tx_queues indicates the last queue is in use. + */ + if (qopt->offset[i] >= dev->num_tx_queues || + !qopt->count[i] || + last > dev->real_num_tx_queues) { + NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping"); + return -EINVAL; + } + + /* Verify that the offset and counts do not overlap */ + for (j = i + 1; j < qopt->num_tc; j++) { + if (last > qopt->offset[j]) { + NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping"); + return -EINVAL; + } + } + } + + return 0; +} + +static ktime_t taprio_get_start_time(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct sched_entry *entry; + ktime_t now, base, cycle; + s64 n; + + base = ns_to_ktime(q->base_time); + cycle = 0; + + /* Calculate the cycle_time, by summing all the intervals. + */ + list_for_each_entry(entry, &q->entries, list) + cycle = ktime_add_ns(cycle, entry->interval); + + if (!cycle) + return base; + + now = q->get_time(); + + if (ktime_after(base, now)) + return base; + + /* Schedule the start time for the beginning of the next + * cycle. + */ + n = div64_s64(ktime_sub_ns(now, base), cycle); + + return ktime_add_ns(base, (n + 1) * cycle); +} + +static void taprio_start_sched(struct Qdisc *sch, ktime_t start) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct sched_entry *first; + unsigned long flags; + + spin_lock_irqsave(&q->current_entry_lock, flags); + + first = list_first_entry(&q->entries, struct sched_entry, + list); + + first->close_time = ktime_add_ns(start, first->interval); + atomic_set(&first->budget, + (first->interval * 1000) / q->picos_per_byte); + rcu_assign_pointer(q->current_entry, NULL); + + spin_unlock_irqrestore(&q->current_entry_lock, flags); + + hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS); +} + +static int taprio_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { }; + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_mqprio_qopt *mqprio = NULL; + struct ethtool_link_ksettings ecmd; + int i, err, size; + s64 link_speed; + ktime_t start; + + err = nla_parse_nested(tb, TCA_TAPRIO_ATTR_MAX, opt, + taprio_policy, extack); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) + mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); + + err = taprio_parse_mqprio_opt(dev, mqprio, extack); + if (err < 0) + return err; + + /* A schedule with less than one entry is an error */ + size = parse_taprio_opt(tb, q, extack); + if (size < 0) + return size; + + hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); + q->advance_timer.function = advance_sched; + + switch (q->clockid) { + case CLOCK_REALTIME: + q->get_time = ktime_get_real; + break; + case CLOCK_MONOTONIC: + q->get_time = ktime_get; + break; + case CLOCK_BOOTTIME: + q->get_time = ktime_get_boottime; + break; + case CLOCK_TAI: + q->get_time = ktime_get_clocktai; + break; + default: + return -ENOTSUPP; + } + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *dev_queue; + struct Qdisc *qdisc; + + dev_queue = netdev_get_tx_queue(dev, i); + qdisc = qdisc_create_dflt(dev_queue, + &pfifo_qdisc_ops, + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(i + 1)), + extack); + if (!qdisc) + return -ENOMEM; + + if (i < dev->real_num_tx_queues) + qdisc_hash_add(qdisc, false); + + q->qdiscs[i] = qdisc; + } + + if (mqprio) { + netdev_set_num_tc(dev, mqprio->num_tc); + for (i = 0; i < mqprio->num_tc; i++) + netdev_set_tc_queue(dev, i, + mqprio->count[i], + mqprio->offset[i]); + + /* Always use supplied priority mappings */ + for (i = 0; i < TC_BITMASK + 1; i++) + netdev_set_prio_tc_map(dev, i, + mqprio->prio_tc_map[i]); + } + + if (!__ethtool_get_link_ksettings(dev, &ecmd)) + link_speed = ecmd.base.speed; + else + link_speed = SPEED_1000; + + q->picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8, + link_speed * 1000 * 1000); + + start = taprio_get_start_time(sch); + if (!start) + return 0; + + taprio_start_sched(sch, start); + + return 0; +} + +static void taprio_destroy(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct sched_entry *entry, *n; + unsigned int i; + + hrtimer_cancel(&q->advance_timer); + + if (q->qdiscs) { + for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++) + qdisc_put(q->qdiscs[i]); + + kfree(q->qdiscs); + } + q->qdiscs = NULL; + + netdev_set_num_tc(dev, 0); + + list_for_each_entry_safe(entry, n, &q->entries, list) { + list_del(&entry->list); + kfree(entry); + } +} + +static int taprio_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + + INIT_LIST_HEAD(&q->entries); + spin_lock_init(&q->current_entry_lock); + + /* We may overwrite the configuration later */ + hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); + + q->root = sch; + + /* We only support static clockids. Use an invalid value as default + * and get the valid one on taprio_change(). + */ + q->clockid = -1; + + if (sch->parent != TC_H_ROOT) + return -EOPNOTSUPP; + + if (!netif_is_multiqueue(dev)) + return -EOPNOTSUPP; + + /* pre-allocate qdisc, attachment can't fail */ + q->qdiscs = kcalloc(dev->num_tx_queues, + sizeof(q->qdiscs[0]), + GFP_KERNEL); + + if (!q->qdiscs) + return -ENOMEM; + + if (!opt) + return -EINVAL; + + return taprio_change(sch, opt, extack); +} + +static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, + unsigned long cl) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx = cl - 1; + + if (ntx >= dev->num_tx_queues) + return NULL; + + return netdev_get_tx_queue(dev, ntx); +} + +static int taprio_graft(struct Qdisc *sch, unsigned long cl, + struct Qdisc *new, struct Qdisc **old, + struct netlink_ext_ack *extack) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + + if (!dev_queue) + return -EINVAL; + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + *old = q->qdiscs[cl - 1]; + q->qdiscs[cl - 1] = new; + + if (new) + new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; + + if (dev->flags & IFF_UP) + dev_activate(dev); + + return 0; +} + +static int dump_entry(struct sk_buff *msg, + const struct sched_entry *entry) +{ + struct nlattr *item; + + item = nla_nest_start(msg, TCA_TAPRIO_SCHED_ENTRY); + if (!item) + return -ENOSPC; + + if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index)) + goto nla_put_failure; + + if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command)) + goto nla_put_failure; + + if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, + entry->gate_mask)) + goto nla_put_failure; + + if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL, + entry->interval)) + goto nla_put_failure; + + return nla_nest_end(msg, item); + +nla_put_failure: + nla_nest_cancel(msg, item); + return -1; +} + +static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_mqprio_qopt opt = { 0 }; + struct nlattr *nest, *entry_list; + struct sched_entry *entry; + unsigned int i; + + opt.num_tc = netdev_get_num_tc(dev); + memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); + + for (i = 0; i < netdev_get_num_tc(dev); i++) { + opt.count[i] = dev->tc_to_txq[i].count; + opt.offset[i] = dev->tc_to_txq[i].offset; + } + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + return -ENOSPC; + + if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt)) + goto options_error; + + if (nla_put_s64(skb, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, + q->base_time, TCA_TAPRIO_PAD)) + goto options_error; + + if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) + goto options_error; + + entry_list = nla_nest_start(skb, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); + if (!entry_list) + goto options_error; + + list_for_each_entry(entry, &q->entries, list) { + if (dump_entry(skb, entry) < 0) + goto options_error; + } + + nla_nest_end(skb, entry_list); + + return nla_nest_end(skb, nest); + +options_error: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) +{ + struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + + if (!dev_queue) + return NULL; + + return dev_queue->qdisc_sleeping; +} + +static unsigned long taprio_find(struct Qdisc *sch, u32 classid) +{ + unsigned int ntx = TC_H_MIN(classid); + + if (!taprio_queue_get(sch, ntx)) + return 0; + return ntx; +} + +static int taprio_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle |= TC_H_MIN(cl); + tcm->tcm_info = dev_queue->qdisc_sleeping->handle; + + return 0; +} + +static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) + __releases(d->lock) + __acquires(d->lock) +{ + struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; + if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || + gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) + return -1; + return 0; +} + +static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx; + + if (arg->stop) + return; + + arg->count = arg->skip; + for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { + if (arg->fn(sch, ntx + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct netdev_queue *taprio_select_queue(struct Qdisc *sch, + struct tcmsg *tcm) +{ + return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); +} + +static const struct Qdisc_class_ops taprio_class_ops = { + .graft = taprio_graft, + .leaf = taprio_leaf, + .find = taprio_find, + .walk = taprio_walk, + .dump = taprio_dump_class, + .dump_stats = taprio_dump_class_stats, + .select_queue = taprio_select_queue, +}; + +static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { + .cl_ops = &taprio_class_ops, + .id = "taprio", + .priv_size = sizeof(struct taprio_sched), + .init = taprio_init, + .destroy = taprio_destroy, + .peek = taprio_peek, + .dequeue = taprio_dequeue, + .enqueue = taprio_enqueue, + .dump = taprio_dump, + .owner = THIS_MODULE, +}; + +static int __init taprio_module_init(void) +{ + return register_qdisc(&taprio_qdisc_ops); +} + +static void __exit taprio_module_exit(void) +{ + unregister_qdisc(&taprio_qdisc_ops); +} + +module_init(taprio_module_init); +module_exit(taprio_module_exit); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 6f74a426f159..942dcca09cf2 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -162,7 +162,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, nb = 0; while (segs) { nskb = segs->next; - segs->next = NULL; + skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; len += segs->len; ret = qdisc_enqueue(segs, q->qdisc, to_free); @@ -392,7 +392,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, if (child) { qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, q->qdisc->qstats.backlog); - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); q->qdisc = child; } q->limit = qopt->limit; @@ -438,7 +438,7 @@ static void tbf_destroy(struct Qdisc *sch) struct tbf_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); - qdisc_destroy(q->qdisc); + qdisc_put(q->qdisc); } static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 297d9cf960b9..6a28b96e779e 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -499,8 +499,9 @@ void sctp_assoc_set_primary(struct sctp_association *asoc, void sctp_assoc_rm_peer(struct sctp_association *asoc, struct sctp_transport *peer) { - struct list_head *pos; - struct sctp_transport *transport; + struct sctp_transport *transport; + struct list_head *pos; + struct sctp_chunk *ch; pr_debug("%s: association:%p addr:%pISpc\n", __func__, asoc, &peer->ipaddr.sa); @@ -564,7 +565,6 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, */ if (!list_empty(&peer->transmitted)) { struct sctp_transport *active = asoc->peer.active_path; - struct sctp_chunk *ch; /* Reset the transport of each chunk on this list */ list_for_each_entry(ch, &peer->transmitted, @@ -586,6 +586,10 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, sctp_transport_hold(active); } + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) + if (ch->transport == peer) + ch->transport = NULL; + asoc->peer.transport_count--; sctp_transport_free(peer); @@ -1450,7 +1454,8 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc) /* Get the lowest pmtu of all the transports. */ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (t->pmtu_pending && t->dst) { - sctp_transport_update_pmtu(t, sctp_dst_mtu(t->dst)); + sctp_transport_update_pmtu(t, + atomic_read(&t->mtu_info)); t->pmtu_pending = 0; } if (!pmtu || (t->pathmtu < pmtu)) diff --git a/net/sctp/input.c b/net/sctp/input.c index 9bbc5f92c941..5c36a99882ed 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -395,6 +395,7 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, return; if (sock_owned_by_user(sk)) { + atomic_set(&t->mtu_info, pmtu); asoc->pmtu_pending = 1; t->pmtu_pending = 1; return; diff --git a/net/sctp/output.c b/net/sctp/output.c index 7f849b01ec8e..67939ad99c01 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -120,6 +120,12 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, sctp_assoc_sync_pmtu(asoc); } + if (asoc->pmtu_pending) { + if (asoc->param_flags & SPP_PMTUD_ENABLE) + sctp_assoc_sync_pmtu(asoc); + asoc->pmtu_pending = 0; + } + /* If there a is a prepend chunk stick it on the list before * any other chunks get appended. */ diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index d74d00b29942..c37e1c2dec9d 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -212,7 +212,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); - sctp_sched_set_sched(asoc, SCTP_SS_FCFS); + sctp_sched_set_sched(asoc, SCTP_SS_DEFAULT); } /* Free the outqueue structure and any related pending chunks. @@ -385,9 +385,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, asoc->outqueue.outstanding_bytes -= sctp_data_size(chk); } - msg_len -= SCTP_DATA_SNDSIZE(chk) + - sizeof(struct sk_buff) + - sizeof(struct sctp_chunk); + msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk); if (msg_len <= 0) break; } @@ -421,9 +419,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; } - msg_len -= SCTP_DATA_SNDSIZE(chk) + - sizeof(struct sk_buff) + - sizeof(struct sctp_chunk); + msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk); sctp_chunk_free(chk); if (msg_len <= 0) break; @@ -1048,7 +1044,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx, if (!ctx->packet || !ctx->packet->has_cookie_echo) return; - /* fallthru */ + /* fall through */ case SCTP_STATE_ESTABLISHED: case SCTP_STATE_SHUTDOWN_PENDING: case SCTP_STATE_SHUTDOWN_RECEIVED: diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index e948db29ab53..9b277bd36d1a 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/sctp/socket.c b/net/sctp/socket.c index f73e9d38d5ba..739f3e50120d 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -83,7 +83,7 @@ #include /* Forward declarations for internal helper functions. */ -static int sctp_writeable(struct sock *sk); +static bool sctp_writeable(struct sock *sk); static void sctp_wfree(struct sk_buff *skb); static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, size_t msg_len); @@ -119,25 +119,10 @@ static void sctp_enter_memory_pressure(struct sock *sk) /* Get the sndbuf space available at the time on the association. */ static inline int sctp_wspace(struct sctp_association *asoc) { - int amt; + struct sock *sk = asoc->base.sk; - if (asoc->ep->sndbuf_policy) - amt = asoc->sndbuf_used; - else - amt = sk_wmem_alloc_get(asoc->base.sk); - - if (amt >= asoc->base.sk->sk_sndbuf) { - if (asoc->base.sk->sk_userlocks & SOCK_SNDBUF_LOCK) - amt = 0; - else { - amt = sk_stream_wspace(asoc->base.sk); - if (amt < 0) - amt = 0; - } - } else { - amt = asoc->base.sk->sk_sndbuf - amt; - } - return amt; + return asoc->ep->sndbuf_policy ? sk->sk_sndbuf - asoc->sndbuf_used + : sk_stream_wspace(sk); } /* Increment the used sndbuf space count of the corresponding association by @@ -166,12 +151,9 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk) /* Save the chunk pointer in skb for sctp_wfree to use later. */ skb_shinfo(chunk->skb)->destructor_arg = chunk; - asoc->sndbuf_used += SCTP_DATA_SNDSIZE(chunk) + - sizeof(struct sk_buff) + - sizeof(struct sctp_chunk); - refcount_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc); - sk->sk_wmem_queued += chunk->skb->truesize; + asoc->sndbuf_used += chunk->skb->truesize + sizeof(struct sctp_chunk); + sk->sk_wmem_queued += chunk->skb->truesize + sizeof(struct sctp_chunk); sk_mem_charge(sk, chunk->skb->truesize); } @@ -271,11 +253,10 @@ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id) spin_lock_bh(&sctp_assocs_id_lock); asoc = (struct sctp_association *)idr_find(&sctp_assocs_id, (int)id); + if (asoc && (asoc->base.sk != sk || asoc->base.dead)) + asoc = NULL; spin_unlock_bh(&sctp_assocs_id_lock); - if (!asoc || (asoc->base.sk != sk) || asoc->base.dead) - return NULL; - return asoc; } @@ -1928,10 +1909,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, asoc->pmtu_pending = 0; } - if (sctp_wspace(asoc) < msg_len) + if (sctp_wspace(asoc) < (int)msg_len) sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); - if (!sctp_wspace(asoc)) { + if (sctp_wspace(asoc) <= 0) { timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); if (err) @@ -1946,8 +1927,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, if (sp->strm_interleave) { timeo = sock_sndtimeo(sk, 0); err = sctp_wait_for_connect(asoc, &timeo); - if (err) + if (err) { + err = -ESRCH; goto err; + } } else { wait_connect = true; } @@ -7100,14 +7083,15 @@ static int sctp_getsockopt_pr_assocstatus(struct sock *sk, int len, } policy = params.sprstat_policy; - if (policy & ~SCTP_PR_SCTP_MASK) + if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)) || + ((policy & SCTP_PR_SCTP_ALL) && (policy & SCTP_PR_SCTP_MASK))) goto out; asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); if (!asoc) goto out; - if (policy == SCTP_PR_SCTP_NONE) { + if (policy == SCTP_PR_SCTP_ALL) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { @@ -7159,7 +7143,8 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, } policy = params.sprstat_policy; - if (policy & ~SCTP_PR_SCTP_MASK) + if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)) || + ((policy & SCTP_PR_SCTP_ALL) && (policy & SCTP_PR_SCTP_MASK))) goto out; asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); @@ -7175,7 +7160,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, goto out; } - if (policy == SCTP_PR_SCTP_NONE) { + if (policy == SCTP_PR_SCTP_ALL) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { @@ -8460,17 +8445,11 @@ static void sctp_wfree(struct sk_buff *skb) struct sctp_association *asoc = chunk->asoc; struct sock *sk = asoc->base.sk; - asoc->sndbuf_used -= SCTP_DATA_SNDSIZE(chunk) + - sizeof(struct sk_buff) + - sizeof(struct sctp_chunk); - - WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc)); - - /* - * This undoes what is done via sctp_set_owner_w and sk_mem_charge - */ - sk->sk_wmem_queued -= skb->truesize; sk_mem_uncharge(sk, skb->truesize); + sk->sk_wmem_queued -= skb->truesize + sizeof(struct sctp_chunk); + asoc->sndbuf_used -= skb->truesize + sizeof(struct sctp_chunk); + WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk), + &sk->sk_wmem_alloc)); if (chunk->shkey) { struct sctp_shared_key *shkey = chunk->shkey; @@ -8544,7 +8523,7 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, goto do_error; if (signal_pending(current)) goto do_interrupted; - if (msg_len <= sctp_wspace(asoc)) + if ((int)msg_len <= sctp_wspace(asoc)) break; /* Let another process have a go. Since we are going @@ -8619,14 +8598,9 @@ void sctp_write_space(struct sock *sk) * UDP-style sockets or TCP-style sockets, this code should work. * - Daisy */ -static int sctp_writeable(struct sock *sk) +static bool sctp_writeable(struct sock *sk) { - int amt = 0; - - amt = sk->sk_sndbuf - sk_wmem_alloc_get(sk); - if (amt < 0) - amt = 0; - return amt; + return sk->sk_sndbuf > sk->sk_wmem_queued; } /* Wait for an association to go into ESTABLISHED state. If timeout is 0, diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 0b427100b0d4..331cc734e3db 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -459,7 +459,7 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ul * element in the queue, then count it towards * possible PD. */ - if (pos == ulpq->reasm.next) { + if (skb_queue_is_first(&ulpq->reasm, pos)) { pd_first = pos; pd_last = pos; pd_len = pos->len; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 015231789ed2..80e2119f1c70 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1543,7 +1543,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, mask |= EPOLLERR; } else { if (sk->sk_state != SMC_CLOSED) - sock_poll_wait(file, wait); + sock_poll_wait(file, sock, wait); if (sk->sk_err) mask |= EPOLLERR; if ((sk->sk_shutdown == SHUTDOWN_MASK) || diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 52241d679cc9..89c3a8c7859a 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -286,7 +286,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, */ krflags = MSG_PEEK | MSG_WAITALL; smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, + iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, sizeof(struct smc_clc_msg_hdr)); len = sock_recvmsg(smc->clcsock, &msg, krflags); if (signal_pending(current)) { @@ -325,7 +325,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, /* receive the complete CLC message */ memset(&msg, 0, sizeof(struct msghdr)); - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, datlen); + iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, datlen); krflags = MSG_WAITALL; len = sock_recvmsg(smc->clcsock, &msg, krflags); if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) { diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index e871368500e3..18daebcef181 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -122,22 +122,17 @@ static void __smc_lgr_unregister_conn(struct smc_connection *conn) sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */ } -/* Unregister connection and trigger lgr freeing if applicable +/* Unregister connection from lgr */ static void smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; - int reduced = 0; write_lock_bh(&lgr->conns_lock); if (conn->alert_token_local) { - reduced = 1; __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); - if (!reduced || lgr->conns_num) - return; - smc_lgr_schedule_free_work(lgr); } /* Send delete link, either as client to request the initiation @@ -291,7 +286,8 @@ out: return rc; } -static void smc_buf_unuse(struct smc_connection *conn) +static void smc_buf_unuse(struct smc_connection *conn, + struct smc_link_group *lgr) { if (conn->sndbuf_desc) conn->sndbuf_desc->used = 0; @@ -301,8 +297,6 @@ static void smc_buf_unuse(struct smc_connection *conn) conn->rmb_desc->used = 0; } else { /* buf registration failed, reuse not possible */ - struct smc_link_group *lgr = conn->lgr; - write_lock_bh(&lgr->rmbs_lock); list_del(&conn->rmb_desc->list); write_unlock_bh(&lgr->rmbs_lock); @@ -315,16 +309,21 @@ static void smc_buf_unuse(struct smc_connection *conn) /* remove a finished connection from its link group */ void smc_conn_free(struct smc_connection *conn) { - if (!conn->lgr) + struct smc_link_group *lgr = conn->lgr; + + if (!lgr) return; - if (conn->lgr->is_smcd) { + if (lgr->is_smcd) { smc_ism_unset_conn(conn); tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_tx_dismiss_slots(conn); } - smc_lgr_unregister_conn(conn); - smc_buf_unuse(conn); + smc_lgr_unregister_conn(conn); /* unsets conn->lgr */ + smc_buf_unuse(conn, lgr); /* allow buffer reuse */ + + if (!lgr->conns_num) + smc_lgr_schedule_free_work(lgr); } static void smc_link_clear(struct smc_link *lnk) diff --git a/net/socket.c b/net/socket.c index 01f3f8f32d6f..593826e11a53 100644 --- a/net/socket.c +++ b/net/socket.c @@ -635,7 +635,7 @@ EXPORT_SYMBOL(sock_sendmsg); int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { - iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, num, size); + iov_iter_kvec(&msg->msg_iter, WRITE, vec, num, size); return sock_sendmsg(sock, msg); } EXPORT_SYMBOL(kernel_sendmsg); @@ -648,7 +648,7 @@ int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, if (!sock->ops->sendmsg_locked) return sock_no_sendmsg_locked(sk, msg, size); - iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, num, size); + iov_iter_kvec(&msg->msg_iter, WRITE, vec, num, size); return sock->ops->sendmsg_locked(sk, msg, msg_data_left(msg)); } @@ -823,7 +823,7 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, mm_segment_t oldfs = get_fs(); int result; - iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size); + iov_iter_kvec(&msg->msg_iter, READ, vec, num, size); set_fs(KERNEL_DS); result = sock_recvmsg(sock, msg, flags); set_fs(oldfs); @@ -1475,7 +1475,7 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { err = move_addr_to_kernel(umyaddr, addrlen, &address); - if (err >= 0) { + if (!err) { err = security_socket_bind(sock, (struct sockaddr *)&address, addrlen); @@ -2342,7 +2342,7 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg, */ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, - unsigned int flags, struct timespec *timeout) + unsigned int flags, struct timespec64 *timeout) { int fput_needed, err, datagrams; struct socket *sock; @@ -2407,8 +2407,7 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, if (timeout) { ktime_get_ts64(&timeout64); - *timeout = timespec64_to_timespec( - timespec64_sub(end_time, timeout64)); + *timeout = timespec64_sub(end_time, timeout64); if (timeout->tv_sec < 0) { timeout->tv_sec = timeout->tv_nsec = 0; break; @@ -2454,10 +2453,10 @@ out_put: static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, - struct timespec __user *timeout) + struct __kernel_timespec __user *timeout) { int datagrams; - struct timespec timeout_sys; + struct timespec64 timeout_sys; if (flags & MSG_CMSG_COMPAT) return -EINVAL; @@ -2465,13 +2464,12 @@ static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, if (!timeout) return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL); - if (copy_from_user(&timeout_sys, timeout, sizeof(timeout_sys))) + if (get_timespec64(&timeout_sys, timeout)) return -EFAULT; datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); - if (datagrams > 0 && - copy_to_user(timeout, &timeout_sys, sizeof(timeout_sys))) + if (datagrams > 0 && put_timespec64(&timeout_sys, timeout)) datagrams = -EFAULT; return datagrams; @@ -2479,7 +2477,7 @@ static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, - struct timespec __user *, timeout) + struct __kernel_timespec __user *, timeout) { return do_sys_recvmmsg(fd, mmsg, vlen, flags, timeout); } @@ -2603,7 +2601,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) break; case SYS_RECVMMSG: err = do_sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], - a[3], (struct timespec __user *)a[4]); + a[3], (struct __kernel_timespec __user *)a[4]); break; case SYS_ACCEPT4: err = __sys_accept4(a0, (struct sockaddr __user *)a1, @@ -2875,9 +2873,14 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) copy_in_user(&rxnfc->fs.ring_cookie, &compat_rxnfc->fs.ring_cookie, (void __user *)(&rxnfc->fs.location + 1) - - (void __user *)&rxnfc->fs.ring_cookie) || - copy_in_user(&rxnfc->rule_cnt, &compat_rxnfc->rule_cnt, - sizeof(rxnfc->rule_cnt))) + (void __user *)&rxnfc->fs.ring_cookie)) + return -EFAULT; + if (ethcmd == ETHTOOL_GRXCLSRLALL) { + if (put_user(rule_cnt, &rxnfc->rule_cnt)) + return -EFAULT; + } else if (copy_in_user(&rxnfc->rule_cnt, + &compat_rxnfc->rule_cnt, + sizeof(rxnfc->rule_cnt))) return -EFAULT; } diff --git a/net/strparser/Kconfig b/net/strparser/Kconfig index 6cff3f6d0c3a..94da19a2a220 100644 --- a/net/strparser/Kconfig +++ b/net/strparser/Kconfig @@ -1,4 +1,2 @@ - config STREAM_PARSER - tristate - default n + def_bool n diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 305ecea92170..ad8ead738981 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -30,10 +30,9 @@ struct rpc_cred_cache { static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS; -static DEFINE_SPINLOCK(rpc_authflavor_lock); -static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = { - &authnull_ops, /* AUTH_NULL */ - &authunix_ops, /* AUTH_UNIX */ +static const struct rpc_authops __rcu *auth_flavors[RPC_AUTH_MAXFLAVOR] = { + [RPC_AUTH_NULL] = (const struct rpc_authops __force __rcu *)&authnull_ops, + [RPC_AUTH_UNIX] = (const struct rpc_authops __force __rcu *)&authunix_ops, NULL, /* others can be loadable modules */ }; @@ -93,39 +92,65 @@ pseudoflavor_to_flavor(u32 flavor) { int rpcauth_register(const struct rpc_authops *ops) { + const struct rpc_authops *old; rpc_authflavor_t flavor; - int ret = -EPERM; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; - spin_lock(&rpc_authflavor_lock); - if (auth_flavors[flavor] == NULL) { - auth_flavors[flavor] = ops; - ret = 0; - } - spin_unlock(&rpc_authflavor_lock); - return ret; + old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], NULL, ops); + if (old == NULL || old == ops) + return 0; + return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_register); int rpcauth_unregister(const struct rpc_authops *ops) { + const struct rpc_authops *old; rpc_authflavor_t flavor; - int ret = -EPERM; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; - spin_lock(&rpc_authflavor_lock); - if (auth_flavors[flavor] == ops) { - auth_flavors[flavor] = NULL; - ret = 0; - } - spin_unlock(&rpc_authflavor_lock); - return ret; + + old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], ops, NULL); + if (old == ops || old == NULL) + return 0; + return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_unregister); +static const struct rpc_authops * +rpcauth_get_authops(rpc_authflavor_t flavor) +{ + const struct rpc_authops *ops; + + if (flavor >= RPC_AUTH_MAXFLAVOR) + return NULL; + + rcu_read_lock(); + ops = rcu_dereference(auth_flavors[flavor]); + if (ops == NULL) { + rcu_read_unlock(); + request_module("rpc-auth-%u", flavor); + rcu_read_lock(); + ops = rcu_dereference(auth_flavors[flavor]); + if (ops == NULL) + goto out; + } + if (!try_module_get(ops->owner)) + ops = NULL; +out: + rcu_read_unlock(); + return ops; +} + +static void +rpcauth_put_authops(const struct rpc_authops *ops) +{ + module_put(ops->owner); +} + /** * rpcauth_get_pseudoflavor - check if security flavor is supported * @flavor: a security flavor @@ -138,25 +163,16 @@ EXPORT_SYMBOL_GPL(rpcauth_unregister); rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info) { - const struct rpc_authops *ops; + const struct rpc_authops *ops = rpcauth_get_authops(flavor); rpc_authflavor_t pseudoflavor; - ops = auth_flavors[flavor]; - if (ops == NULL) - request_module("rpc-auth-%u", flavor); - spin_lock(&rpc_authflavor_lock); - ops = auth_flavors[flavor]; - if (ops == NULL || !try_module_get(ops->owner)) { - spin_unlock(&rpc_authflavor_lock); + if (!ops) return RPC_AUTH_MAXFLAVOR; - } - spin_unlock(&rpc_authflavor_lock); - pseudoflavor = flavor; if (ops->info2flavor != NULL) pseudoflavor = ops->info2flavor(info); - module_put(ops->owner); + rpcauth_put_authops(ops); return pseudoflavor; } EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor); @@ -176,25 +192,15 @@ rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info) const struct rpc_authops *ops; int result; - if (flavor >= RPC_AUTH_MAXFLAVOR) - return -EINVAL; - - ops = auth_flavors[flavor]; + ops = rpcauth_get_authops(flavor); if (ops == NULL) - request_module("rpc-auth-%u", flavor); - spin_lock(&rpc_authflavor_lock); - ops = auth_flavors[flavor]; - if (ops == NULL || !try_module_get(ops->owner)) { - spin_unlock(&rpc_authflavor_lock); return -ENOENT; - } - spin_unlock(&rpc_authflavor_lock); result = -ENOENT; if (ops->flavor2info != NULL) result = ops->flavor2info(pseudoflavor, info); - module_put(ops->owner); + rpcauth_put_authops(ops); return result; } EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); @@ -212,15 +218,13 @@ EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); int rpcauth_list_flavors(rpc_authflavor_t *array, int size) { - rpc_authflavor_t flavor; - int result = 0; + const struct rpc_authops *ops; + rpc_authflavor_t flavor, pseudos[4]; + int i, len, result = 0; - spin_lock(&rpc_authflavor_lock); + rcu_read_lock(); for (flavor = 0; flavor < RPC_AUTH_MAXFLAVOR; flavor++) { - const struct rpc_authops *ops = auth_flavors[flavor]; - rpc_authflavor_t pseudos[4]; - int i, len; - + ops = rcu_dereference(auth_flavors[flavor]); if (result >= size) { result = -ENOMEM; break; @@ -245,7 +249,7 @@ rpcauth_list_flavors(rpc_authflavor_t *array, int size) array[result++] = pseudos[i]; } } - spin_unlock(&rpc_authflavor_lock); + rcu_read_unlock(); dprintk("RPC: %s returns %d\n", __func__, result); return result; @@ -255,25 +259,17 @@ EXPORT_SYMBOL_GPL(rpcauth_list_flavors); struct rpc_auth * rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { - struct rpc_auth *auth; + struct rpc_auth *auth = ERR_PTR(-EINVAL); const struct rpc_authops *ops; - u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor); + u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor); - auth = ERR_PTR(-EINVAL); - if (flavor >= RPC_AUTH_MAXFLAVOR) + ops = rpcauth_get_authops(flavor); + if (ops == NULL) goto out; - if ((ops = auth_flavors[flavor]) == NULL) - request_module("rpc-auth-%u", flavor); - spin_lock(&rpc_authflavor_lock); - ops = auth_flavors[flavor]; - if (ops == NULL || !try_module_get(ops->owner)) { - spin_unlock(&rpc_authflavor_lock); - goto out; - } - spin_unlock(&rpc_authflavor_lock); auth = ops->create(args, clnt); - module_put(ops->owner); + + rpcauth_put_authops(ops); if (IS_ERR(auth)) return auth; if (clnt->cl_auth) @@ -288,32 +284,37 @@ EXPORT_SYMBOL_GPL(rpcauth_create); void rpcauth_release(struct rpc_auth *auth) { - if (!atomic_dec_and_test(&auth->au_count)) + if (!refcount_dec_and_test(&auth->au_count)) return; auth->au_ops->destroy(auth); } static DEFINE_SPINLOCK(rpc_credcache_lock); -static void +/* + * On success, the caller is responsible for freeing the reference + * held by the hashtable + */ +static bool rpcauth_unhash_cred_locked(struct rpc_cred *cred) { + if (!test_and_clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) + return false; hlist_del_rcu(&cred->cr_hash); - smp_mb__before_atomic(); - clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); + return true; } -static int +static bool rpcauth_unhash_cred(struct rpc_cred *cred) { spinlock_t *cache_lock; - int ret; + bool ret; + if (!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) + return false; cache_lock = &cred->cr_auth->au_credcache->lock; spin_lock(cache_lock); - ret = atomic_read(&cred->cr_count) == 0; - if (ret) - rpcauth_unhash_cred_locked(cred); + ret = rpcauth_unhash_cred_locked(cred); spin_unlock(cache_lock); return ret; } @@ -392,6 +393,44 @@ void rpcauth_destroy_credlist(struct list_head *head) } } +static void +rpcauth_lru_add_locked(struct rpc_cred *cred) +{ + if (!list_empty(&cred->cr_lru)) + return; + number_cred_unused++; + list_add_tail(&cred->cr_lru, &cred_unused); +} + +static void +rpcauth_lru_add(struct rpc_cred *cred) +{ + if (!list_empty(&cred->cr_lru)) + return; + spin_lock(&rpc_credcache_lock); + rpcauth_lru_add_locked(cred); + spin_unlock(&rpc_credcache_lock); +} + +static void +rpcauth_lru_remove_locked(struct rpc_cred *cred) +{ + if (list_empty(&cred->cr_lru)) + return; + number_cred_unused--; + list_del_init(&cred->cr_lru); +} + +static void +rpcauth_lru_remove(struct rpc_cred *cred) +{ + if (list_empty(&cred->cr_lru)) + return; + spin_lock(&rpc_credcache_lock); + rpcauth_lru_remove_locked(cred); + spin_unlock(&rpc_credcache_lock); +} + /* * Clear the RPC credential cache, and delete those credentials * that are not referenced. @@ -411,13 +450,10 @@ rpcauth_clear_credcache(struct rpc_cred_cache *cache) head = &cache->hashtable[i]; while (!hlist_empty(head)) { cred = hlist_entry(head->first, struct rpc_cred, cr_hash); - get_rpccred(cred); - if (!list_empty(&cred->cr_lru)) { - list_del(&cred->cr_lru); - number_cred_unused--; - } - list_add_tail(&cred->cr_lru, &free); rpcauth_unhash_cred_locked(cred); + /* Note: We now hold a reference to cred */ + rpcauth_lru_remove_locked(cred); + list_add_tail(&cred->cr_lru, &free); } } spin_unlock(&cache->lock); @@ -451,7 +487,6 @@ EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache); static long rpcauth_prune_expired(struct list_head *free, int nr_to_scan) { - spinlock_t *cache_lock; struct rpc_cred *cred, *next; unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM; long freed = 0; @@ -460,32 +495,24 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) if (nr_to_scan-- == 0) break; + if (refcount_read(&cred->cr_count) > 1) { + rpcauth_lru_remove_locked(cred); + continue; + } /* * Enforce a 60 second garbage collection moratorium * Note that the cred_unused list must be time-ordered. */ - if (time_in_range(cred->cr_expire, expired, jiffies) && - test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) { - freed = SHRINK_STOP; - break; - } - - list_del_init(&cred->cr_lru); - number_cred_unused--; - freed++; - if (atomic_read(&cred->cr_count) != 0) + if (!time_in_range(cred->cr_expire, expired, jiffies)) + continue; + if (!rpcauth_unhash_cred(cred)) continue; - cache_lock = &cred->cr_auth->au_credcache->lock; - spin_lock(cache_lock); - if (atomic_read(&cred->cr_count) == 0) { - get_rpccred(cred); - list_add_tail(&cred->cr_lru, free); - rpcauth_unhash_cred_locked(cred); - } - spin_unlock(cache_lock); + rpcauth_lru_remove_locked(cred); + freed++; + list_add_tail(&cred->cr_lru, free); } - return freed; + return freed ? freed : SHRINK_STOP; } static unsigned long @@ -561,19 +588,15 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; if (flags & RPCAUTH_LOOKUP_RCU) { - if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) && - !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags)) - cred = entry; + if (test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags) || + refcount_read(&entry->cr_count) == 0) + continue; + cred = entry; break; } - spin_lock(&cache->lock); - if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) { - spin_unlock(&cache->lock); - continue; - } cred = get_rpccred(entry); - spin_unlock(&cache->lock); - break; + if (cred) + break; } rcu_read_unlock(); @@ -594,11 +617,13 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; cred = get_rpccred(entry); - break; + if (cred) + break; } if (cred == NULL) { cred = new; set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); + refcount_inc(&cred->cr_count); hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]); } else list_add_tail(&new->cr_lru, &free); @@ -645,7 +670,7 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, { INIT_HLIST_NODE(&cred->cr_hash); INIT_LIST_HEAD(&cred->cr_lru); - atomic_set(&cred->cr_count, 1); + refcount_set(&cred->cr_count, 1); cred->cr_auth = auth; cred->cr_ops = ops; cred->cr_expire = jiffies; @@ -713,36 +738,29 @@ put_rpccred(struct rpc_cred *cred) { if (cred == NULL) return; - /* Fast path for unhashed credentials */ - if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) { - if (atomic_dec_and_test(&cred->cr_count)) - cred->cr_ops->crdestroy(cred); - return; - } - - if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock)) - return; - if (!list_empty(&cred->cr_lru)) { - number_cred_unused--; - list_del_init(&cred->cr_lru); - } - if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) { - if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) { - cred->cr_expire = jiffies; - list_add_tail(&cred->cr_lru, &cred_unused); - number_cred_unused++; - goto out_nodestroy; - } - if (!rpcauth_unhash_cred(cred)) { - /* We were hashed and someone looked us up... */ - goto out_nodestroy; - } + rcu_read_lock(); + if (refcount_dec_and_test(&cred->cr_count)) + goto destroy; + if (refcount_read(&cred->cr_count) != 1 || + !test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) + goto out; + if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) { + cred->cr_expire = jiffies; + rpcauth_lru_add(cred); + /* Race breaker */ + if (unlikely(!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))) + rpcauth_lru_remove(cred); + } else if (rpcauth_unhash_cred(cred)) { + rpcauth_lru_remove(cred); + if (refcount_dec_and_test(&cred->cr_count)) + goto destroy; } - spin_unlock(&rpc_credcache_lock); - cred->cr_ops->crdestroy(cred); +out: + rcu_read_unlock(); return; -out_nodestroy: - spin_unlock(&rpc_credcache_lock); +destroy: + rcu_read_unlock(); + cred->cr_ops->crdestroy(cred); } EXPORT_SYMBOL_GPL(put_rpccred); @@ -817,6 +835,16 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, return rpcauth_unwrap_req_decode(decode, rqstp, data, obj); } +bool +rpcauth_xmit_need_reencode(struct rpc_task *task) +{ + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + + if (!cred || !cred->cr_ops->crneed_reencode) + return false; + return cred->cr_ops->crneed_reencode(task); +} + int rpcauth_refreshcred(struct rpc_task *task) { diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index f1df9837f1ac..d8831b988b1e 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -274,7 +274,7 @@ static const struct rpc_authops generic_auth_ops = { static struct rpc_auth generic_auth = { .au_ops = &generic_auth_ops, - .au_count = ATOMIC_INIT(0), + .au_count = REFCOUNT_INIT(1), }; static bool generic_key_to_expire(struct rpc_cred *cred) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 21c0aa0a0d1d..30f970cdc7f6 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1058,7 +1058,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) auth->au_flavor = flavor; if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) auth->au_flags |= RPCAUTH_AUTH_DATATOUCH; - atomic_set(&auth->au_count, 1); + refcount_set(&auth->au_count, 1); kref_init(&gss_auth->kref); err = rpcauth_init_credcache(auth); @@ -1187,7 +1187,7 @@ gss_auth_find_or_add_hashed(const struct rpc_auth_create_args *args, if (strcmp(gss_auth->target_name, args->target_name)) continue; } - if (!atomic_inc_not_zero(&gss_auth->rpc_auth.au_count)) + if (!refcount_inc_not_zero(&gss_auth->rpc_auth.au_count)) continue; goto out; } @@ -1984,6 +1984,46 @@ gss_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp, return decode(rqstp, &xdr, obj); } +static bool +gss_seq_is_newer(u32 new, u32 old) +{ + return (s32)(new - old) > 0; +} + +static bool +gss_xmit_need_reencode(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_cred *cred = req->rq_cred; + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + u32 win, seq_xmit; + bool ret = true; + + if (!ctx) + return true; + + if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq))) + goto out; + + seq_xmit = READ_ONCE(ctx->gc_seq_xmit); + while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) { + u32 tmp = seq_xmit; + + seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno); + if (seq_xmit == tmp) { + ret = false; + goto out; + } + } + + win = ctx->gc_win; + if (win > 0) + ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win); +out: + gss_put_ctx(ctx); + return ret; +} + static int gss_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, __be32 *p, void *obj) @@ -2052,6 +2092,7 @@ static const struct rpc_credops gss_credops = { .crunwrap_resp = gss_unwrap_resp, .crkey_timeout = gss_key_timeout, .crstringify_acceptor = gss_stringify_acceptor, + .crneed_reencode = gss_xmit_need_reencode, }; static const struct rpc_credops gss_nullops = { diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 0220e1ca5280..4f43383971ba 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -53,7 +53,7 @@ u32 krb5_encrypt( - struct crypto_skcipher *tfm, + struct crypto_sync_skcipher *tfm, void * iv, void * in, void * out, @@ -62,24 +62,24 @@ krb5_encrypt( u32 ret = -EINVAL; struct scatterlist sg[1]; u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - if (length % crypto_skcipher_blocksize(tfm) != 0) + if (length % crypto_sync_skcipher_blocksize(tfm) != 0) goto out; - if (crypto_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { + if (crypto_sync_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { dprintk("RPC: gss_k5encrypt: tfm iv size too large %d\n", - crypto_skcipher_ivsize(tfm)); + crypto_sync_skcipher_ivsize(tfm)); goto out; } if (iv) - memcpy(local_iv, iv, crypto_skcipher_ivsize(tfm)); + memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm)); memcpy(out, in, length); sg_init_one(sg, out, length); - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, length, local_iv); @@ -92,7 +92,7 @@ out: u32 krb5_decrypt( - struct crypto_skcipher *tfm, + struct crypto_sync_skcipher *tfm, void * iv, void * in, void * out, @@ -101,23 +101,23 @@ krb5_decrypt( u32 ret = -EINVAL; struct scatterlist sg[1]; u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - if (length % crypto_skcipher_blocksize(tfm) != 0) + if (length % crypto_sync_skcipher_blocksize(tfm) != 0) goto out; - if (crypto_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { + if (crypto_sync_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n", - crypto_skcipher_ivsize(tfm)); + crypto_sync_skcipher_ivsize(tfm)); goto out; } if (iv) - memcpy(local_iv,iv, crypto_skcipher_ivsize(tfm)); + memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm)); memcpy(out, in, length); sg_init_one(sg, out, length); - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, length, local_iv); @@ -466,7 +466,8 @@ encryptor(struct scatterlist *sg, void *data) { struct encryptor_desc *desc = data; struct xdr_buf *outbuf = desc->outbuf; - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(desc->req); + struct crypto_sync_skcipher *tfm = + crypto_sync_skcipher_reqtfm(desc->req); struct page *in_page; int thislen = desc->fraglen + sg->length; int fraglen, ret; @@ -492,7 +493,7 @@ encryptor(struct scatterlist *sg, void *data) desc->fraglen += sg->length; desc->pos += sg->length; - fraglen = thislen & (crypto_skcipher_blocksize(tfm) - 1); + fraglen = thislen & (crypto_sync_skcipher_blocksize(tfm) - 1); thislen -= fraglen; if (thislen == 0) @@ -526,16 +527,16 @@ encryptor(struct scatterlist *sg, void *data) } int -gss_encrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *buf, +gss_encrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf, int offset, struct page **pages) { int ret; struct encryptor_desc desc; - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - BUG_ON((buf->len - offset) % crypto_skcipher_blocksize(tfm) != 0); + BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0); - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_callback(req, 0, NULL, NULL); memset(desc.iv, 0, sizeof(desc.iv)); @@ -567,7 +568,8 @@ decryptor(struct scatterlist *sg, void *data) { struct decryptor_desc *desc = data; int thislen = desc->fraglen + sg->length; - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(desc->req); + struct crypto_sync_skcipher *tfm = + crypto_sync_skcipher_reqtfm(desc->req); int fraglen, ret; /* Worst case is 4 fragments: head, end of page 1, start @@ -578,7 +580,7 @@ decryptor(struct scatterlist *sg, void *data) desc->fragno++; desc->fraglen += sg->length; - fraglen = thislen & (crypto_skcipher_blocksize(tfm) - 1); + fraglen = thislen & (crypto_sync_skcipher_blocksize(tfm) - 1); thislen -= fraglen; if (thislen == 0) @@ -608,17 +610,17 @@ decryptor(struct scatterlist *sg, void *data) } int -gss_decrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *buf, +gss_decrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf, int offset) { int ret; struct decryptor_desc desc; - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); /* XXXJBF: */ - BUG_ON((buf->len - offset) % crypto_skcipher_blocksize(tfm) != 0); + BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0); - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_callback(req, 0, NULL, NULL); memset(desc.iv, 0, sizeof(desc.iv)); @@ -672,12 +674,12 @@ xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen) } static u32 -gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf, +gss_krb5_cts_crypt(struct crypto_sync_skcipher *cipher, struct xdr_buf *buf, u32 offset, u8 *iv, struct page **pages, int encrypt) { u32 ret; struct scatterlist sg[1]; - SKCIPHER_REQUEST_ON_STACK(req, cipher); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, cipher); u8 *data; struct page **save_pages; u32 len = buf->len - offset; @@ -706,7 +708,7 @@ gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf, sg_init_one(sg, data, len); - skcipher_request_set_tfm(req, cipher); + skcipher_request_set_sync_tfm(req, cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, len, iv); @@ -735,7 +737,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_netobj hmac; u8 *cksumkey; u8 *ecptr; - struct crypto_skcipher *cipher, *aux_cipher; + struct crypto_sync_skcipher *cipher, *aux_cipher; int blocksize; struct page **save_pages; int nblocks, nbytes; @@ -754,7 +756,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, cksumkey = kctx->acceptor_integ; usage = KG_USAGE_ACCEPTOR_SEAL; } - blocksize = crypto_skcipher_blocksize(cipher); + blocksize = crypto_sync_skcipher_blocksize(cipher); /* hide the gss token header and insert the confounder */ offset += GSS_KRB5_TOK_HDR_LEN; @@ -807,7 +809,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, memset(desc.iv, 0, sizeof(desc.iv)); if (cbcbytes) { - SKCIPHER_REQUEST_ON_STACK(req, aux_cipher); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, aux_cipher); desc.pos = offset + GSS_KRB5_TOK_HDR_LEN; desc.fragno = 0; @@ -816,7 +818,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, desc.outbuf = buf; desc.req = req; - skcipher_request_set_tfm(req, aux_cipher); + skcipher_request_set_sync_tfm(req, aux_cipher); skcipher_request_set_callback(req, 0, NULL, NULL); sg_init_table(desc.infrags, 4); @@ -855,7 +857,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, struct xdr_buf subbuf; u32 ret = 0; u8 *cksum_key; - struct crypto_skcipher *cipher, *aux_cipher; + struct crypto_sync_skcipher *cipher, *aux_cipher; struct xdr_netobj our_hmac_obj; u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN]; u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN]; @@ -874,7 +876,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, cksum_key = kctx->initiator_integ; usage = KG_USAGE_INITIATOR_SEAL; } - blocksize = crypto_skcipher_blocksize(cipher); + blocksize = crypto_sync_skcipher_blocksize(cipher); /* create a segment skipping the header and leaving out the checksum */ @@ -891,13 +893,13 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, memset(desc.iv, 0, sizeof(desc.iv)); if (cbcbytes) { - SKCIPHER_REQUEST_ON_STACK(req, aux_cipher); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, aux_cipher); desc.fragno = 0; desc.fraglen = 0; desc.req = req; - skcipher_request_set_tfm(req, aux_cipher); + skcipher_request_set_sync_tfm(req, aux_cipher); skcipher_request_set_callback(req, 0, NULL, NULL); sg_init_table(desc.frags, 4); @@ -946,7 +948,8 @@ out_err: * Set the key of the given cipher. */ int -krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, +krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, + struct crypto_sync_skcipher *cipher, unsigned char *cksum) { struct crypto_shash *hmac; @@ -994,7 +997,7 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, if (err) goto out_err; - err = crypto_skcipher_setkey(cipher, Kseq, kctx->gk5e->keylength); + err = crypto_sync_skcipher_setkey(cipher, Kseq, kctx->gk5e->keylength); if (err) goto out_err; @@ -1012,7 +1015,8 @@ out_err: * Set the key of cipher kctx->enc. */ int -krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, +krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, + struct crypto_sync_skcipher *cipher, s32 seqnum) { struct crypto_shash *hmac; @@ -1069,7 +1073,8 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, if (err) goto out_err; - err = crypto_skcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength); + err = crypto_sync_skcipher_setkey(cipher, Kcrypt, + kctx->gk5e->keylength); if (err) goto out_err; diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c index f7fe2d2b851f..550fdf18d3b3 100644 --- a/net/sunrpc/auth_gss/gss_krb5_keys.c +++ b/net/sunrpc/auth_gss/gss_krb5_keys.c @@ -147,7 +147,7 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, size_t blocksize, keybytes, keylength, n; unsigned char *inblockdata, *outblockdata, *rawkey; struct xdr_netobj inblock, outblock; - struct crypto_skcipher *cipher; + struct crypto_sync_skcipher *cipher; u32 ret = EINVAL; blocksize = gk5e->blocksize; @@ -157,11 +157,10 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, if ((inkey->len != keylength) || (outkey->len != keylength)) goto err_return; - cipher = crypto_alloc_skcipher(gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + cipher = crypto_alloc_sync_skcipher(gk5e->encrypt_name, 0, 0); if (IS_ERR(cipher)) goto err_return; - if (crypto_skcipher_setkey(cipher, inkey->data, inkey->len)) + if (crypto_sync_skcipher_setkey(cipher, inkey->data, inkey->len)) goto err_return; /* allocate and set up buffers */ @@ -238,7 +237,7 @@ err_free_in: memset(inblockdata, 0, blocksize); kfree(inblockdata); err_free_cipher: - crypto_free_skcipher(cipher); + crypto_free_sync_skcipher(cipher); err_return: return ret; } diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 7bb2514aadd9..eab71fc7af3e 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -218,7 +218,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res) static inline const void * get_key(const void *p, const void *end, - struct krb5_ctx *ctx, struct crypto_skcipher **res) + struct krb5_ctx *ctx, struct crypto_sync_skcipher **res) { struct xdr_netobj key; int alg; @@ -246,15 +246,14 @@ get_key(const void *p, const void *end, if (IS_ERR(p)) goto out_err; - *res = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + *res = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0); if (IS_ERR(*res)) { printk(KERN_WARNING "gss_kerberos_mech: unable to initialize " "crypto algorithm %s\n", ctx->gk5e->encrypt_name); *res = NULL; goto out_err_free_key; } - if (crypto_skcipher_setkey(*res, key.data, key.len)) { + if (crypto_sync_skcipher_setkey(*res, key.data, key.len)) { printk(KERN_WARNING "gss_kerberos_mech: error setting key for " "crypto algorithm %s\n", ctx->gk5e->encrypt_name); goto out_err_free_tfm; @@ -264,7 +263,7 @@ get_key(const void *p, const void *end, return p; out_err_free_tfm: - crypto_free_skcipher(*res); + crypto_free_sync_skcipher(*res); out_err_free_key: kfree(key.data); p = ERR_PTR(-EINVAL); @@ -275,6 +274,7 @@ out_err: static int gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) { + u32 seq_send; int tmp; p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate)); @@ -316,9 +316,10 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); if (IS_ERR(p)) goto out_err; - p = simple_get_bytes(p, end, &ctx->seq_send, sizeof(ctx->seq_send)); + p = simple_get_bytes(p, end, &seq_send, sizeof(seq_send)); if (IS_ERR(p)) goto out_err; + atomic_set(&ctx->seq_send, seq_send); p = simple_get_netobj(p, end, &ctx->mech_used); if (IS_ERR(p)) goto out_err; @@ -336,30 +337,30 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) return 0; out_err_free_key2: - crypto_free_skcipher(ctx->seq); + crypto_free_sync_skcipher(ctx->seq); out_err_free_key1: - crypto_free_skcipher(ctx->enc); + crypto_free_sync_skcipher(ctx->enc); out_err_free_mech: kfree(ctx->mech_used.data); out_err: return PTR_ERR(p); } -static struct crypto_skcipher * +static struct crypto_sync_skcipher * context_v2_alloc_cipher(struct krb5_ctx *ctx, const char *cname, u8 *key) { - struct crypto_skcipher *cp; + struct crypto_sync_skcipher *cp; - cp = crypto_alloc_skcipher(cname, 0, CRYPTO_ALG_ASYNC); + cp = crypto_alloc_sync_skcipher(cname, 0, 0); if (IS_ERR(cp)) { dprintk("gss_kerberos_mech: unable to initialize " "crypto algorithm %s\n", cname); return NULL; } - if (crypto_skcipher_setkey(cp, key, ctx->gk5e->keylength)) { + if (crypto_sync_skcipher_setkey(cp, key, ctx->gk5e->keylength)) { dprintk("gss_kerberos_mech: error setting key for " "crypto algorithm %s\n", cname); - crypto_free_skcipher(cp); + crypto_free_sync_skcipher(cp); return NULL; } return cp; @@ -413,9 +414,9 @@ context_derive_keys_des3(struct krb5_ctx *ctx, gfp_t gfp_mask) return 0; out_free_enc: - crypto_free_skcipher(ctx->enc); + crypto_free_sync_skcipher(ctx->enc); out_free_seq: - crypto_free_skcipher(ctx->seq); + crypto_free_sync_skcipher(ctx->seq); out_err: return -EINVAL; } @@ -469,17 +470,15 @@ context_derive_keys_rc4(struct krb5_ctx *ctx) /* * allocate hash, and skciphers for data and seqnum encryption */ - ctx->enc = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + ctx->enc = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0); if (IS_ERR(ctx->enc)) { err = PTR_ERR(ctx->enc); goto out_err_free_hmac; } - ctx->seq = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + ctx->seq = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0); if (IS_ERR(ctx->seq)) { - crypto_free_skcipher(ctx->enc); + crypto_free_sync_skcipher(ctx->enc); err = PTR_ERR(ctx->seq); goto out_err_free_hmac; } @@ -591,7 +590,7 @@ context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask) context_v2_alloc_cipher(ctx, "cbc(aes)", ctx->acceptor_seal); if (ctx->acceptor_enc_aux == NULL) { - crypto_free_skcipher(ctx->initiator_enc_aux); + crypto_free_sync_skcipher(ctx->initiator_enc_aux); goto out_free_acceptor_enc; } } @@ -599,9 +598,9 @@ context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask) return 0; out_free_acceptor_enc: - crypto_free_skcipher(ctx->acceptor_enc); + crypto_free_sync_skcipher(ctx->acceptor_enc); out_free_initiator_enc: - crypto_free_skcipher(ctx->initiator_enc); + crypto_free_sync_skcipher(ctx->initiator_enc); out_err: return -EINVAL; } @@ -610,6 +609,7 @@ static int gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, gfp_t gfp_mask) { + u64 seq_send64; int keylen; p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags)); @@ -620,14 +620,15 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); if (IS_ERR(p)) goto out_err; - p = simple_get_bytes(p, end, &ctx->seq_send64, sizeof(ctx->seq_send64)); + p = simple_get_bytes(p, end, &seq_send64, sizeof(seq_send64)); if (IS_ERR(p)) goto out_err; + atomic64_set(&ctx->seq_send64, seq_send64); /* set seq_send for use by "older" enctypes */ - ctx->seq_send = ctx->seq_send64; - if (ctx->seq_send64 != ctx->seq_send) { - dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__, - (unsigned long)ctx->seq_send64, ctx->seq_send); + atomic_set(&ctx->seq_send, seq_send64); + if (seq_send64 != atomic_read(&ctx->seq_send)) { + dprintk("%s: seq_send64 %llx, seq_send %x overflow?\n", __func__, + seq_send64, atomic_read(&ctx->seq_send)); p = ERR_PTR(-EINVAL); goto out_err; } @@ -713,12 +714,12 @@ static void gss_delete_sec_context_kerberos(void *internal_ctx) { struct krb5_ctx *kctx = internal_ctx; - crypto_free_skcipher(kctx->seq); - crypto_free_skcipher(kctx->enc); - crypto_free_skcipher(kctx->acceptor_enc); - crypto_free_skcipher(kctx->initiator_enc); - crypto_free_skcipher(kctx->acceptor_enc_aux); - crypto_free_skcipher(kctx->initiator_enc_aux); + crypto_free_sync_skcipher(kctx->seq); + crypto_free_sync_skcipher(kctx->enc); + crypto_free_sync_skcipher(kctx->acceptor_enc); + crypto_free_sync_skcipher(kctx->initiator_enc); + crypto_free_sync_skcipher(kctx->acceptor_enc_aux); + crypto_free_sync_skcipher(kctx->initiator_enc_aux); kfree(kctx->mech_used.data); kfree(kctx); } diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index eaad9bc7a0bd..48fe4a591b54 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -63,13 +63,12 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif -DEFINE_SPINLOCK(krb5_seq_lock); - static void * setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token) { @@ -154,9 +153,7 @@ gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - spin_lock(&krb5_seq_lock); - seq_send = ctx->seq_send++; - spin_unlock(&krb5_seq_lock); + seq_send = atomic_fetch_inc(&ctx->seq_send); if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff, seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)) @@ -174,7 +171,6 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, .data = cksumdata}; void *krb5_hdr; s32 now; - u64 seq_send; u8 *cksumkey; unsigned int cksum_usage; __be64 seq_send_be64; @@ -185,11 +181,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, /* Set up the sequence number. Now 64-bits in clear * text and w/o direction indicator */ - spin_lock(&krb5_seq_lock); - seq_send = ctx->seq_send64++; - spin_unlock(&krb5_seq_lock); - - seq_send_be64 = cpu_to_be64(seq_send); + seq_send_be64 = cpu_to_be64(atomic64_fetch_inc(&ctx->seq_send64)); memcpy(krb5_hdr + 8, (char *) &seq_send_be64, 8); if (ctx->initiate) { diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c index c8b9082f4a9d..fb6656295204 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c +++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c @@ -43,13 +43,12 @@ static s32 krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum, unsigned char *cksum, unsigned char *buf) { - struct crypto_skcipher *cipher; + struct crypto_sync_skcipher *cipher; unsigned char plain[8]; s32 code; dprintk("RPC: %s:\n", __func__); - cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0); if (IS_ERR(cipher)) return PTR_ERR(cipher); @@ -68,12 +67,12 @@ krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum, code = krb5_encrypt(cipher, cksum, plain, buf, 8); out: - crypto_free_skcipher(cipher); + crypto_free_sync_skcipher(cipher); return code; } s32 krb5_make_seq_num(struct krb5_ctx *kctx, - struct crypto_skcipher *key, + struct crypto_sync_skcipher *key, int direction, u32 seqnum, unsigned char *cksum, unsigned char *buf) @@ -101,13 +100,12 @@ static s32 krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, unsigned char *buf, int *direction, s32 *seqnum) { - struct crypto_skcipher *cipher; + struct crypto_sync_skcipher *cipher; unsigned char plain[8]; s32 code; dprintk("RPC: %s:\n", __func__); - cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0); if (IS_ERR(cipher)) return PTR_ERR(cipher); @@ -130,7 +128,7 @@ krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, *seqnum = ((plain[0] << 24) | (plain[1] << 16) | (plain[2] << 8) | (plain[3])); out: - crypto_free_skcipher(cipher); + crypto_free_sync_skcipher(cipher); return code; } @@ -142,7 +140,7 @@ krb5_get_seq_num(struct krb5_ctx *kctx, { s32 code; unsigned char plain[8]; - struct crypto_skcipher *key = kctx->seq; + struct crypto_sync_skcipher *key = kctx->seq; dprintk("RPC: krb5_get_seq_num:\n"); diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 39a2e672900b..5cdde6cb703a 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -174,7 +174,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, now = get_seconds(); - blocksize = crypto_skcipher_blocksize(kctx->enc); + blocksize = crypto_sync_skcipher_blocksize(kctx->enc); gss_krb5_add_padding(buf, offset, blocksize); BUG_ON((buf->len - offset) % blocksize); plainlen = conflen + buf->len - offset; @@ -228,9 +228,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - spin_lock(&krb5_seq_lock); - seq_send = kctx->seq_send++; - spin_unlock(&krb5_seq_lock); + seq_send = atomic_fetch_inc(&kctx->seq_send); /* XXX would probably be more efficient to compute checksum * and encrypt at the same time: */ @@ -239,10 +237,10 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, return GSS_S_FAILURE; if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { - struct crypto_skcipher *cipher; + struct crypto_sync_skcipher *cipher; int err; - cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, + 0, 0); if (IS_ERR(cipher)) return GSS_S_FAILURE; @@ -250,7 +248,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, err = gss_encrypt_xdr_buf(cipher, buf, offset + headlen - conflen, pages); - crypto_free_skcipher(cipher); + crypto_free_sync_skcipher(cipher); if (err) return GSS_S_FAILURE; } else { @@ -327,18 +325,18 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) return GSS_S_BAD_SIG; if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { - struct crypto_skcipher *cipher; + struct crypto_sync_skcipher *cipher; int err; - cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, + 0, 0); if (IS_ERR(cipher)) return GSS_S_FAILURE; krb5_rc4_setup_enc_key(kctx, cipher, seqnum); err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset); - crypto_free_skcipher(cipher); + crypto_free_sync_skcipher(cipher); if (err) return GSS_S_DEFECTIVE_TOKEN; } else { @@ -371,7 +369,7 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) /* Copy the data back to the right position. XXX: Would probably be * better to copy and encrypt at the same time. */ - blocksize = crypto_skcipher_blocksize(kctx->enc); + blocksize = crypto_sync_skcipher_blocksize(kctx->enc); data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) + conflen; orig_start = buf->head[0].iov_base + offset; @@ -477,9 +475,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, *be16ptr++ = 0; be64ptr = (__be64 *)be16ptr; - spin_lock(&krb5_seq_lock); - *be64ptr = cpu_to_be64(kctx->seq_send64++); - spin_unlock(&krb5_seq_lock); + *be64ptr = cpu_to_be64(atomic64_fetch_inc(&kctx->seq_send64)); err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages); if (err) diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 5fec3abbe19b..16ac0f4cb7d8 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -117,7 +117,7 @@ int gss_mech_register(struct gss_api_mech *gm) if (status) return status; spin_lock(®istered_mechs_lock); - list_add(&gm->gm_list, ®istered_mechs); + list_add_rcu(&gm->gm_list, ®istered_mechs); spin_unlock(®istered_mechs_lock); dprintk("RPC: registered gss mechanism %s\n", gm->gm_name); return 0; @@ -132,7 +132,7 @@ EXPORT_SYMBOL_GPL(gss_mech_register); void gss_mech_unregister(struct gss_api_mech *gm) { spin_lock(®istered_mechs_lock); - list_del(&gm->gm_list); + list_del_rcu(&gm->gm_list); spin_unlock(®istered_mechs_lock); dprintk("RPC: unregistered gss mechanism %s\n", gm->gm_name); gss_mech_free(gm); @@ -151,15 +151,15 @@ _gss_mech_get_by_name(const char *name) { struct gss_api_mech *pos, *gm = NULL; - spin_lock(®istered_mechs_lock); - list_for_each_entry(pos, ®istered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) { if (0 == strcmp(name, pos->gm_name)) { if (try_module_get(pos->gm_owner)) gm = pos; break; } } - spin_unlock(®istered_mechs_lock); + rcu_read_unlock(); return gm; } @@ -186,8 +186,8 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj) dprintk("RPC: %s(%s)\n", __func__, buf); request_module("rpc-auth-gss-%s", buf); - spin_lock(®istered_mechs_lock); - list_for_each_entry(pos, ®istered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) { if (obj->len == pos->gm_oid.len) { if (0 == memcmp(obj->data, pos->gm_oid.data, obj->len)) { if (try_module_get(pos->gm_owner)) @@ -196,7 +196,7 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj) } } } - spin_unlock(®istered_mechs_lock); + rcu_read_unlock(); return gm; } @@ -216,15 +216,15 @@ static struct gss_api_mech *_gss_mech_get_by_pseudoflavor(u32 pseudoflavor) { struct gss_api_mech *gm = NULL, *pos; - spin_lock(®istered_mechs_lock); - list_for_each_entry(pos, ®istered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) { if (!mech_supports_pseudoflavor(pos, pseudoflavor)) continue; if (try_module_get(pos->gm_owner)) gm = pos; break; } - spin_unlock(®istered_mechs_lock); + rcu_read_unlock(); return gm; } @@ -257,8 +257,8 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size) struct gss_api_mech *pos = NULL; int j, i = 0; - spin_lock(®istered_mechs_lock); - list_for_each_entry(pos, ®istered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) { for (j = 0; j < pos->gm_pf_num; j++) { if (i >= size) { spin_unlock(®istered_mechs_lock); @@ -267,7 +267,7 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size) array_ptr[i++] = pos->gm_pfs[j].pseudoflavor; } } - spin_unlock(®istered_mechs_lock); + rcu_read_unlock(); return i; } diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index 444380f968f1..006062ad5f58 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -784,6 +784,7 @@ void gssx_enc_accept_sec_context(struct rpc_rqst *req, xdr_inline_pages(&req->rq_rcv_buf, PAGE_SIZE/2 /* pretty arbitrary */, arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE); + req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; done: if (err) dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 860f2a1bbb67..1ece4bc3eb8d 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -76,6 +76,7 @@ struct rsi { struct xdr_netobj in_handle, in_token; struct xdr_netobj out_handle, out_token; int major_status, minor_status; + struct rcu_head rcu_head; }; static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct rsi *old); @@ -89,13 +90,21 @@ static void rsi_free(struct rsi *rsii) kfree(rsii->out_token.data); } -static void rsi_put(struct kref *ref) +static void rsi_free_rcu(struct rcu_head *head) { - struct rsi *rsii = container_of(ref, struct rsi, h.ref); + struct rsi *rsii = container_of(head, struct rsi, rcu_head); + rsi_free(rsii); kfree(rsii); } +static void rsi_put(struct kref *ref) +{ + struct rsi *rsii = container_of(ref, struct rsi, h.ref); + + call_rcu(&rsii->rcu_head, rsi_free_rcu); +} + static inline int rsi_hash(struct rsi *item) { return hash_mem(item->in_handle.data, item->in_handle.len, RSI_HASHBITS) @@ -282,7 +291,7 @@ static struct rsi *rsi_lookup(struct cache_detail *cd, struct rsi *item) struct cache_head *ch; int hash = rsi_hash(item); - ch = sunrpc_cache_lookup(cd, &item->h, hash); + ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct rsi, h); else @@ -330,6 +339,7 @@ struct rsc { struct svc_cred cred; struct gss_svc_seq_data seqdata; struct gss_ctx *mechctx; + struct rcu_head rcu_head; }; static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old); @@ -343,12 +353,22 @@ static void rsc_free(struct rsc *rsci) free_svc_cred(&rsci->cred); } +static void rsc_free_rcu(struct rcu_head *head) +{ + struct rsc *rsci = container_of(head, struct rsc, rcu_head); + + kfree(rsci->handle.data); + kfree(rsci); +} + static void rsc_put(struct kref *ref) { struct rsc *rsci = container_of(ref, struct rsc, h.ref); - rsc_free(rsci); - kfree(rsci); + if (rsci->mechctx) + gss_delete_sec_context(&rsci->mechctx); + free_svc_cred(&rsci->cred); + call_rcu(&rsci->rcu_head, rsc_free_rcu); } static inline int @@ -542,7 +562,7 @@ static struct rsc *rsc_lookup(struct cache_detail *cd, struct rsc *item) struct cache_head *ch; int hash = rsc_hash(item); - ch = sunrpc_cache_lookup(cd, &item->h, hash); + ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct rsc, h); else @@ -1764,14 +1784,21 @@ out_err: } static void -svcauth_gss_domain_release(struct auth_domain *dom) +svcauth_gss_domain_release_rcu(struct rcu_head *head) { + struct auth_domain *dom = container_of(head, struct auth_domain, rcu_head); struct gss_domain *gd = container_of(dom, struct gss_domain, h); kfree(dom->name); kfree(gd); } +static void +svcauth_gss_domain_release(struct auth_domain *dom) +{ + call_rcu(&dom->rcu_head, svcauth_gss_domain_release_rcu); +} + static struct auth_ops svcauthops_gss = { .name = "rpcsec_gss", .owner = THIS_MODULE, diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 4b48228ee8c7..2694a1bc026b 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -21,7 +21,7 @@ static struct rpc_cred null_cred; static struct rpc_auth * nul_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { - atomic_inc(&null_auth.au_count); + refcount_inc(&null_auth.au_count); return &null_auth; } @@ -119,7 +119,7 @@ struct rpc_auth null_auth = { .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authnull_ops, .au_flavor = RPC_AUTH_NULL, - .au_count = ATOMIC_INIT(0), + .au_count = REFCOUNT_INIT(1), }; static @@ -138,6 +138,6 @@ struct rpc_cred null_cred = { .cr_lru = LIST_HEAD_INIT(null_cred.cr_lru), .cr_auth = &null_auth, .cr_ops = &null_credops, - .cr_count = ATOMIC_INIT(1), + .cr_count = REFCOUNT_INIT(2), .cr_flags = 1UL << RPCAUTH_CRED_UPTODATE, }; diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 185e56d4f9ae..4c1c7e56288f 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -34,7 +34,7 @@ unx_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { dprintk("RPC: creating UNIX authenticator for client %p\n", clnt); - atomic_inc(&unix_auth.au_count); + refcount_inc(&unix_auth.au_count); return &unix_auth; } @@ -239,7 +239,7 @@ struct rpc_auth unix_auth = { .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, - .au_count = ATOMIC_INIT(0), + .au_count = REFCOUNT_INIT(1), }; static diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 3c15a99b9700..fa5ba6ed3197 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -91,7 +91,6 @@ struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags) return NULL; req->rq_xprt = xprt; - INIT_LIST_HEAD(&req->rq_list); INIT_LIST_HEAD(&req->rq_bc_list); /* Preallocate one XDR receive buffer */ diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 109fbe591e7b..f96345b1180e 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -54,28 +54,33 @@ static void cache_init(struct cache_head *h, struct cache_detail *detail) h->last_refresh = now; } -struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, - struct cache_head *key, int hash) +static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail, + struct cache_head *key, + int hash) { - struct cache_head *new = NULL, *freeme = NULL, *tmp = NULL; - struct hlist_head *head; - - head = &detail->hash_table[hash]; - - read_lock(&detail->hash_lock); + struct hlist_head *head = &detail->hash_table[hash]; + struct cache_head *tmp; - hlist_for_each_entry(tmp, head, cache_list) { + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp, head, cache_list) { if (detail->match(tmp, key)) { if (cache_is_expired(detail, tmp)) - /* This entry is expired, we will discard it. */ - break; - cache_get(tmp); - read_unlock(&detail->hash_lock); + continue; + tmp = cache_get_rcu(tmp); + rcu_read_unlock(); return tmp; } } - read_unlock(&detail->hash_lock); - /* Didn't find anything, insert an empty entry */ + rcu_read_unlock(); + return NULL; +} + +static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, + struct cache_head *key, + int hash) +{ + struct cache_head *new, *tmp, *freeme = NULL; + struct hlist_head *head = &detail->hash_table[hash]; new = detail->alloc(); if (!new) @@ -87,35 +92,46 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, cache_init(new, detail); detail->init(new, key); - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); /* check if entry appeared while we slept */ - hlist_for_each_entry(tmp, head, cache_list) { + hlist_for_each_entry_rcu(tmp, head, cache_list) { if (detail->match(tmp, key)) { if (cache_is_expired(detail, tmp)) { - hlist_del_init(&tmp->cache_list); + hlist_del_init_rcu(&tmp->cache_list); detail->entries --; freeme = tmp; break; } cache_get(tmp); - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); cache_put(new, detail); return tmp; } } - hlist_add_head(&new->cache_list, head); + hlist_add_head_rcu(&new->cache_list, head); detail->entries++; cache_get(new); - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); if (freeme) cache_put(freeme, detail); return new; } -EXPORT_SYMBOL_GPL(sunrpc_cache_lookup); +struct cache_head *sunrpc_cache_lookup_rcu(struct cache_detail *detail, + struct cache_head *key, int hash) +{ + struct cache_head *ret; + + ret = sunrpc_cache_find_rcu(detail, key, hash); + if (ret) + return ret; + /* Didn't find anything, insert an empty entry */ + return sunrpc_cache_add_entry(detail, key, hash); +} +EXPORT_SYMBOL_GPL(sunrpc_cache_lookup_rcu); static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch); @@ -151,18 +167,18 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, struct cache_head *tmp; if (!test_bit(CACHE_VALID, &old->flags)) { - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); if (!test_bit(CACHE_VALID, &old->flags)) { if (test_bit(CACHE_NEGATIVE, &new->flags)) set_bit(CACHE_NEGATIVE, &old->flags); else detail->update(old, new); cache_fresh_locked(old, new->expiry_time, detail); - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); cache_fresh_unlocked(old, detail); return old; } - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); } /* We need to insert a new entry */ tmp = detail->alloc(); @@ -173,7 +189,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, cache_init(tmp, detail); detail->init(tmp, old); - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); if (test_bit(CACHE_NEGATIVE, &new->flags)) set_bit(CACHE_NEGATIVE, &tmp->flags); else @@ -183,7 +199,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, cache_get(tmp); cache_fresh_locked(tmp, new->expiry_time, detail); cache_fresh_locked(old, 0, detail); - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); cache_fresh_unlocked(tmp, detail); cache_fresh_unlocked(old, detail); cache_put(old, detail); @@ -223,7 +239,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h { int rv; - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); rv = cache_is_valid(h); if (rv == -EAGAIN) { set_bit(CACHE_NEGATIVE, &h->flags); @@ -231,7 +247,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h detail); rv = -ENOENT; } - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); cache_fresh_unlocked(h, detail); return rv; } @@ -341,7 +357,7 @@ static struct delayed_work cache_cleaner; void sunrpc_init_cache_detail(struct cache_detail *cd) { - rwlock_init(&cd->hash_lock); + spin_lock_init(&cd->hash_lock); INIT_LIST_HEAD(&cd->queue); spin_lock(&cache_list_lock); cd->nextcheck = 0; @@ -361,11 +377,11 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) { cache_purge(cd); spin_lock(&cache_list_lock); - write_lock(&cd->hash_lock); + spin_lock(&cd->hash_lock); if (current_detail == cd) current_detail = NULL; list_del_init(&cd->others); - write_unlock(&cd->hash_lock); + spin_unlock(&cd->hash_lock); spin_unlock(&cache_list_lock); if (list_empty(&cache_list)) { /* module must be being unloaded so its safe to kill the worker */ @@ -422,7 +438,7 @@ static int cache_clean(void) struct hlist_head *head; struct hlist_node *tmp; - write_lock(¤t_detail->hash_lock); + spin_lock(¤t_detail->hash_lock); /* Ok, now to clean this strand */ @@ -433,13 +449,13 @@ static int cache_clean(void) if (!cache_is_expired(current_detail, ch)) continue; - hlist_del_init(&ch->cache_list); + hlist_del_init_rcu(&ch->cache_list); current_detail->entries--; rv = 1; break; } - write_unlock(¤t_detail->hash_lock); + spin_unlock(¤t_detail->hash_lock); d = current_detail; if (!ch) current_index ++; @@ -494,9 +510,9 @@ void cache_purge(struct cache_detail *detail) struct hlist_node *tmp = NULL; int i = 0; - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); if (!detail->entries) { - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); return; } @@ -504,17 +520,17 @@ void cache_purge(struct cache_detail *detail) for (i = 0; i < detail->hash_size; i++) { head = &detail->hash_table[i]; hlist_for_each_entry_safe(ch, tmp, head, cache_list) { - hlist_del_init(&ch->cache_list); + hlist_del_init_rcu(&ch->cache_list); detail->entries--; set_bit(CACHE_CLEANED, &ch->flags); - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); cache_fresh_unlocked(ch, detail); cache_put(ch, detail); - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); } } - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); } EXPORT_SYMBOL_GPL(cache_purge); @@ -1289,21 +1305,19 @@ EXPORT_SYMBOL_GPL(qword_get); * get a header, then pass each real item in the cache */ -void *cache_seq_start(struct seq_file *m, loff_t *pos) - __acquires(cd->hash_lock) +static void *__cache_seq_start(struct seq_file *m, loff_t *pos) { loff_t n = *pos; unsigned int hash, entry; struct cache_head *ch; struct cache_detail *cd = m->private; - read_lock(&cd->hash_lock); if (!n--) return SEQ_START_TOKEN; hash = n >> 32; entry = n & ((1LL<<32) - 1); - hlist_for_each_entry(ch, &cd->hash_table[hash], cache_list) + hlist_for_each_entry_rcu(ch, &cd->hash_table[hash], cache_list) if (!entry--) return ch; n &= ~((1LL<<32) - 1); @@ -1315,12 +1329,12 @@ void *cache_seq_start(struct seq_file *m, loff_t *pos) if (hash >= cd->hash_size) return NULL; *pos = n+1; - return hlist_entry_safe(cd->hash_table[hash].first, + return hlist_entry_safe(rcu_dereference_raw( + hlist_first_rcu(&cd->hash_table[hash])), struct cache_head, cache_list); } -EXPORT_SYMBOL_GPL(cache_seq_start); -void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) +static void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) { struct cache_head *ch = p; int hash = (*pos >> 32); @@ -1333,7 +1347,8 @@ void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) *pos += 1LL<<32; } else { ++*pos; - return hlist_entry_safe(ch->cache_list.next, + return hlist_entry_safe(rcu_dereference_raw( + hlist_next_rcu(&ch->cache_list)), struct cache_head, cache_list); } *pos &= ~((1LL<<32) - 1); @@ -1345,18 +1360,32 @@ void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) if (hash >= cd->hash_size) return NULL; ++*pos; - return hlist_entry_safe(cd->hash_table[hash].first, + return hlist_entry_safe(rcu_dereference_raw( + hlist_first_rcu(&cd->hash_table[hash])), struct cache_head, cache_list); } EXPORT_SYMBOL_GPL(cache_seq_next); -void cache_seq_stop(struct seq_file *m, void *p) - __releases(cd->hash_lock) +void *cache_seq_start_rcu(struct seq_file *m, loff_t *pos) + __acquires(RCU) { - struct cache_detail *cd = m->private; - read_unlock(&cd->hash_lock); + rcu_read_lock(); + return __cache_seq_start(m, pos); +} +EXPORT_SYMBOL_GPL(cache_seq_start_rcu); + +void *cache_seq_next_rcu(struct seq_file *file, void *p, loff_t *pos) +{ + return cache_seq_next(file, p, pos); +} +EXPORT_SYMBOL_GPL(cache_seq_next_rcu); + +void cache_seq_stop_rcu(struct seq_file *m, void *p) + __releases(RCU) +{ + rcu_read_unlock(); } -EXPORT_SYMBOL_GPL(cache_seq_stop); +EXPORT_SYMBOL_GPL(cache_seq_stop_rcu); static int c_show(struct seq_file *m, void *p) { @@ -1384,9 +1413,9 @@ static int c_show(struct seq_file *m, void *p) } static const struct seq_operations cache_content_op = { - .start = cache_seq_start, - .next = cache_seq_next, - .stop = cache_seq_stop, + .start = cache_seq_start_rcu, + .next = cache_seq_next_rcu, + .stop = cache_seq_stop_rcu, .show = c_show, }; @@ -1844,13 +1873,13 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs); void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h) { - write_lock(&cd->hash_lock); + spin_lock(&cd->hash_lock); if (!hlist_unhashed(&h->cache_list)){ - hlist_del_init(&h->cache_list); + hlist_del_init_rcu(&h->cache_list); cd->entries--; - write_unlock(&cd->hash_lock); + spin_unlock(&cd->hash_lock); cache_put(h, cd); } else - write_unlock(&cd->hash_lock); + spin_unlock(&cd->hash_lock); } EXPORT_SYMBOL_GPL(sunrpc_cache_unhash); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8ea2f5fadd96..ae3b8145da35 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -61,6 +61,7 @@ static void call_start(struct rpc_task *task); static void call_reserve(struct rpc_task *task); static void call_reserveresult(struct rpc_task *task); static void call_allocate(struct rpc_task *task); +static void call_encode(struct rpc_task *task); static void call_decode(struct rpc_task *task); static void call_bind(struct rpc_task *task); static void call_bind_status(struct rpc_task *task); @@ -1137,10 +1138,10 @@ EXPORT_SYMBOL_GPL(rpc_call_async); struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) { struct rpc_task *task; - struct xdr_buf *xbufp = &req->rq_snd_buf; struct rpc_task_setup task_setup_data = { .callback_ops = &rpc_default_ops, - .flags = RPC_TASK_SOFTCONN, + .flags = RPC_TASK_SOFTCONN | + RPC_TASK_NO_RETRANS_TIMEOUT, }; dprintk("RPC: rpc_run_bc_task req= %p\n", req); @@ -1148,14 +1149,7 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) * Create an rpc_task to send the data */ task = rpc_new_task(&task_setup_data); - task->tk_rqstp = req; - - /* - * Set up the xdr_buf length. - * This also indicates that the buffer is XDR encoded already. - */ - xbufp->len = xbufp->head[0].iov_len + xbufp->page_len + - xbufp->tail[0].iov_len; + xprt_init_bc_request(req, task); task->tk_action = call_bc_transmit; atomic_inc(&task->tk_count); @@ -1558,7 +1552,6 @@ call_reserveresult(struct rpc_task *task) task->tk_status = 0; if (status >= 0) { if (task->tk_rqstp) { - xprt_request_init(task); task->tk_action = call_refresh; return; } @@ -1680,7 +1673,7 @@ call_allocate(struct rpc_task *task) dprint_status(task); task->tk_status = 0; - task->tk_action = call_bind; + task->tk_action = call_encode; if (req->rq_buffer) return; @@ -1721,22 +1714,15 @@ call_allocate(struct rpc_task *task) rpc_exit(task, -ERESTARTSYS); } -static inline int +static int rpc_task_need_encode(struct rpc_task *task) { - return task->tk_rqstp->rq_snd_buf.len == 0; + return test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) == 0 && + (!(task->tk_flags & RPC_TASK_SENT) || + !(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) || + xprt_request_need_retransmit(task)); } -static inline void -rpc_task_force_reencode(struct rpc_task *task) -{ - task->tk_rqstp->rq_snd_buf.len = 0; - task->tk_rqstp->rq_bytes_sent = 0; -} - -/* - * 3. Encode arguments of an RPC call - */ static void rpc_xdr_encode(struct rpc_task *task) { @@ -1752,6 +1738,7 @@ rpc_xdr_encode(struct rpc_task *task) xdr_buf_init(&req->rq_rcv_buf, req->rq_rbuffer, req->rq_rcvsize); + req->rq_bytes_sent = 0; p = rpc_encode_header(task); if (p == NULL) { @@ -1766,6 +1753,36 @@ rpc_xdr_encode(struct rpc_task *task) task->tk_status = rpcauth_wrap_req(task, encode, req, p, task->tk_msg.rpc_argp); + if (task->tk_status == 0) + xprt_request_prepare(req); +} + +/* + * 3. Encode arguments of an RPC call + */ +static void +call_encode(struct rpc_task *task) +{ + if (!rpc_task_need_encode(task)) + goto out; + /* Encode here so that rpcsec_gss can use correct sequence number. */ + rpc_xdr_encode(task); + /* Did the encode result in an error condition? */ + if (task->tk_status != 0) { + /* Was the error nonfatal? */ + if (task->tk_status == -EAGAIN || task->tk_status == -ENOMEM) + rpc_delay(task, HZ >> 4); + else + rpc_exit(task, task->tk_status); + return; + } + + /* Add task to reply queue before transmission to avoid races */ + if (rpc_reply_expected(task)) + xprt_request_enqueue_receive(task); + xprt_request_enqueue_transmit(task); +out: + task->tk_action = call_bind; } /* @@ -1947,43 +1964,16 @@ call_connect_status(struct rpc_task *task) static void call_transmit(struct rpc_task *task) { - int is_retrans = RPC_WAS_SENT(task); - dprint_status(task); - task->tk_action = call_status; - if (task->tk_status < 0) - return; - if (!xprt_prepare_transmit(task)) - return; - task->tk_action = call_transmit_status; - /* Encode here so that rpcsec_gss can use correct sequence number. */ - if (rpc_task_need_encode(task)) { - rpc_xdr_encode(task); - /* Did the encode result in an error condition? */ - if (task->tk_status != 0) { - /* Was the error nonfatal? */ - if (task->tk_status == -EAGAIN) - rpc_delay(task, HZ >> 4); - else - rpc_exit(task, task->tk_status); + task->tk_status = 0; + if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) { + if (!xprt_prepare_transmit(task)) return; - } + xprt_transmit(task); } - xprt_transmit(task); - if (task->tk_status < 0) - return; - if (is_retrans) - task->tk_client->cl_stats->rpcretrans++; - /* - * On success, ensure that we call xprt_end_transmit() before sleeping - * in order to allow access to the socket to other RPC requests. - */ - call_transmit_status(task); - if (rpc_reply_expected(task)) - return; - task->tk_action = rpc_exit_task; - rpc_wake_up_queued_task(&task->tk_rqstp->rq_xprt->pending, task); + task->tk_action = call_transmit_status; + xprt_end_transmit(task); } /* @@ -1999,19 +1989,17 @@ call_transmit_status(struct rpc_task *task) * test first. */ if (task->tk_status == 0) { - xprt_end_transmit(task); - rpc_task_force_reencode(task); + xprt_request_wait_receive(task); return; } switch (task->tk_status) { - case -EAGAIN: - case -ENOBUFS: - break; default: dprint_status(task); - xprt_end_transmit(task); - rpc_task_force_reencode(task); + break; + case -EBADMSG: + task->tk_status = 0; + task->tk_action = call_encode; break; /* * Special cases: if we've been waiting on the @@ -2019,6 +2007,14 @@ call_transmit_status(struct rpc_task *task) * socket just returned a connection error, * then hold onto the transport lock. */ + case -ENOBUFS: + rpc_delay(task, HZ>>2); + /* fall through */ + case -EBADSLT: + case -EAGAIN: + task->tk_action = call_transmit; + task->tk_status = 0; + break; case -ECONNREFUSED: case -EHOSTDOWN: case -ENETDOWN: @@ -2026,7 +2022,6 @@ call_transmit_status(struct rpc_task *task) case -ENETUNREACH: case -EPERM: if (RPC_IS_SOFTCONN(task)) { - xprt_end_transmit(task); if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, task->tk_status); @@ -2039,7 +2034,7 @@ call_transmit_status(struct rpc_task *task) case -EADDRINUSE: case -ENOTCONN: case -EPIPE: - rpc_task_force_reencode(task); + break; } } @@ -2053,6 +2048,11 @@ call_bc_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; + if (rpc_task_need_encode(task)) + xprt_request_enqueue_transmit(task); + if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + goto out_wakeup; + if (!xprt_prepare_transmit(task)) goto out_retry; @@ -2061,14 +2061,9 @@ call_bc_transmit(struct rpc_task *task) "error: %d\n", task->tk_status); goto out_done; } - if (req->rq_connect_cookie != req->rq_xprt->connect_cookie) - req->rq_bytes_sent = 0; xprt_transmit(task); - if (task->tk_status == -EAGAIN) - goto out_nospace; - xprt_end_transmit(task); dprint_status(task); switch (task->tk_status) { @@ -2084,6 +2079,8 @@ call_bc_transmit(struct rpc_task *task) case -ENOTCONN: case -EPIPE: break; + case -EAGAIN: + goto out_retry; case -ETIMEDOUT: /* * Problem reaching the server. Disconnect and let the @@ -2107,12 +2104,11 @@ call_bc_transmit(struct rpc_task *task) "error: %d\n", task->tk_status); break; } +out_wakeup: rpc_wake_up_queued_task(&req->rq_xprt->pending, task); out_done: task->tk_action = rpc_exit_task; return; -out_nospace: - req->rq_connect_cookie = req->rq_xprt->connect_cookie; out_retry: task->tk_status = 0; } @@ -2125,15 +2121,11 @@ static void call_status(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_rqst *req = task->tk_rqstp; int status; if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, task->tk_status); - if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent) - task->tk_status = req->rq_reply_bytes_recvd; - dprint_status(task); status = task->tk_status; @@ -2173,13 +2165,8 @@ call_status(struct rpc_task *task) /* fall through */ case -EPIPE: case -ENOTCONN: - task->tk_action = call_bind; - break; - case -ENOBUFS: - rpc_delay(task, HZ>>2); - /* fall through */ case -EAGAIN: - task->tk_action = call_transmit; + task->tk_action = call_encode; break; case -EIO: /* shutdown or soft timeout */ @@ -2244,7 +2231,7 @@ call_timeout(struct rpc_task *task) rpcauth_invalcred(task); retry: - task->tk_action = call_bind; + task->tk_action = call_encode; task->tk_status = 0; } @@ -2261,6 +2248,11 @@ call_decode(struct rpc_task *task) dprint_status(task); + if (!decode) { + task->tk_action = rpc_exit_task; + return; + } + if (task->tk_flags & RPC_CALL_MAJORSEEN) { if (clnt->cl_chatty) { printk(KERN_NOTICE "%s: server %s OK\n", @@ -2283,7 +2275,7 @@ call_decode(struct rpc_task *task) if (req->rq_rcv_buf.len < 12) { if (!RPC_IS_SOFT(task)) { - task->tk_action = call_bind; + task->tk_action = call_encode; goto out_retry; } dprintk("RPC: %s: too small RPC reply size (%d bytes)\n", @@ -2298,13 +2290,11 @@ call_decode(struct rpc_task *task) goto out_retry; return; } - task->tk_action = rpc_exit_task; - if (decode) { - task->tk_status = rpcauth_unwrap_resp(task, decode, req, p, - task->tk_msg.rpc_resp); - } + task->tk_status = rpcauth_unwrap_resp(task, decode, req, p, + task->tk_msg.rpc_resp); + dprintk("RPC: %5u call_decode result %d\n", task->tk_pid, task->tk_status); return; @@ -2416,7 +2406,7 @@ rpc_verify_header(struct rpc_task *task) task->tk_garb_retry--; dprintk("RPC: %5u %s: retry garbled creds\n", task->tk_pid, __func__); - task->tk_action = call_bind; + task->tk_action = call_encode; goto out_retry; case RPC_AUTH_TOOWEAK: printk(KERN_NOTICE "RPC: server %s requires stronger " @@ -2485,7 +2475,7 @@ out_garbage: task->tk_garb_retry--; dprintk("RPC: %5u %s: retrying\n", task->tk_pid, __func__); - task->tk_action = call_bind; + task->tk_action = call_encode; out_retry: return ERR_PTR(-EAGAIN); } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 3fe5d60ab0e2..57ca5bead1cb 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -99,64 +99,78 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list); } -static void rpc_rotate_queue_owner(struct rpc_wait_queue *queue) -{ - struct list_head *q = &queue->tasks[queue->priority]; - struct rpc_task *task; - - if (!list_empty(q)) { - task = list_first_entry(q, struct rpc_task, u.tk_wait.list); - if (task->tk_owner == queue->owner) - list_move_tail(&task->u.tk_wait.list, q); - } -} - static void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) { if (queue->priority != priority) { - /* Fairness: rotate the list when changing priority */ - rpc_rotate_queue_owner(queue); queue->priority = priority; + queue->nr = 1U << priority; } } -static void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid) -{ - queue->owner = pid; - queue->nr = RPC_BATCH_COUNT; -} - static void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue) { rpc_set_waitqueue_priority(queue, queue->maxpriority); - rpc_set_waitqueue_owner(queue, 0); } /* - * Add new request to a priority queue. + * Add a request to a queue list */ -static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, - struct rpc_task *task, - unsigned char queue_priority) +static void +__rpc_list_enqueue_task(struct list_head *q, struct rpc_task *task) { - struct list_head *q; struct rpc_task *t; - INIT_LIST_HEAD(&task->u.tk_wait.links); - if (unlikely(queue_priority > queue->maxpriority)) - queue_priority = queue->maxpriority; - if (queue_priority > queue->priority) - rpc_set_waitqueue_priority(queue, queue_priority); - q = &queue->tasks[queue_priority]; list_for_each_entry(t, q, u.tk_wait.list) { if (t->tk_owner == task->tk_owner) { - list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links); + list_add_tail(&task->u.tk_wait.links, + &t->u.tk_wait.links); + /* Cache the queue head in task->u.tk_wait.list */ + task->u.tk_wait.list.next = q; + task->u.tk_wait.list.prev = NULL; return; } } + INIT_LIST_HEAD(&task->u.tk_wait.links); list_add_tail(&task->u.tk_wait.list, q); } +/* + * Remove request from a queue list + */ +static void +__rpc_list_dequeue_task(struct rpc_task *task) +{ + struct list_head *q; + struct rpc_task *t; + + if (task->u.tk_wait.list.prev == NULL) { + list_del(&task->u.tk_wait.links); + return; + } + if (!list_empty(&task->u.tk_wait.links)) { + t = list_first_entry(&task->u.tk_wait.links, + struct rpc_task, + u.tk_wait.links); + /* Assume __rpc_list_enqueue_task() cached the queue head */ + q = t->u.tk_wait.list.next; + list_add_tail(&t->u.tk_wait.list, q); + list_del(&task->u.tk_wait.links); + } + list_del(&task->u.tk_wait.list); +} + +/* + * Add new request to a priority queue. + */ +static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, + struct rpc_task *task, + unsigned char queue_priority) +{ + if (unlikely(queue_priority > queue->maxpriority)) + queue_priority = queue->maxpriority; + __rpc_list_enqueue_task(&queue->tasks[queue_priority], task); +} + /* * Add new request to wait queue. * @@ -194,13 +208,7 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, */ static void __rpc_remove_wait_queue_priority(struct rpc_task *task) { - struct rpc_task *t; - - if (!list_empty(&task->u.tk_wait.links)) { - t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list); - list_move(&t->u.tk_wait.list, &task->u.tk_wait.list); - list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links); - } + __rpc_list_dequeue_task(task); } /* @@ -212,7 +220,8 @@ static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_tas __rpc_disable_timer(queue, task); if (RPC_IS_PRIORITY(queue)) __rpc_remove_wait_queue_priority(task); - list_del(&task->u.tk_wait.list); + else + list_del(&task->u.tk_wait.list); queue->qlen--; dprintk("RPC: %5u removed from queue %p \"%s\"\n", task->tk_pid, queue, rpc_qname(queue)); @@ -440,14 +449,28 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, /* * Wake up a queued task while the queue lock is being held */ -static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, - struct rpc_wait_queue *queue, struct rpc_task *task) +static struct rpc_task * +rpc_wake_up_task_on_wq_queue_action_locked(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, struct rpc_task *task, + bool (*action)(struct rpc_task *, void *), void *data) { if (RPC_IS_QUEUED(task)) { smp_rmb(); - if (task->tk_waitqueue == queue) - __rpc_do_wake_up_task_on_wq(wq, queue, task); + if (task->tk_waitqueue == queue) { + if (action == NULL || action(task, data)) { + __rpc_do_wake_up_task_on_wq(wq, queue, task); + return task; + } + } } + return NULL; +} + +static void +rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, struct rpc_task *task) +{ + rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, task, NULL, NULL); } /* @@ -465,6 +488,8 @@ void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, struct rpc_wait_queue *queue, struct rpc_task *task) { + if (!RPC_IS_QUEUED(task)) + return; spin_lock_bh(&queue->lock); rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); spin_unlock_bh(&queue->lock); @@ -475,12 +500,48 @@ void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, */ void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) { + if (!RPC_IS_QUEUED(task)) + return; spin_lock_bh(&queue->lock); rpc_wake_up_task_queue_locked(queue, task); spin_unlock_bh(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task); +static bool rpc_task_action_set_status(struct rpc_task *task, void *status) +{ + task->tk_status = *(int *)status; + return true; +} + +static void +rpc_wake_up_task_queue_set_status_locked(struct rpc_wait_queue *queue, + struct rpc_task *task, int status) +{ + rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue, + task, rpc_task_action_set_status, &status); +} + +/** + * rpc_wake_up_queued_task_set_status - wake up a task and set task->tk_status + * @queue: pointer to rpc_wait_queue + * @task: pointer to rpc_task + * @status: integer error value + * + * If @task is queued on @queue, then it is woken up, and @task->tk_status is + * set to the value of @status. + */ +void +rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *queue, + struct rpc_task *task, int status) +{ + if (!RPC_IS_QUEUED(task)) + return; + spin_lock_bh(&queue->lock); + rpc_wake_up_task_queue_set_status_locked(queue, task, status); + spin_unlock_bh(&queue->lock); +} + /* * Wake up the next task on a priority queue. */ @@ -493,17 +554,9 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q * Service a batch of tasks from a single owner. */ q = &queue->tasks[queue->priority]; - if (!list_empty(q)) { - task = list_entry(q->next, struct rpc_task, u.tk_wait.list); - if (queue->owner == task->tk_owner) { - if (--queue->nr) - goto out; - list_move_tail(&task->u.tk_wait.list, q); - } - /* - * Check if we need to switch queues. - */ - goto new_owner; + if (!list_empty(q) && --queue->nr) { + task = list_first_entry(q, struct rpc_task, u.tk_wait.list); + goto out; } /* @@ -515,7 +568,7 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q else q = q - 1; if (!list_empty(q)) { - task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + task = list_first_entry(q, struct rpc_task, u.tk_wait.list); goto new_queue; } } while (q != &queue->tasks[queue->priority]); @@ -525,8 +578,6 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q new_queue: rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0])); -new_owner: - rpc_set_waitqueue_owner(queue, task->tk_owner); out: return task; } @@ -553,12 +604,9 @@ struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, queue, rpc_qname(queue)); spin_lock_bh(&queue->lock); task = __rpc_find_next_queued(queue); - if (task != NULL) { - if (func(task, data)) - rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); - else - task = NULL; - } + if (task != NULL) + task = rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, + task, func, data); spin_unlock_bh(&queue->lock); return task; diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index f217c348b341..9062967575c4 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -26,7 +26,8 @@ * Possibly called several times to iterate over an sk_buff and copy * data out of it. */ -size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) +static size_t +xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) { if (len > desc->count) len = desc->count; @@ -36,7 +37,6 @@ size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) desc->offset += len; return len; } -EXPORT_SYMBOL_GPL(xdr_skb_read_bits); /** * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer @@ -69,7 +69,8 @@ static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to, * @copy_actor: virtual method for copying data * */ -ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor) +static ssize_t +xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor) { struct page **ppage = xdr->pages; unsigned int len, pglen = xdr->page_len; @@ -104,7 +105,7 @@ ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct /* ACL likes to be lazy in allocating pages - ACLs * are small by default but can get huge. */ - if (unlikely(*ppage == NULL)) { + if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) { *ppage = alloc_page(GFP_ATOMIC); if (unlikely(*ppage == NULL)) { if (copied == 0) @@ -140,7 +141,6 @@ copy_tail: out: return copied; } -EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb); /** * csum_partial_copy_to_xdr - checksum and copy data diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 5185efb9027b..51d36230b6e3 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -171,7 +171,6 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl, mutex_init(&xprt->xpt_mutex); spin_lock_init(&xprt->xpt_lock); set_bit(XPT_BUSY, &xprt->xpt_flags); - rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending"); xprt->xpt_net = get_net(net); strcpy(xprt->xpt_remotebuf, "uninitialized"); } @@ -895,7 +894,6 @@ int svc_send(struct svc_rqst *rqstp) else len = xprt->xpt_ops->xpo_sendto(rqstp); mutex_unlock(&xprt->xpt_mutex); - rpc_wake_up(&xprt->xpt_bc_pending); trace_svc_send(rqstp, len); svc_xprt_release(rqstp); @@ -989,7 +987,7 @@ static void call_xpt_users(struct svc_xprt *xprt) spin_lock(&xprt->xpt_lock); while (!list_empty(&xprt->xpt_users)) { u = list_first_entry(&xprt->xpt_users, struct svc_xpt_user, list); - list_del(&u->list); + list_del_init(&u->list); u->callback(u); } spin_unlock(&xprt->xpt_lock); diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index bb8db3cb8032..775b8c94265b 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -27,12 +27,32 @@ extern struct auth_ops svcauth_null; extern struct auth_ops svcauth_unix; -static DEFINE_SPINLOCK(authtab_lock); -static struct auth_ops *authtab[RPC_AUTH_MAXFLAVOR] = { - [0] = &svcauth_null, - [1] = &svcauth_unix, +static struct auth_ops __rcu *authtab[RPC_AUTH_MAXFLAVOR] = { + [RPC_AUTH_NULL] = (struct auth_ops __force __rcu *)&svcauth_null, + [RPC_AUTH_UNIX] = (struct auth_ops __force __rcu *)&svcauth_unix, }; +static struct auth_ops * +svc_get_auth_ops(rpc_authflavor_t flavor) +{ + struct auth_ops *aops; + + if (flavor >= RPC_AUTH_MAXFLAVOR) + return NULL; + rcu_read_lock(); + aops = rcu_dereference(authtab[flavor]); + if (aops != NULL && !try_module_get(aops->owner)) + aops = NULL; + rcu_read_unlock(); + return aops; +} + +static void +svc_put_auth_ops(struct auth_ops *aops) +{ + module_put(aops->owner); +} + int svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) { @@ -45,14 +65,11 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) dprintk("svc: svc_authenticate (%d)\n", flavor); - spin_lock(&authtab_lock); - if (flavor >= RPC_AUTH_MAXFLAVOR || !(aops = authtab[flavor]) || - !try_module_get(aops->owner)) { - spin_unlock(&authtab_lock); + aops = svc_get_auth_ops(flavor); + if (aops == NULL) { *authp = rpc_autherr_badcred; return SVC_DENIED; } - spin_unlock(&authtab_lock); rqstp->rq_auth_slack = 0; init_svc_cred(&rqstp->rq_cred); @@ -82,7 +99,7 @@ int svc_authorise(struct svc_rqst *rqstp) if (aops) { rv = aops->release(rqstp); - module_put(aops->owner); + svc_put_auth_ops(aops); } return rv; } @@ -90,13 +107,14 @@ int svc_authorise(struct svc_rqst *rqstp) int svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops) { + struct auth_ops *old; int rv = -EINVAL; - spin_lock(&authtab_lock); - if (flavor < RPC_AUTH_MAXFLAVOR && authtab[flavor] == NULL) { - authtab[flavor] = aops; - rv = 0; + + if (flavor < RPC_AUTH_MAXFLAVOR) { + old = cmpxchg((struct auth_ops ** __force)&authtab[flavor], NULL, aops); + if (old == NULL || old == aops) + rv = 0; } - spin_unlock(&authtab_lock); return rv; } EXPORT_SYMBOL_GPL(svc_auth_register); @@ -104,10 +122,8 @@ EXPORT_SYMBOL_GPL(svc_auth_register); void svc_auth_unregister(rpc_authflavor_t flavor) { - spin_lock(&authtab_lock); if (flavor < RPC_AUTH_MAXFLAVOR) - authtab[flavor] = NULL; - spin_unlock(&authtab_lock); + rcu_assign_pointer(authtab[flavor], NULL); } EXPORT_SYMBOL_GPL(svc_auth_unregister); @@ -127,10 +143,11 @@ static struct hlist_head auth_domain_table[DN_HASHMAX]; static DEFINE_SPINLOCK(auth_domain_lock); static void auth_domain_release(struct kref *kref) + __releases(&auth_domain_lock) { struct auth_domain *dom = container_of(kref, struct auth_domain, ref); - hlist_del(&dom->hash); + hlist_del_rcu(&dom->hash); dom->flavour->domain_release(dom); spin_unlock(&auth_domain_lock); } @@ -159,7 +176,7 @@ auth_domain_lookup(char *name, struct auth_domain *new) } } if (new) - hlist_add_head(&new->hash, head); + hlist_add_head_rcu(&new->hash, head); spin_unlock(&auth_domain_lock); return new; } @@ -167,6 +184,21 @@ EXPORT_SYMBOL_GPL(auth_domain_lookup); struct auth_domain *auth_domain_find(char *name) { - return auth_domain_lookup(name, NULL); + struct auth_domain *hp; + struct hlist_head *head; + + head = &auth_domain_table[hash_str(name, DN_HASHBITS)]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(hp, head, hash) { + if (strcmp(hp->name, name)==0) { + if (!kref_get_unless_zero(&hp->ref)) + hp = NULL; + rcu_read_unlock(); + return hp; + } + } + rcu_read_unlock(); + return NULL; } EXPORT_SYMBOL_GPL(auth_domain_find); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index af7f28fb8102..fb9041b92f72 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -37,20 +37,26 @@ struct unix_domain { extern struct auth_ops svcauth_null; extern struct auth_ops svcauth_unix; -static void svcauth_unix_domain_release(struct auth_domain *dom) +static void svcauth_unix_domain_release_rcu(struct rcu_head *head) { + struct auth_domain *dom = container_of(head, struct auth_domain, rcu_head); struct unix_domain *ud = container_of(dom, struct unix_domain, h); kfree(dom->name); kfree(ud); } +static void svcauth_unix_domain_release(struct auth_domain *dom) +{ + call_rcu(&dom->rcu_head, svcauth_unix_domain_release_rcu); +} + struct auth_domain *unix_domain_find(char *name) { struct auth_domain *rv; struct unix_domain *new = NULL; - rv = auth_domain_lookup(name, NULL); + rv = auth_domain_find(name); while(1) { if (rv) { if (new && rv != &new->h) @@ -91,6 +97,7 @@ struct ip_map { char m_class[8]; /* e.g. "nfsd" */ struct in6_addr m_addr; struct unix_domain *m_client; + struct rcu_head m_rcu; }; static void ip_map_put(struct kref *kref) @@ -101,7 +108,7 @@ static void ip_map_put(struct kref *kref) if (test_bit(CACHE_VALID, &item->flags) && !test_bit(CACHE_NEGATIVE, &item->flags)) auth_domain_put(&im->m_client->h); - kfree(im); + kfree_rcu(im, m_rcu); } static inline int hash_ip6(const struct in6_addr *ip) @@ -280,9 +287,9 @@ static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, strcpy(ip.m_class, class); ip.m_addr = *addr; - ch = sunrpc_cache_lookup(cd, &ip.h, - hash_str(class, IP_HASHBITS) ^ - hash_ip6(addr)); + ch = sunrpc_cache_lookup_rcu(cd, &ip.h, + hash_str(class, IP_HASHBITS) ^ + hash_ip6(addr)); if (ch) return container_of(ch, struct ip_map, h); @@ -412,6 +419,7 @@ struct unix_gid { struct cache_head h; kuid_t uid; struct group_info *gi; + struct rcu_head rcu; }; static int unix_gid_hash(kuid_t uid) @@ -426,7 +434,7 @@ static void unix_gid_put(struct kref *kref) if (test_bit(CACHE_VALID, &item->flags) && !test_bit(CACHE_NEGATIVE, &item->flags)) put_group_info(ug->gi); - kfree(ug); + kfree_rcu(ug, rcu); } static int unix_gid_match(struct cache_head *corig, struct cache_head *cnew) @@ -619,7 +627,7 @@ static struct unix_gid *unix_gid_lookup(struct cache_detail *cd, kuid_t uid) struct cache_head *ch; ug.uid = uid; - ch = sunrpc_cache_lookup(cd, &ug.h, unix_gid_hash(uid)); + ch = sunrpc_cache_lookup_rcu(cd, &ug.h, unix_gid_hash(uid)); if (ch) return container_of(ch, struct unix_gid, h); else diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5445145e639c..986f3ed7d1a2 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -325,59 +325,34 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) /* * Generic recvfrom routine. */ -static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, - int buflen) +static ssize_t svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, + unsigned int nr, size_t buflen, unsigned int base) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); - struct msghdr msg = { - .msg_flags = MSG_DONTWAIT, - }; - int len; + struct msghdr msg = { NULL }; + ssize_t len; rqstp->rq_xprt_hlen = 0; clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, nr, buflen); - len = sock_recvmsg(svsk->sk_sock, &msg, msg.msg_flags); + iov_iter_kvec(&msg.msg_iter, READ, iov, nr, buflen); + if (base != 0) { + iov_iter_advance(&msg.msg_iter, base); + buflen -= base; + } + len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT); /* If we read a full record, then assume there may be more * data to read (stream based sockets only!) */ if (len == buflen) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - dprintk("svc: socket %p recvfrom(%p, %zu) = %d\n", + dprintk("svc: socket %p recvfrom(%p, %zu) = %zd\n", svsk, iov[0].iov_base, iov[0].iov_len, len); return len; } -static int svc_partial_recvfrom(struct svc_rqst *rqstp, - struct kvec *iov, int nr, - int buflen, unsigned int base) -{ - size_t save_iovlen; - void *save_iovbase; - unsigned int i; - int ret; - - if (base == 0) - return svc_recvfrom(rqstp, iov, nr, buflen); - - for (i = 0; i < nr; i++) { - if (iov[i].iov_len > base) - break; - base -= iov[i].iov_len; - } - save_iovlen = iov[i].iov_len; - save_iovbase = iov[i].iov_base; - iov[i].iov_len -= base; - iov[i].iov_base += base; - ret = svc_recvfrom(rqstp, &iov[i], nr - i, buflen); - iov[i].iov_len = save_iovlen; - iov[i].iov_base = save_iovbase; - return ret; -} - /* * Set socket snd and rcv buffer lengths */ @@ -962,7 +937,8 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) want = sizeof(rpc_fraghdr) - svsk->sk_tcplen; iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen; iov.iov_len = want; - if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0) + len = svc_recvfrom(rqstp, &iov, 1, want, 0); + if (len < 0) goto error; svsk->sk_tcplen += len; @@ -1004,7 +980,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) if (!bc_xprt) return -EAGAIN; - spin_lock(&bc_xprt->recv_lock); + spin_lock(&bc_xprt->queue_lock); req = xprt_lookup_rqst(bc_xprt, xid); if (!req) goto unlock_notfound; @@ -1022,7 +998,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) memcpy(dst->iov_base, src->iov_base, src->iov_len); xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len); rqstp->rq_arg.len = 0; - spin_unlock(&bc_xprt->recv_lock); + spin_unlock(&bc_xprt->queue_lock); return 0; unlock_notfound: printk(KERN_NOTICE @@ -1031,7 +1007,7 @@ unlock_notfound: __func__, ntohl(calldir), bc_xprt, ntohl(xid)); unlock_eagain: - spin_unlock(&bc_xprt->recv_lock); + spin_unlock(&bc_xprt->queue_lock); return -EAGAIN; } @@ -1088,14 +1064,13 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) vec = rqstp->rq_vec; - pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], - svsk->sk_datalen + want); + pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], base + want); rqstp->rq_respages = &rqstp->rq_pages[pnum]; rqstp->rq_next_page = rqstp->rq_respages + 1; /* Now receive data */ - len = svc_partial_recvfrom(rqstp, vec, pnum, want, base); + len = svc_recvfrom(rqstp, vec, pnum, base + want, base); if (len >= 0) { svsk->sk_tcplen += len; svsk->sk_datalen += len; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 30afbd236656..2bbb8d38d2bf 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -15,6 +15,7 @@ #include #include #include +#include /* * XDR functions for basic NFS types @@ -128,6 +129,39 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len) } EXPORT_SYMBOL_GPL(xdr_terminate_string); +size_t +xdr_buf_pagecount(struct xdr_buf *buf) +{ + if (!buf->page_len) + return 0; + return (buf->page_base + buf->page_len + PAGE_SIZE - 1) >> PAGE_SHIFT; +} + +int +xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) +{ + size_t i, n = xdr_buf_pagecount(buf); + + if (n != 0 && buf->bvec == NULL) { + buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp); + if (!buf->bvec) + return -ENOMEM; + for (i = 0; i < n; i++) { + buf->bvec[i].bv_page = buf->pages[i]; + buf->bvec[i].bv_len = PAGE_SIZE; + buf->bvec[i].bv_offset = 0; + } + } + return 0; +} + +void +xdr_free_bvec(struct xdr_buf *buf) +{ + kfree(buf->bvec); + buf->bvec = NULL; +} + void xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, struct page **pages, unsigned int base, unsigned int len) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a8db2e3f8904..86bea4520c4d 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -68,8 +68,6 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net); static __be32 xprt_alloc_xid(struct rpc_xprt *xprt); static void xprt_connect_status(struct rpc_task *task); -static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); -static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *); static void xprt_destroy(struct rpc_xprt *xprt); static DEFINE_SPINLOCK(xprt_list_lock); @@ -171,6 +169,17 @@ out: } EXPORT_SYMBOL_GPL(xprt_load_transport); +static void xprt_clear_locked(struct rpc_xprt *xprt) +{ + xprt->snd_task = NULL; + if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { + smp_mb__before_atomic(); + clear_bit(XPRT_LOCKED, &xprt->state); + smp_mb__after_atomic(); + } else + queue_work(xprtiod_workqueue, &xprt->task_cleanup); +} + /** * xprt_reserve_xprt - serialize write access to transports * @task: task that is requesting access to the transport @@ -183,44 +192,53 @@ EXPORT_SYMBOL_GPL(xprt_load_transport); int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - int priority; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) return 1; goto out_sleep; } + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; xprt->snd_task = task; - if (req != NULL) - req->rq_ntrans++; return 1; +out_unlock: + xprt_clear_locked(xprt); out_sleep: dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); - task->tk_timeout = 0; + task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; task->tk_status = -EAGAIN; - if (req == NULL) - priority = RPC_PRIORITY_LOW; - else if (!req->rq_ntrans) - priority = RPC_PRIORITY_NORMAL; - else - priority = RPC_PRIORITY_HIGH; - rpc_sleep_on_priority(&xprt->sending, task, NULL, priority); + rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt); -static void xprt_clear_locked(struct rpc_xprt *xprt) +static bool +xprt_need_congestion_window_wait(struct rpc_xprt *xprt) { - xprt->snd_task = NULL; - if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { - smp_mb__before_atomic(); - clear_bit(XPRT_LOCKED, &xprt->state); - smp_mb__after_atomic(); - } else - queue_work(xprtiod_workqueue, &xprt->task_cleanup); + return test_bit(XPRT_CWND_WAIT, &xprt->state); +} + +static void +xprt_set_congestion_window_wait(struct rpc_xprt *xprt) +{ + if (!list_empty(&xprt->xmit_queue)) { + /* Peek at head of queue to see if it can make progress */ + if (list_first_entry(&xprt->xmit_queue, struct rpc_rqst, + rq_xmit)->rq_cong) + return; + } + set_bit(XPRT_CWND_WAIT, &xprt->state); +} + +static void +xprt_test_and_clear_congestion_window_wait(struct rpc_xprt *xprt) +{ + if (!RPCXPRT_CONGESTED(xprt)) + clear_bit(XPRT_CWND_WAIT, &xprt->state); } /* @@ -230,11 +248,11 @@ static void xprt_clear_locked(struct rpc_xprt *xprt) * Same as xprt_reserve_xprt, but Van Jacobson congestion control is * integrated into the decision of whether a request is allowed to be * woken up and given access to the transport. + * Note that the lock is only granted if we know there are free slots. */ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - int priority; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) @@ -245,25 +263,19 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) xprt->snd_task = task; return 1; } - if (__xprt_get_cong(xprt, task)) { + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; + if (!xprt_need_congestion_window_wait(xprt)) { xprt->snd_task = task; - req->rq_ntrans++; return 1; } +out_unlock: xprt_clear_locked(xprt); out_sleep: - if (req) - __xprt_put_cong(xprt, req); dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); - task->tk_timeout = 0; + task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; task->tk_status = -EAGAIN; - if (req == NULL) - priority = RPC_PRIORITY_LOW; - else if (!req->rq_ntrans) - priority = RPC_PRIORITY_NORMAL; - else - priority = RPC_PRIORITY_HIGH; - rpc_sleep_on_priority(&xprt->sending, task, NULL, priority); + rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong); @@ -272,6 +284,8 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { int retval; + if (test_bit(XPRT_LOCKED, &xprt->state) && xprt->snd_task == task) + return 1; spin_lock_bh(&xprt->transport_lock); retval = xprt->ops->reserve_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); @@ -281,12 +295,8 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) static bool __xprt_lock_write_func(struct rpc_task *task, void *data) { struct rpc_xprt *xprt = data; - struct rpc_rqst *req; - req = task->tk_rqstp; xprt->snd_task = task; - if (req) - req->rq_ntrans++; return true; } @@ -294,53 +304,30 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt) { if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, __xprt_lock_write_func, xprt)) return; +out_unlock: xprt_clear_locked(xprt); } -static bool __xprt_lock_write_cong_func(struct rpc_task *task, void *data) -{ - struct rpc_xprt *xprt = data; - struct rpc_rqst *req; - - req = task->tk_rqstp; - if (req == NULL) { - xprt->snd_task = task; - return true; - } - if (__xprt_get_cong(xprt, task)) { - xprt->snd_task = task; - req->rq_ntrans++; - return true; - } - return false; -} - static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) { if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - if (RPCXPRT_CONGESTED(xprt)) + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; + if (xprt_need_congestion_window_wait(xprt)) goto out_unlock; if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, - __xprt_lock_write_cong_func, xprt)) + __xprt_lock_write_func, xprt)) return; out_unlock: xprt_clear_locked(xprt); } -static void xprt_task_clear_bytes_sent(struct rpc_task *task) -{ - if (task != NULL) { - struct rpc_rqst *req = task->tk_rqstp; - if (req != NULL) - req->rq_bytes_sent = 0; - } -} - /** * xprt_release_xprt - allow other requests to use a transport * @xprt: transport with other tasks potentially waiting @@ -351,7 +338,6 @@ static void xprt_task_clear_bytes_sent(struct rpc_task *task) void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next(xprt); } @@ -369,7 +355,6 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt); void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next_cong(xprt); } @@ -378,6 +363,8 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt_cong); static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { + if (xprt->snd_task != task) + return; spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); @@ -388,16 +375,16 @@ static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *ta * overflowed. Put the task to sleep if this is the case. */ static int -__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_task *task) +__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; - if (req->rq_cong) return 1; dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n", - task->tk_pid, xprt->cong, xprt->cwnd); - if (RPCXPRT_CONGESTED(xprt)) + req->rq_task->tk_pid, xprt->cong, xprt->cwnd); + if (RPCXPRT_CONGESTED(xprt)) { + xprt_set_congestion_window_wait(xprt); return 0; + } req->rq_cong = 1; xprt->cong += RPC_CWNDSCALE; return 1; @@ -414,9 +401,31 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) return; req->rq_cong = 0; xprt->cong -= RPC_CWNDSCALE; + xprt_test_and_clear_congestion_window_wait(xprt); __xprt_lock_write_next_cong(xprt); } +/** + * xprt_request_get_cong - Request congestion control credits + * @xprt: pointer to transport + * @req: pointer to RPC request + * + * Useful for transports that require congestion control. + */ +bool +xprt_request_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + bool ret = false; + + if (req->rq_cong) + return true; + spin_lock_bh(&xprt->transport_lock); + ret = __xprt_get_cong(xprt, req) != 0; + spin_unlock_bh(&xprt->transport_lock); + return ret; +} +EXPORT_SYMBOL_GPL(xprt_request_get_cong); + /** * xprt_release_rqst_cong - housekeeping when request is complete * @task: RPC request that recently completed @@ -431,6 +440,20 @@ void xprt_release_rqst_cong(struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); +/* + * Clear the congestion window wait flag and wake up the next + * entry on xprt->sending + */ +static void +xprt_clear_congestion_window_wait(struct rpc_xprt *xprt) +{ + if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) { + spin_lock_bh(&xprt->transport_lock); + __xprt_lock_write_next_cong(xprt); + spin_unlock_bh(&xprt->transport_lock); + } +} + /** * xprt_adjust_cwnd - adjust transport congestion window * @xprt: pointer to xprt @@ -488,39 +511,46 @@ EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks); /** * xprt_wait_for_buffer_space - wait for transport output buffer to clear - * @task: task to be put to sleep - * @action: function pointer to be executed after wait + * @xprt: transport * * Note that we only set the timer for the case of RPC_IS_SOFT(), since * we don't in general want to force a socket disconnection due to * an incomplete RPC call transmission. */ -void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action) +void xprt_wait_for_buffer_space(struct rpc_xprt *xprt) { - struct rpc_rqst *req = task->tk_rqstp; - struct rpc_xprt *xprt = req->rq_xprt; - - task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; - rpc_sleep_on(&xprt->pending, task, action); + set_bit(XPRT_WRITE_SPACE, &xprt->state); } EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); +static bool +xprt_clear_write_space_locked(struct rpc_xprt *xprt) +{ + if (test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state)) { + __xprt_lock_write_next(xprt); + dprintk("RPC: write space: waking waiting task on " + "xprt %p\n", xprt); + return true; + } + return false; +} + /** * xprt_write_space - wake the task waiting for transport output buffer space * @xprt: transport with waiting tasks * * Can be called in a soft IRQ context, so xprt_write_space never sleeps. */ -void xprt_write_space(struct rpc_xprt *xprt) +bool xprt_write_space(struct rpc_xprt *xprt) { + bool ret; + + if (!test_bit(XPRT_WRITE_SPACE, &xprt->state)) + return false; spin_lock_bh(&xprt->transport_lock); - if (xprt->snd_task) { - dprintk("RPC: write space: waking waiting task on " - "xprt %p\n", xprt); - rpc_wake_up_queued_task_on_wq(xprtiod_workqueue, - &xprt->pending, xprt->snd_task); - } + ret = xprt_clear_write_space_locked(xprt); spin_unlock_bh(&xprt->transport_lock); + return ret; } EXPORT_SYMBOL_GPL(xprt_write_space); @@ -631,6 +661,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) dprintk("RPC: disconnected transport %p\n", xprt); spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); + xprt_clear_write_space_locked(xprt); xprt_wake_pending_tasks(xprt, -EAGAIN); spin_unlock_bh(&xprt->transport_lock); } @@ -654,6 +685,22 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(xprt_force_disconnect); +static unsigned int +xprt_connect_cookie(struct rpc_xprt *xprt) +{ + return READ_ONCE(xprt->connect_cookie); +} + +static bool +xprt_request_retransmit_after_disconnect(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + return req->rq_connect_cookie != xprt_connect_cookie(xprt) || + !xprt_connected(xprt); +} + /** * xprt_conditional_disconnect - force a transport to disconnect * @xprt: transport to disconnect @@ -692,7 +739,7 @@ static void xprt_schedule_autodisconnect(struct rpc_xprt *xprt) __must_hold(&xprt->transport_lock) { - if (list_empty(&xprt->recv) && xprt_has_timer(xprt)) + if (RB_EMPTY_ROOT(&xprt->recv_queue) && xprt_has_timer(xprt)) mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout); } @@ -702,7 +749,7 @@ xprt_init_autodisconnect(struct timer_list *t) struct rpc_xprt *xprt = from_timer(xprt, t, timer); spin_lock(&xprt->transport_lock); - if (!list_empty(&xprt->recv)) + if (!RB_EMPTY_ROOT(&xprt->recv_queue)) goto out_abort; /* Reset xprt->last_used to avoid connect/autodisconnect cycling */ xprt->last_used = jiffies; @@ -726,7 +773,6 @@ bool xprt_lock_connect(struct rpc_xprt *xprt, goto out; if (xprt->snd_task != task) goto out; - xprt_task_clear_bytes_sent(task); xprt->snd_task = cookie; ret = true; out: @@ -772,7 +818,6 @@ void xprt_connect(struct rpc_task *task) xprt->ops->close(xprt); if (!xprt_connected(xprt)) { - task->tk_rqstp->rq_bytes_sent = 0; task->tk_timeout = task->tk_rqstp->rq_timeout; task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie; rpc_sleep_on(&xprt->pending, task, xprt_connect_status); @@ -789,17 +834,11 @@ void xprt_connect(struct rpc_task *task) static void xprt_connect_status(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; - - if (task->tk_status == 0) { - xprt->stat.connect_count++; - xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; + switch (task->tk_status) { + case 0: dprintk("RPC: %5u xprt_connect_status: connection established\n", task->tk_pid); - return; - } - - switch (task->tk_status) { + break; case -ECONNREFUSED: case -ECONNRESET: case -ECONNABORTED: @@ -816,28 +855,97 @@ static void xprt_connect_status(struct rpc_task *task) default: dprintk("RPC: %5u xprt_connect_status: error %d connecting to " "server %s\n", task->tk_pid, -task->tk_status, - xprt->servername); + task->tk_rqstp->rq_xprt->servername); task->tk_status = -EIO; } } +enum xprt_xid_rb_cmp { + XID_RB_EQUAL, + XID_RB_LEFT, + XID_RB_RIGHT, +}; +static enum xprt_xid_rb_cmp +xprt_xid_cmp(__be32 xid1, __be32 xid2) +{ + if (xid1 == xid2) + return XID_RB_EQUAL; + if ((__force u32)xid1 < (__force u32)xid2) + return XID_RB_LEFT; + return XID_RB_RIGHT; +} + +static struct rpc_rqst * +xprt_request_rb_find(struct rpc_xprt *xprt, __be32 xid) +{ + struct rb_node *n = xprt->recv_queue.rb_node; + struct rpc_rqst *req; + + while (n != NULL) { + req = rb_entry(n, struct rpc_rqst, rq_recv); + switch (xprt_xid_cmp(xid, req->rq_xid)) { + case XID_RB_LEFT: + n = n->rb_left; + break; + case XID_RB_RIGHT: + n = n->rb_right; + break; + case XID_RB_EQUAL: + return req; + } + } + return NULL; +} + +static void +xprt_request_rb_insert(struct rpc_xprt *xprt, struct rpc_rqst *new) +{ + struct rb_node **p = &xprt->recv_queue.rb_node; + struct rb_node *n = NULL; + struct rpc_rqst *req; + + while (*p != NULL) { + n = *p; + req = rb_entry(n, struct rpc_rqst, rq_recv); + switch(xprt_xid_cmp(new->rq_xid, req->rq_xid)) { + case XID_RB_LEFT: + p = &n->rb_left; + break; + case XID_RB_RIGHT: + p = &n->rb_right; + break; + case XID_RB_EQUAL: + WARN_ON_ONCE(new != req); + return; + } + } + rb_link_node(&new->rq_recv, n, p); + rb_insert_color(&new->rq_recv, &xprt->recv_queue); +} + +static void +xprt_request_rb_remove(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + rb_erase(&req->rq_recv, &xprt->recv_queue); +} + /** * xprt_lookup_rqst - find an RPC request corresponding to an XID * @xprt: transport on which the original request was transmitted * @xid: RPC XID of incoming reply * - * Caller holds xprt->recv_lock. + * Caller holds xprt->queue_lock. */ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) { struct rpc_rqst *entry; - list_for_each_entry(entry, &xprt->recv, rq_list) - if (entry->rq_xid == xid) { - trace_xprt_lookup_rqst(xprt, xid, 0); - entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime); - return entry; - } + entry = xprt_request_rb_find(xprt, xid); + if (entry != NULL) { + trace_xprt_lookup_rqst(xprt, xid, 0); + entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime); + return entry; + } dprintk("RPC: xprt_lookup_rqst did not find xid %08x\n", ntohl(xid)); @@ -847,16 +955,22 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) } EXPORT_SYMBOL_GPL(xprt_lookup_rqst); +static bool +xprt_is_pinned_rqst(struct rpc_rqst *req) +{ + return atomic_read(&req->rq_pin) != 0; +} + /** * xprt_pin_rqst - Pin a request on the transport receive list * @req: Request to pin * * Caller must ensure this is atomic with the call to xprt_lookup_rqst() - * so should be holding the xprt transport lock. + * so should be holding the xprt receive lock. */ void xprt_pin_rqst(struct rpc_rqst *req) { - set_bit(RPC_TASK_MSG_RECV, &req->rq_task->tk_runstate); + atomic_inc(&req->rq_pin); } EXPORT_SYMBOL_GPL(xprt_pin_rqst); @@ -864,38 +978,87 @@ EXPORT_SYMBOL_GPL(xprt_pin_rqst); * xprt_unpin_rqst - Unpin a request on the transport receive list * @req: Request to pin * - * Caller should be holding the xprt transport lock. + * Caller should be holding the xprt receive lock. */ void xprt_unpin_rqst(struct rpc_rqst *req) { - struct rpc_task *task = req->rq_task; - - clear_bit(RPC_TASK_MSG_RECV, &task->tk_runstate); - if (test_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate)) - wake_up_bit(&task->tk_runstate, RPC_TASK_MSG_RECV); + if (!test_bit(RPC_TASK_MSG_PIN_WAIT, &req->rq_task->tk_runstate)) { + atomic_dec(&req->rq_pin); + return; + } + if (atomic_dec_and_test(&req->rq_pin)) + wake_up_var(&req->rq_pin); } EXPORT_SYMBOL_GPL(xprt_unpin_rqst); static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req) -__must_hold(&req->rq_xprt->recv_lock) { - struct rpc_task *task = req->rq_task; + wait_var_event(&req->rq_pin, !xprt_is_pinned_rqst(req)); +} - if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) { - spin_unlock(&req->rq_xprt->recv_lock); - set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); - wait_on_bit(&task->tk_runstate, RPC_TASK_MSG_RECV, - TASK_UNINTERRUPTIBLE); - clear_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); - spin_lock(&req->rq_xprt->recv_lock); - } +static bool +xprt_request_data_received(struct rpc_task *task) +{ + return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) && + READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) != 0; +} + +static bool +xprt_request_need_enqueue_receive(struct rpc_task *task, struct rpc_rqst *req) +{ + return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) && + READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) == 0; +} + +/** + * xprt_request_enqueue_receive - Add an request to the receive queue + * @task: RPC task + * + */ +void +xprt_request_enqueue_receive(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (!xprt_request_need_enqueue_receive(task, req)) + return; + spin_lock(&xprt->queue_lock); + + /* Update the softirq receive buffer */ + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, + sizeof(req->rq_private_buf)); + + /* Add request to the receive list */ + xprt_request_rb_insert(xprt, req); + set_bit(RPC_TASK_NEED_RECV, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + + xprt_reset_majortimeo(req); + /* Turn off autodisconnect */ + del_singleshot_timer_sync(&xprt->timer); +} + +/** + * xprt_request_dequeue_receive_locked - Remove a request from the receive queue + * @task: RPC task + * + * Caller must hold xprt->queue_lock. + */ +static void +xprt_request_dequeue_receive_locked(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) + xprt_request_rb_remove(req->rq_xprt, req); } /** * xprt_update_rtt - Update RPC RTT statistics * @task: RPC request that recently completed * - * Caller holds xprt->recv_lock. + * Caller holds xprt->queue_lock. */ void xprt_update_rtt(struct rpc_task *task) { @@ -917,7 +1080,7 @@ EXPORT_SYMBOL_GPL(xprt_update_rtt); * @task: RPC request that recently completed * @copied: actual number of bytes received from the transport * - * Caller holds xprt->recv_lock. + * Caller holds xprt->queue_lock. */ void xprt_complete_rqst(struct rpc_task *task, int copied) { @@ -930,12 +1093,12 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) xprt->stat.recvs++; - list_del_init(&req->rq_list); req->rq_private_buf.len = copied; /* Ensure all writes are done before we update */ /* req->rq_reply_bytes_recvd */ smp_wmb(); req->rq_reply_bytes_recvd = copied; + xprt_request_dequeue_receive_locked(task); rpc_wake_up_queued_task(&xprt->pending, task); } EXPORT_SYMBOL_GPL(xprt_complete_rqst); @@ -956,6 +1119,172 @@ static void xprt_timer(struct rpc_task *task) task->tk_status = 0; } +/** + * xprt_request_wait_receive - wait for the reply to an RPC request + * @task: RPC task about to send a request + * + */ +void xprt_request_wait_receive(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (!test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) + return; + /* + * Sleep on the pending queue if we're expecting a reply. + * The spinlock ensures atomicity between the test of + * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on(). + */ + spin_lock(&xprt->queue_lock); + if (test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) { + xprt->ops->set_retrans_timeout(task); + rpc_sleep_on(&xprt->pending, task, xprt_timer); + /* + * Send an extra queue wakeup call if the + * connection was dropped in case the call to + * rpc_sleep_on() raced. + */ + if (xprt_request_retransmit_after_disconnect(task)) + rpc_wake_up_queued_task_set_status(&xprt->pending, + task, -ENOTCONN); + } + spin_unlock(&xprt->queue_lock); +} + +static bool +xprt_request_need_enqueue_transmit(struct rpc_task *task, struct rpc_rqst *req) +{ + return !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); +} + +/** + * xprt_request_enqueue_transmit - queue a task for transmission + * @task: pointer to rpc_task + * + * Add a task to the transmission queue. + */ +void +xprt_request_enqueue_transmit(struct rpc_task *task) +{ + struct rpc_rqst *pos, *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (xprt_request_need_enqueue_transmit(task, req)) { + spin_lock(&xprt->queue_lock); + /* + * Requests that carry congestion control credits are added + * to the head of the list to avoid starvation issues. + */ + if (req->rq_cong) { + xprt_clear_congestion_window_wait(xprt); + list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { + if (pos->rq_cong) + continue; + /* Note: req is added _before_ pos */ + list_add_tail(&req->rq_xmit, &pos->rq_xmit); + INIT_LIST_HEAD(&req->rq_xmit2); + goto out; + } + } else if (RPC_IS_SWAPPER(task)) { + list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { + if (pos->rq_cong || pos->rq_bytes_sent) + continue; + if (RPC_IS_SWAPPER(pos->rq_task)) + continue; + /* Note: req is added _before_ pos */ + list_add_tail(&req->rq_xmit, &pos->rq_xmit); + INIT_LIST_HEAD(&req->rq_xmit2); + goto out; + } + } else { + list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { + if (pos->rq_task->tk_owner != task->tk_owner) + continue; + list_add_tail(&req->rq_xmit2, &pos->rq_xmit2); + INIT_LIST_HEAD(&req->rq_xmit); + goto out; + } + } + list_add_tail(&req->rq_xmit, &xprt->xmit_queue); + INIT_LIST_HEAD(&req->rq_xmit2); +out: + set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + } +} + +/** + * xprt_request_dequeue_transmit_locked - remove a task from the transmission queue + * @task: pointer to rpc_task + * + * Remove a task from the transmission queue + * Caller must hold xprt->queue_lock + */ +static void +xprt_request_dequeue_transmit_locked(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + if (!test_and_clear_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + return; + if (!list_empty(&req->rq_xmit)) { + list_del(&req->rq_xmit); + if (!list_empty(&req->rq_xmit2)) { + struct rpc_rqst *next = list_first_entry(&req->rq_xmit2, + struct rpc_rqst, rq_xmit2); + list_del(&req->rq_xmit2); + list_add_tail(&next->rq_xmit, &next->rq_xprt->xmit_queue); + } + } else + list_del(&req->rq_xmit2); +} + +/** + * xprt_request_dequeue_transmit - remove a task from the transmission queue + * @task: pointer to rpc_task + * + * Remove a task from the transmission queue + */ +static void +xprt_request_dequeue_transmit(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + spin_lock(&xprt->queue_lock); + xprt_request_dequeue_transmit_locked(task); + spin_unlock(&xprt->queue_lock); +} + +/** + * xprt_request_prepare - prepare an encoded request for transport + * @req: pointer to rpc_rqst + * + * Calls into the transport layer to do whatever is needed to prepare + * the request for transmission or receive. + */ +void +xprt_request_prepare(struct rpc_rqst *req) +{ + struct rpc_xprt *xprt = req->rq_xprt; + + if (xprt->ops->prepare_request) + xprt->ops->prepare_request(req); +} + +/** + * xprt_request_need_retransmit - Test if a task needs retransmission + * @task: pointer to rpc_task + * + * Test for whether a connection breakage requires the task to retransmit + */ +bool +xprt_request_need_retransmit(struct rpc_task *task) +{ + return xprt_request_retransmit_after_disconnect(task); +} + /** * xprt_prepare_transmit - reserve the transport before sending a request * @task: RPC task about to send a request @@ -965,32 +1294,18 @@ bool xprt_prepare_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - bool ret = false; dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid); - spin_lock_bh(&xprt->transport_lock); - if (!req->rq_bytes_sent) { - if (req->rq_reply_bytes_recvd) { - task->tk_status = req->rq_reply_bytes_recvd; - goto out_unlock; - } - if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) - && xprt_connected(xprt) - && req->rq_connect_cookie == xprt->connect_cookie) { - xprt->ops->set_retrans_timeout(task); - rpc_sleep_on(&xprt->pending, task, xprt_timer); - goto out_unlock; - } - } - if (!xprt->ops->reserve_xprt(xprt, task)) { - task->tk_status = -EAGAIN; - goto out_unlock; + if (!xprt_lock_write(xprt, task)) { + /* Race breaker: someone may have transmitted us */ + if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + rpc_wake_up_queued_task_set_status(&xprt->sending, + task, 0); + return false; + } - ret = true; -out_unlock: - spin_unlock_bh(&xprt->transport_lock); - return ret; + return true; } void xprt_end_transmit(struct rpc_task *task) @@ -999,54 +1314,62 @@ void xprt_end_transmit(struct rpc_task *task) } /** - * xprt_transmit - send an RPC request on a transport - * @task: controlling RPC task + * xprt_request_transmit - send an RPC request on a transport + * @req: pointer to request to transmit + * @snd_task: RPC task that owns the transport lock * - * We have to copy the iovec because sendmsg fiddles with its contents. + * This performs the transmission of a single request. + * Note that if the request is not the same as snd_task, then it + * does need to be pinned. + * Returns '0' on success. */ -void xprt_transmit(struct rpc_task *task) +static int +xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) { - struct rpc_rqst *req = task->tk_rqstp; - struct rpc_xprt *xprt = req->rq_xprt; + struct rpc_xprt *xprt = req->rq_xprt; + struct rpc_task *task = req->rq_task; unsigned int connect_cookie; + int is_retrans = RPC_WAS_SENT(task); int status; dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); - if (!req->rq_reply_bytes_recvd) { - if (list_empty(&req->rq_list) && rpc_reply_expected(task)) { - /* - * Add to the list only if we're expecting a reply - */ - /* Update the softirq receive buffer */ - memcpy(&req->rq_private_buf, &req->rq_rcv_buf, - sizeof(req->rq_private_buf)); - /* Add request to the receive list */ - spin_lock(&xprt->recv_lock); - list_add_tail(&req->rq_list, &xprt->recv); - spin_unlock(&xprt->recv_lock); - xprt_reset_majortimeo(req); - /* Turn off autodisconnect */ - del_singleshot_timer_sync(&xprt->timer); + if (!req->rq_bytes_sent) { + if (xprt_request_data_received(task)) { + status = 0; + goto out_dequeue; } - } else if (!req->rq_bytes_sent) - return; + /* Verify that our message lies in the RPCSEC_GSS window */ + if (rpcauth_xmit_need_reencode(task)) { + status = -EBADMSG; + goto out_dequeue; + } + } + + /* + * Update req->rq_ntrans before transmitting to avoid races with + * xprt_update_rtt(), which needs to know that it is recording a + * reply to the first transmission. + */ + req->rq_ntrans++; connect_cookie = xprt->connect_cookie; - status = xprt->ops->send_request(task); + status = xprt->ops->send_request(req); trace_xprt_transmit(xprt, req->rq_xid, status); if (status != 0) { - task->tk_status = status; - return; + req->rq_ntrans--; + return status; } + + if (is_retrans) + task->tk_client->cl_stats->rpcretrans++; + xprt_inject_disconnect(xprt); dprintk("RPC: %5u xmit complete\n", task->tk_pid); task->tk_flags |= RPC_TASK_SENT; spin_lock_bh(&xprt->transport_lock); - xprt->ops->set_retrans_timeout(task); - xprt->stat.sends++; xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; xprt->stat.bklog_u += xprt->backlog.qlen; @@ -1055,25 +1378,49 @@ void xprt_transmit(struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); req->rq_connect_cookie = connect_cookie; - if (rpc_reply_expected(task) && !READ_ONCE(req->rq_reply_bytes_recvd)) { - /* - * Sleep on the pending queue if we're expecting a reply. - * The spinlock ensures atomicity between the test of - * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on(). - */ - spin_lock(&xprt->recv_lock); - if (!req->rq_reply_bytes_recvd) { - rpc_sleep_on(&xprt->pending, task, xprt_timer); - /* - * Send an extra queue wakeup call if the - * connection was dropped in case the call to - * rpc_sleep_on() raced. - */ - if (!xprt_connected(xprt)) - xprt_wake_pending_tasks(xprt, -ENOTCONN); - } - spin_unlock(&xprt->recv_lock); +out_dequeue: + xprt_request_dequeue_transmit(task); + rpc_wake_up_queued_task_set_status(&xprt->sending, task, status); + return status; +} + +/** + * xprt_transmit - send an RPC request on a transport + * @task: controlling RPC task + * + * Attempts to drain the transmit queue. On exit, either the transport + * signalled an error that needs to be handled before transmission can + * resume, or @task finished transmitting, and detected that it already + * received a reply. + */ +void +xprt_transmit(struct rpc_task *task) +{ + struct rpc_rqst *next, *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + int status; + + spin_lock(&xprt->queue_lock); + while (!list_empty(&xprt->xmit_queue)) { + next = list_first_entry(&xprt->xmit_queue, + struct rpc_rqst, rq_xmit); + xprt_pin_rqst(next); + spin_unlock(&xprt->queue_lock); + status = xprt_request_transmit(next, task); + if (status == -EBADMSG && next != req) + status = 0; + cond_resched(); + spin_lock(&xprt->queue_lock); + xprt_unpin_rqst(next); + if (status == 0) { + if (!xprt_request_data_received(task) || + test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + continue; + } else if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + task->tk_status = status; + break; } + spin_unlock(&xprt->queue_lock); } static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) @@ -1170,20 +1517,6 @@ out_init_req: } EXPORT_SYMBOL_GPL(xprt_alloc_slot); -void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) -{ - /* Note: grabbing the xprt_lock_write() ensures that we throttle - * new slot allocation if the transport is congested (i.e. when - * reconnecting a stream transport or when out of socket write - * buffer space). - */ - if (xprt_lock_write(xprt, task)) { - xprt_alloc_slot(xprt, task); - xprt_release_write(xprt, task); - } -} -EXPORT_SYMBOL_GPL(xprt_lock_and_alloc_slot); - void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { spin_lock(&xprt->reserve_lock); @@ -1250,6 +1583,60 @@ void xprt_free(struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(xprt_free); +static void +xprt_init_connect_cookie(struct rpc_rqst *req, struct rpc_xprt *xprt) +{ + req->rq_connect_cookie = xprt_connect_cookie(xprt) - 1; +} + +static __be32 +xprt_alloc_xid(struct rpc_xprt *xprt) +{ + __be32 xid; + + spin_lock(&xprt->reserve_lock); + xid = (__force __be32)xprt->xid++; + spin_unlock(&xprt->reserve_lock); + return xid; +} + +static void +xprt_init_xid(struct rpc_xprt *xprt) +{ + xprt->xid = prandom_u32(); +} + +static void +xprt_request_init(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_rqst *req = task->tk_rqstp; + + req->rq_timeout = task->tk_client->cl_timeout->to_initval; + req->rq_task = task; + req->rq_xprt = xprt; + req->rq_buffer = NULL; + req->rq_xid = xprt_alloc_xid(xprt); + xprt_init_connect_cookie(req, xprt); + req->rq_bytes_sent = 0; + req->rq_snd_buf.len = 0; + req->rq_snd_buf.buflen = 0; + req->rq_rcv_buf.len = 0; + req->rq_rcv_buf.buflen = 0; + req->rq_release_snd_buf = NULL; + xprt_reset_majortimeo(req); + dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, + req, ntohl(req->rq_xid)); +} + +static void +xprt_do_reserve(struct rpc_xprt *xprt, struct rpc_task *task) +{ + xprt->ops->alloc_slot(xprt, task); + if (task->tk_rqstp != NULL) + xprt_request_init(task); +} + /** * xprt_reserve - allocate an RPC request slot * @task: RPC task requesting a slot allocation @@ -1269,7 +1656,7 @@ void xprt_reserve(struct rpc_task *task) task->tk_timeout = 0; task->tk_status = -EAGAIN; if (!xprt_throttle_congested(xprt, task)) - xprt->ops->alloc_slot(xprt, task); + xprt_do_reserve(xprt, task); } /** @@ -1291,45 +1678,29 @@ void xprt_retry_reserve(struct rpc_task *task) task->tk_timeout = 0; task->tk_status = -EAGAIN; - xprt->ops->alloc_slot(xprt, task); -} - -static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt) -{ - __be32 xid; - - spin_lock(&xprt->reserve_lock); - xid = (__force __be32)xprt->xid++; - spin_unlock(&xprt->reserve_lock); - return xid; + xprt_do_reserve(xprt, task); } -static inline void xprt_init_xid(struct rpc_xprt *xprt) -{ - xprt->xid = prandom_u32(); -} - -void xprt_request_init(struct rpc_task *task) +static void +xprt_request_dequeue_all(struct rpc_task *task, struct rpc_rqst *req) { - struct rpc_xprt *xprt = task->tk_xprt; - struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; - INIT_LIST_HEAD(&req->rq_list); - req->rq_timeout = task->tk_client->cl_timeout->to_initval; - req->rq_task = task; - req->rq_xprt = xprt; - req->rq_buffer = NULL; - req->rq_xid = xprt_alloc_xid(xprt); - req->rq_connect_cookie = xprt->connect_cookie - 1; - req->rq_bytes_sent = 0; - req->rq_snd_buf.len = 0; - req->rq_snd_buf.buflen = 0; - req->rq_rcv_buf.len = 0; - req->rq_rcv_buf.buflen = 0; - req->rq_release_snd_buf = NULL; - xprt_reset_majortimeo(req); - dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, - req, ntohl(req->rq_xid)); + if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) || + test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || + xprt_is_pinned_rqst(req)) { + spin_lock(&xprt->queue_lock); + xprt_request_dequeue_transmit_locked(task); + xprt_request_dequeue_receive_locked(task); + while (xprt_is_pinned_rqst(req)) { + set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + xprt_wait_on_pinned_rqst(req); + spin_lock(&xprt->queue_lock); + clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + } + spin_unlock(&xprt->queue_lock); + } } /** @@ -1345,8 +1716,7 @@ void xprt_release(struct rpc_task *task) if (req == NULL) { if (task->tk_client) { xprt = task->tk_xprt; - if (xprt->snd_task == task) - xprt_release_write(xprt, task); + xprt_release_write(xprt, task); } return; } @@ -1356,12 +1726,7 @@ void xprt_release(struct rpc_task *task) task->tk_ops->rpc_count_stats(task, task->tk_calldata); else if (task->tk_client) rpc_count_iostats(task, task->tk_client->cl_metrics); - spin_lock(&xprt->recv_lock); - if (!list_empty(&req->rq_list)) { - list_del_init(&req->rq_list); - xprt_wait_on_pinned_rqst(req); - } - spin_unlock(&xprt->recv_lock); + xprt_request_dequeue_all(task, req); spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); if (xprt->ops->release_request) @@ -1372,6 +1737,7 @@ void xprt_release(struct rpc_task *task) if (req->rq_buffer) xprt->ops->buf_free(task); xprt_inject_disconnect(xprt); + xdr_free_bvec(&req->rq_rcv_buf); if (req->rq_cred != NULL) put_rpccred(req->rq_cred); task->tk_rqstp = NULL; @@ -1385,16 +1751,36 @@ void xprt_release(struct rpc_task *task) xprt_free_bc_request(req); } +#ifdef CONFIG_SUNRPC_BACKCHANNEL +void +xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task) +{ + struct xdr_buf *xbufp = &req->rq_snd_buf; + + task->tk_rqstp = req; + req->rq_task = task; + xprt_init_connect_cookie(req, req->rq_xprt); + /* + * Set up the xdr_buf length. + * This also indicates that the buffer is XDR encoded already. + */ + xbufp->len = xbufp->head[0].iov_len + xbufp->page_len + + xbufp->tail[0].iov_len; + req->rq_bytes_sent = 0; +} +#endif + static void xprt_init(struct rpc_xprt *xprt, struct net *net) { kref_init(&xprt->kref); spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->reserve_lock); - spin_lock_init(&xprt->recv_lock); + spin_lock_init(&xprt->queue_lock); INIT_LIST_HEAD(&xprt->free); - INIT_LIST_HEAD(&xprt->recv); + xprt->recv_queue = RB_ROOT; + INIT_LIST_HEAD(&xprt->xmit_queue); #if defined(CONFIG_SUNRPC_BACKCHANNEL) spin_lock_init(&xprt->bc_pa_lock); INIT_LIST_HEAD(&xprt->bc_pa_list); @@ -1407,7 +1793,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) rpc_init_wait_queue(&xprt->binding, "xprt_binding"); rpc_init_wait_queue(&xprt->pending, "xprt_pending"); - rpc_init_priority_wait_queue(&xprt->sending, "xprt_sending"); + rpc_init_wait_queue(&xprt->sending, "xprt_sending"); rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog"); xprt_init_xid(xprt); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 90adeff4c06b..e5b367a3e517 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -51,12 +51,11 @@ static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, rqst = &req->rl_slot; rqst->rq_xprt = xprt; - INIT_LIST_HEAD(&rqst->rq_list); INIT_LIST_HEAD(&rqst->rq_bc_list); __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); size = r_xprt->rx_data.inline_rsize; rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); @@ -201,6 +200,9 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) if (!xprt_connected(rqst->rq_xprt)) goto drop_connection; + if (!xprt_request_get_cong(rqst->rq_xprt, rqst)) + return -EBADSLT; + rc = rpcrdma_bc_marshal_reply(rqst); if (rc < 0) goto failed_marshal; @@ -228,16 +230,16 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpc_rqst *rqst, *tmp; - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { list_del(&rqst->rq_bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); rpcrdma_bc_free_rqst(r_xprt, rqst); - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); } - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); } /** @@ -255,9 +257,9 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) rpcrdma_recv_buffer_put(req->rl_reply); req->rl_reply = NULL; - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); } /** diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 0f7c465d9a5a..7f5632cd5a48 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -49,46 +49,7 @@ fmr_is_supported(struct rpcrdma_ia *ia) return true; } -static int -fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) -{ - static struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_FMR_SGES, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - - mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(u64), GFP_KERNEL); - if (!mr->fmr.fm_physaddrs) - goto out_free; - - mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(*mr->mr_sg), GFP_KERNEL); - if (!mr->mr_sg) - goto out_free; - - sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); - - mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, - &fmr_attr); - if (IS_ERR(mr->fmr.fm_mr)) - goto out_fmr_err; - - INIT_LIST_HEAD(&mr->mr_list); - return 0; - -out_fmr_err: - dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, - PTR_ERR(mr->fmr.fm_mr)); - -out_free: - kfree(mr->mr_sg); - kfree(mr->fmr.fm_physaddrs); - return -ENOMEM; -} - -static int +static void __fmr_unmap(struct rpcrdma_mr *mr) { LIST_HEAD(l); @@ -97,13 +58,16 @@ __fmr_unmap(struct rpcrdma_mr *mr) list_add(&mr->fmr.fm_mr->list, &l); rc = ib_unmap_fmr(&l); list_del(&mr->fmr.fm_mr->list); - return rc; + if (rc) + pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", + mr, rc); } +/* Release an MR. + */ static void fmr_op_release_mr(struct rpcrdma_mr *mr) { - LIST_HEAD(unmap_list); int rc; kfree(mr->fmr.fm_physaddrs); @@ -112,10 +76,7 @@ fmr_op_release_mr(struct rpcrdma_mr *mr) /* In case this one was left mapped, try to unmap it * to prevent dealloc_fmr from failing with EBUSY */ - rc = __fmr_unmap(mr); - if (rc) - pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", - mr, rc); + __fmr_unmap(mr); rc = ib_dealloc_fmr(mr->fmr.fm_mr); if (rc) @@ -125,40 +86,68 @@ fmr_op_release_mr(struct rpcrdma_mr *mr) kfree(mr); } -/* Reset of a single FMR. +/* MRs are dynamically allocated, so simply clean up and release the MR. + * A replacement MR will subsequently be allocated on demand. */ static void -fmr_op_recover_mr(struct rpcrdma_mr *mr) +fmr_mr_recycle_worker(struct work_struct *work) { + struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - int rc; - /* ORDER: invalidate first */ - rc = __fmr_unmap(mr); - if (rc) - goto out_release; - - /* ORDER: then DMA unmap */ - rpcrdma_mr_unmap_and_put(mr); + trace_xprtrdma_mr_recycle(mr); - r_xprt->rx_stats.mrs_recovered++; - return; - -out_release: - pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr); - r_xprt->rx_stats.mrs_orphaned++; - - trace_xprtrdma_dma_unmap(mr); + trace_xprtrdma_mr_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); spin_lock(&r_xprt->rx_buf.rb_mrlock); list_del(&mr->mr_all); + r_xprt->rx_stats.mrs_recycled++; spin_unlock(&r_xprt->rx_buf.rb_mrlock); - fmr_op_release_mr(mr); } +static int +fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +{ + static struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + + mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(u64), GFP_KERNEL); + if (!mr->fmr.fm_physaddrs) + goto out_free; + + mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) + goto out_free; + + sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); + + mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, + &fmr_attr); + if (IS_ERR(mr->fmr.fm_mr)) + goto out_fmr_err; + + INIT_LIST_HEAD(&mr->mr_list); + INIT_WORK(&mr->mr_recycle, fmr_mr_recycle_worker); + return 0; + +out_fmr_err: + dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, + PTR_ERR(mr->fmr.fm_mr)); + +out_free: + kfree(mr->mr_sg); + kfree(mr->fmr.fm_physaddrs); + return -ENOMEM; +} + /* On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr @@ -187,6 +176,7 @@ fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES); + ia->ri_max_segs += 2; /* segments for head and tail buffers */ return 0; } @@ -244,7 +234,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; - trace_xprtrdma_dma_map(mr); + trace_xprtrdma_mr_map(mr); for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); @@ -305,13 +295,13 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) { dprintk("RPC: %s: unmapping fmr %p\n", __func__, &mr->fmr); - trace_xprtrdma_localinv(mr); + trace_xprtrdma_mr_localinv(mr); list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); } r_xprt->rx_stats.local_inv_needed++; rc = ib_unmap_fmr(&unmap_list); if (rc) - goto out_reset; + goto out_release; /* ORDER: Now DMA unmap all of the req's MRs, and return * them to the free MW list. @@ -324,13 +314,13 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) return; -out_reset: +out_release: pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); while (!list_empty(mrs)) { mr = rpcrdma_mr_pop(mrs); list_del(&mr->fmr.fm_mr->list); - fmr_op_recover_mr(mr); + rpcrdma_mr_recycle(mr); } } @@ -338,7 +328,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_send = fmr_op_send, .ro_unmap_sync = fmr_op_unmap_sync, - .ro_recover_mr = fmr_op_recover_mr, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, .ro_init_mr = fmr_op_init_mr, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 1bb00dd6ccdb..fc6378cc0c1c 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -97,6 +97,44 @@ out_not_supported: return false; } +static void +frwr_op_release_mr(struct rpcrdma_mr *mr) +{ + int rc; + + rc = ib_dereg_mr(mr->frwr.fr_mr); + if (rc) + pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", + mr, rc); + kfree(mr->mr_sg); + kfree(mr); +} + +/* MRs are dynamically allocated, so simply clean up and release the MR. + * A replacement MR will subsequently be allocated on demand. + */ +static void +frwr_mr_recycle_worker(struct work_struct *work) +{ + struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); + enum rpcrdma_frwr_state state = mr->frwr.fr_state; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + + trace_xprtrdma_mr_recycle(mr); + + if (state != FRWR_FLUSHED_LI) { + trace_xprtrdma_mr_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + } + + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + r_xprt->rx_stats.mrs_recycled++; + spin_unlock(&r_xprt->rx_buf.rb_mrlock); + frwr_op_release_mr(mr); +} + static int frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { @@ -113,6 +151,7 @@ frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) goto out_list_err; INIT_LIST_HEAD(&mr->mr_list); + INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker); sg_init_table(mr->mr_sg, depth); init_completion(&frwr->fr_linv_done); return 0; @@ -131,79 +170,6 @@ out_list_err: return rc; } -static void -frwr_op_release_mr(struct rpcrdma_mr *mr) -{ - int rc; - - rc = ib_dereg_mr(mr->frwr.fr_mr); - if (rc) - pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", - mr, rc); - kfree(mr->mr_sg); - kfree(mr); -} - -static int -__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) -{ - struct rpcrdma_frwr *frwr = &mr->frwr; - int rc; - - rc = ib_dereg_mr(frwr->fr_mr); - if (rc) { - pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", - rc, mr); - return rc; - } - - frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, - ia->ri_max_frwr_depth); - if (IS_ERR(frwr->fr_mr)) { - pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", - PTR_ERR(frwr->fr_mr), mr); - return PTR_ERR(frwr->fr_mr); - } - - dprintk("RPC: %s: recovered FRWR %p\n", __func__, frwr); - frwr->fr_state = FRWR_IS_INVALID; - return 0; -} - -/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR. - */ -static void -frwr_op_recover_mr(struct rpcrdma_mr *mr) -{ - enum rpcrdma_frwr_state state = mr->frwr.fr_state; - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc; - - rc = __frwr_mr_reset(ia, mr); - if (state != FRWR_FLUSHED_LI) { - trace_xprtrdma_dma_unmap(mr); - ib_dma_unmap_sg(ia->ri_device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - } - if (rc) - goto out_release; - - rpcrdma_mr_put(mr); - r_xprt->rx_stats.mrs_recovered++; - return; - -out_release: - pr_err("rpcrdma: FRWR reset failed %d, %p released\n", rc, mr); - r_xprt->rx_stats.mrs_orphaned++; - - spin_lock(&r_xprt->rx_buf.rb_mrlock); - list_del(&mr->mr_all); - spin_unlock(&r_xprt->rx_buf.rb_mrlock); - - frwr_op_release_mr(mr); -} - /* On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr @@ -276,6 +242,7 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / ia->ri_max_frwr_depth); + ia->ri_max_segs += 2; /* segments for head and tail buffers */ return 0; } @@ -384,7 +351,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr = NULL; do { if (mr) - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); mr = rpcrdma_mr_get(r_xprt); if (!mr) return ERR_PTR(-EAGAIN); @@ -417,7 +384,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; - trace_xprtrdma_dma_map(mr); + trace_xprtrdma_mr_map(mr); ibmr = frwr->fr_mr; n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); @@ -451,7 +418,7 @@ out_dmamap_err: out_mapmr_err: pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", frwr->fr_mr, n, mr->mr_nents); - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); return ERR_PTR(-EIO); } @@ -499,7 +466,7 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - trace_xprtrdma_remoteinv(mr); + trace_xprtrdma_mr_remoteinv(mr); mr->frwr.fr_state = FRWR_IS_INVALID; rpcrdma_mr_unmap_and_put(mr); break; /* only one invalidated MR per RPC */ @@ -536,7 +503,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) mr->frwr.fr_state = FRWR_IS_INVALID; frwr = &mr->frwr; - trace_xprtrdma_localinv(mr); + trace_xprtrdma_mr_localinv(mr); frwr->fr_cqe.done = frwr_wc_localinv; last = &frwr->fr_invwr; @@ -570,7 +537,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) if (bad_wr != first) wait_for_completion(&frwr->fr_linv_done); if (rc) - goto reset_mrs; + goto out_release; /* ORDER: Now DMA unmap all of the MRs, and return * them to the free MR list. @@ -582,22 +549,21 @@ unmap: } return; -reset_mrs: +out_release: pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc); - /* Find and reset the MRs in the LOCAL_INV WRs that did not + /* Unmap and release the MRs in the LOCAL_INV WRs that did not * get posted. */ while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); mr = container_of(frwr, struct rpcrdma_mr, frwr); - - __frwr_mr_reset(ia, mr); - bad_wr = bad_wr->next; + + list_del(&mr->mr_list); + frwr_op_release_mr(mr); } - goto unmap; } const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { @@ -605,7 +571,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_send = frwr_op_send, .ro_reminv = frwr_op_reminv, .ro_unmap_sync = frwr_op_unmap_sync, - .ro_recover_mr = frwr_op_recover_mr, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, .ro_init_mr = frwr_op_init_mr, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c8ae983c6cc0..9f53e0240035 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -71,7 +71,6 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Read list size */ - maxsegs += 2; /* segment for head and tail buffers */ size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); /* Minimal Read chunk size */ @@ -97,7 +96,6 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Write list size */ - maxsegs += 2; /* segment for head and tail buffers */ size = sizeof(__be32); /* segment count */ size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ @@ -805,7 +803,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) struct rpcrdma_mr *mr; mr = rpcrdma_mr_pop(&req->rl_registered); - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); } /* This implementation supports the following combinations @@ -866,7 +864,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) out_err: switch (ret) { case -EAGAIN: - xprt_wait_for_buffer_space(rqst->rq_task, NULL); + xprt_wait_for_buffer_space(rqst->rq_xprt); break; case -ENOBUFS: break; @@ -1216,7 +1214,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rpc_rqst *rqst = rep->rr_rqst; - unsigned long cwnd; int status; xprt->reestablish_timeout = 0; @@ -1238,15 +1235,10 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) goto out_badheader; out: - spin_lock(&xprt->recv_lock); - cwnd = xprt->cwnd; - xprt->cwnd = r_xprt->rx_buf.rb_credits << RPC_CWNDSHIFT; - if (xprt->cwnd > cwnd) - xprt_release_rqst_cong(rqst->rq_task); - + spin_lock(&xprt->queue_lock); xprt_complete_rqst(rqst->rq_task, status); xprt_unpin_rqst(rqst); - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); return; /* If the incoming reply terminated a pending RPC, the next @@ -1345,19 +1337,23 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) /* Match incoming rpcrdma_rep to an rpcrdma_req to * get context for handling any incoming chunks. */ - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); rqst = xprt_lookup_rqst(xprt, rep->rr_xid); if (!rqst) goto out_norqst; xprt_pin_rqst(rqst); + spin_unlock(&xprt->queue_lock); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > buf->rb_max_requests) credits = buf->rb_max_requests; - buf->rb_credits = credits; - - spin_unlock(&xprt->recv_lock); + if (buf->rb_credits != credits) { + spin_lock_bh(&xprt->transport_lock); + buf->rb_credits = credits; + xprt->cwnd = credits << RPC_CWNDSHIFT; + spin_unlock_bh(&xprt->transport_lock); + } req = rpcr_to_rdmar(rqst); req->rl_reply = rep; @@ -1378,7 +1374,7 @@ out_badversion: * is corrupt. */ out_norqst: - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); trace_xprtrdma_reply_rqst(rep); goto repost; diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index a68180090554..f3c147d70286 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -5,8 +5,6 @@ * Support for backward direction RPCs on RPC/RDMA (server-side). */ -#include - #include #include "xprt_rdma.h" @@ -32,7 +30,6 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct kvec *dst, *src = &rcvbuf->head[0]; struct rpc_rqst *req; - unsigned long cwnd; u32 credits; size_t len; __be32 xid; @@ -56,7 +53,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, if (src->iov_len < 24) goto out_shortreply; - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); req = xprt_lookup_rqst(xprt, xid); if (!req) goto out_notfound; @@ -66,6 +63,8 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, if (dst->iov_len < len) goto out_unlock; memcpy(dst->iov_base, p, len); + xprt_pin_rqst(req); + spin_unlock(&xprt->queue_lock); credits = be32_to_cpup(rdma_resp + 2); if (credits == 0) @@ -74,19 +73,17 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, credits = r_xprt->rx_buf.rb_bc_max_requests; spin_lock_bh(&xprt->transport_lock); - cwnd = xprt->cwnd; xprt->cwnd = credits << RPC_CWNDSHIFT; - if (xprt->cwnd > cwnd) - xprt_release_rqst_cong(req->rq_task); spin_unlock_bh(&xprt->transport_lock); - + spin_lock(&xprt->queue_lock); ret = 0; xprt_complete_rqst(req->rq_task, rcvbuf->len); + xprt_unpin_rqst(req); rcvbuf->len = 0; out_unlock: - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); out: return ret; @@ -215,9 +212,8 @@ drop_connection: * connection. */ static int -xprt_rdma_bc_send_request(struct rpc_task *task) +xprt_rdma_bc_send_request(struct rpc_rqst *rqst) { - struct rpc_rqst *rqst = task->tk_rqstp; struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; struct svcxprt_rdma *rdma; int ret; @@ -225,12 +221,7 @@ xprt_rdma_bc_send_request(struct rpc_task *task) dprintk("svcrdma: sending bc call with xid: %08x\n", be32_to_cpu(rqst->rq_xid)); - if (!mutex_trylock(&sxprt->xpt_mutex)) { - rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL); - if (!mutex_trylock(&sxprt->xpt_mutex)) - return -EAGAIN; - rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task); - } + mutex_lock(&sxprt->xpt_mutex); ret = -ENOTCONN; rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); @@ -248,6 +239,7 @@ static void xprt_rdma_bc_close(struct rpc_xprt *xprt) { dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + xprt->cwnd = RPC_CWNDSHIFT; } static void @@ -256,7 +248,6 @@ xprt_rdma_bc_put(struct rpc_xprt *xprt) dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); xprt_free(xprt); - module_put(THIS_MODULE); } static const struct rpc_xprt_ops xprt_rdma_bc_procs = { @@ -328,20 +319,9 @@ xprt_setup_rdma_bc(struct xprt_create *args) args->bc_xprt->xpt_bc_xprt = xprt; xprt->bc_xprt = args->bc_xprt; - if (!try_module_get(THIS_MODULE)) - goto out_fail; - /* Final put for backchannel xprt is in __svc_rdma_free */ xprt_get(xprt); return xprt; - -out_fail: - xprt_rdma_free_addresses(xprt); - args->bc_xprt->xpt_bc_xprt = NULL; - args->bc_xprt->xpt_bc_xps = NULL; - xprt_put(xprt); - xprt_free(xprt); - return ERR_PTR(-EINVAL); } struct xprt_class xprt_rdma_bc = { diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 2848cafd4a17..2f7ec8912f49 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -475,10 +475,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* Qualify the transport resource defaults with the * capabilities of this particular device */ - newxprt->sc_max_send_sges = dev->attrs.max_send_sge; - /* transport hdr, head iovec, one page list entry, tail iovec */ - if (newxprt->sc_max_send_sges < 4) { - pr_err("svcrdma: too few Send SGEs available (%d)\n", + /* Transport header, head iovec, tail iovec */ + newxprt->sc_max_send_sges = 3; + /* Add one SGE per page list entry */ + newxprt->sc_max_send_sges += svcrdma_max_req_size / PAGE_SIZE; + if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) { + pr_err("svcrdma: too few Send SGEs available (%d needed)\n", newxprt->sc_max_send_sges); goto errout; } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 143ce2579ba9..ae2a83828953 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -225,69 +225,59 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt) } } -void -rpcrdma_conn_func(struct rpcrdma_ep *ep) -{ - schedule_delayed_work(&ep->rep_connect_worker, 0); -} - -void -rpcrdma_connect_worker(struct work_struct *work) -{ - struct rpcrdma_ep *ep = - container_of(work, struct rpcrdma_ep, rep_connect_worker.work); - struct rpcrdma_xprt *r_xprt = - container_of(ep, struct rpcrdma_xprt, rx_ep); - struct rpc_xprt *xprt = &r_xprt->rx_xprt; - - spin_lock_bh(&xprt->transport_lock); - if (ep->rep_connected > 0) { - if (!xprt_test_and_set_connected(xprt)) - xprt_wake_pending_tasks(xprt, 0); - } else { - if (xprt_test_and_clear_connected(xprt)) - xprt_wake_pending_tasks(xprt, -ENOTCONN); - } - spin_unlock_bh(&xprt->transport_lock); -} - +/** + * xprt_rdma_connect_worker - establish connection in the background + * @work: worker thread context + * + * Requester holds the xprt's send lock to prevent activity on this + * transport while a fresh connection is being established. RPC tasks + * sleep on the xprt's pending queue waiting for connect to complete. + */ static void xprt_rdma_connect_worker(struct work_struct *work) { struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, rx_connect_worker.work); struct rpc_xprt *xprt = &r_xprt->rx_xprt; - int rc = 0; - - xprt_clear_connected(xprt); + int rc; rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); - if (rc) - xprt_wake_pending_tasks(xprt, rc); - xprt_clear_connecting(xprt); + if (r_xprt->rx_ep.rep_connected > 0) { + if (!xprt_test_and_set_connected(xprt)) { + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; + xprt_wake_pending_tasks(xprt, -EAGAIN); + } + } else { + if (xprt_test_and_clear_connected(xprt)) + xprt_wake_pending_tasks(xprt, rc); + } } +/** + * xprt_rdma_inject_disconnect - inject a connection fault + * @xprt: transport context + * + * If @xprt is connected, disconnect it to simulate spurious connection + * loss. + */ static void xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) { - struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, - rx_xprt); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); trace_xprtrdma_inject_dsc(r_xprt); rdma_disconnect(r_xprt->rx_ia.ri_id); } -/* - * xprt_rdma_destroy +/** + * xprt_rdma_destroy - Full tear down of transport + * @xprt: doomed transport context * - * Destroy the xprt. - * Free all memory associated with the object, including its own. - * NOTE: none of the *destroy methods free memory for their top-level - * objects, even though they may have allocated it (they do free - * private memory). It's up to the caller to handle it. In this - * case (RDMA transport), all structure memory is inlined with the - * struct rpcrdma_xprt. + * Caller guarantees there will be no more calls to us with + * this @xprt. */ static void xprt_rdma_destroy(struct rpc_xprt *xprt) @@ -298,8 +288,6 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) cancel_delayed_work_sync(&r_xprt->rx_connect_worker); - xprt_clear_connected(xprt); - rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); rpcrdma_buffer_destroy(&r_xprt->rx_buf); rpcrdma_ia_close(&r_xprt->rx_ia); @@ -442,11 +430,12 @@ out1: } /** - * xprt_rdma_close - Close down RDMA connection - * @xprt: generic transport to be closed + * xprt_rdma_close - close a transport connection + * @xprt: transport context * - * Called during transport shutdown reconnect, or device - * removal. Caller holds the transport's write lock. + * Called during transport shutdown, reconnect, or device removal. + * Caller holds @xprt's send lock to prevent activity on this + * transport while the connection is torn down. */ static void xprt_rdma_close(struct rpc_xprt *xprt) @@ -468,6 +457,12 @@ xprt_rdma_close(struct rpc_xprt *xprt) xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); rpcrdma_ep_disconnect(ep, ia); + + /* Prepare @xprt for the next connection by reinitializing + * its credit grant to one (see RFC 8166, Section 3.3.3). + */ + r_xprt->rx_buf.rb_credits = 1; + xprt->cwnd = RPC_CWNDSHIFT; } /** @@ -519,6 +514,12 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) xprt_force_disconnect(xprt); } +/** + * xprt_rdma_connect - try to establish a transport connection + * @xprt: transport state + * @task: RPC scheduler context + * + */ static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { @@ -638,13 +639,6 @@ rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, * 0: Success; rq_buffer points to RPC buffer to use * ENOMEM: Out of memory, call again later * EIO: A permanent error occurred, do not retry - * - * The RDMA allocate/free functions need the task structure as a place - * to hide the struct rpcrdma_req, which is necessary for the actual - * send/recv sequence. - * - * xprt_rdma_allocate provides buffers that are already mapped for - * DMA, and a local DMA lkey is provided for each. */ static int xprt_rdma_allocate(struct rpc_task *task) @@ -693,7 +687,7 @@ xprt_rdma_free(struct rpc_task *task) /** * xprt_rdma_send_request - marshal and send an RPC request - * @task: RPC task with an RPC message in rq_snd_buf + * @rqst: RPC message in rq_snd_buf * * Caller holds the transport's write lock. * @@ -706,9 +700,8 @@ xprt_rdma_free(struct rpc_task *task) * sent. Do not try to send this message again. */ static int -xprt_rdma_send_request(struct rpc_task *task) +xprt_rdma_send_request(struct rpc_rqst *rqst) { - struct rpc_rqst *rqst = task->tk_rqstp; struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); @@ -722,6 +715,9 @@ xprt_rdma_send_request(struct rpc_task *task) if (!xprt_connected(xprt)) goto drop_connection; + if (!xprt_request_get_cong(xprt, rqst)) + return -EBADSLT; + rc = rpcrdma_marshal_req(r_xprt, rqst); if (rc < 0) goto failed_marshal; @@ -741,7 +737,7 @@ xprt_rdma_send_request(struct rpc_task *task) /* An RPC with no reply will throw off credit accounting, * so drop the connection to reset the credit grant. */ - if (!rpc_reply_expected(task)) + if (!rpc_reply_expected(rqst->rq_task)) goto drop_connection; return 0; @@ -766,7 +762,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) 0, /* need a local port? */ xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, @@ -786,7 +782,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.nomsg_call_count); seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n", - r_xprt->rx_stats.mrs_recovered, + r_xprt->rx_stats.mrs_recycled, r_xprt->rx_stats.mrs_orphaned, r_xprt->rx_stats.mrs_allocated, r_xprt->rx_stats.local_inv_needed, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 956a5ea47b58..3ddba94c939f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -108,20 +108,48 @@ rpcrdma_destroy_wq(void) } } +/** + * rpcrdma_disconnect_worker - Force a disconnect + * @work: endpoint to be disconnected + * + * Provider callbacks can possibly run in an IRQ context. This function + * is invoked in a worker thread to guarantee that disconnect wake-up + * calls are always done in process context. + */ +static void +rpcrdma_disconnect_worker(struct work_struct *work) +{ + struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep, + rep_disconnect_worker.work); + struct rpcrdma_xprt *r_xprt = + container_of(ep, struct rpcrdma_xprt, rx_ep); + + xprt_force_disconnect(&r_xprt->rx_xprt); +} + +/** + * rpcrdma_qp_event_handler - Handle one QP event (error notification) + * @event: details of the event + * @context: ep that owns QP where event occurred + * + * Called from the RDMA provider (device driver) possibly in an interrupt + * context. + */ static void -rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) +rpcrdma_qp_event_handler(struct ib_event *event, void *context) { struct rpcrdma_ep *ep = context; struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, rx_ep); - trace_xprtrdma_qp_error(r_xprt, event); - pr_err("rpcrdma: %s on device %s ep %p\n", - ib_event_msg(event->event), event->device->name, context); + trace_xprtrdma_qp_event(r_xprt, event); + pr_err("rpcrdma: %s on device %s connected to %s:%s\n", + ib_event_msg(event->event), event->device->name, + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); if (ep->rep_connected == 1) { ep->rep_connected = -EIO; - rpcrdma_conn_func(ep); + schedule_delayed_work(&ep->rep_disconnect_worker, 0); wake_up_all(&ep->rep_connect_wait); } } @@ -219,38 +247,48 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, rpcrdma_set_max_header_sizes(r_xprt); } +/** + * rpcrdma_cm_event_handler - Handle RDMA CM events + * @id: rdma_cm_id on which an event has occurred + * @event: details of the event + * + * Called with @id's mutex held. Returns 1 if caller should + * destroy @id, otherwise 0. + */ static int -rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) +rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { - struct rpcrdma_xprt *xprt = id->context; - struct rpcrdma_ia *ia = &xprt->rx_ia; - struct rpcrdma_ep *ep = &xprt->rx_ep; - int connstate = 0; + struct rpcrdma_xprt *r_xprt = id->context; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + might_sleep(); - trace_xprtrdma_conn_upcall(xprt, event); + trace_xprtrdma_cm_event(r_xprt, event); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: ia->ri_async_rc = 0; complete(&ia->ri_done); - break; + return 0; case RDMA_CM_EVENT_ADDR_ERROR: ia->ri_async_rc = -EPROTO; complete(&ia->ri_done); - break; + return 0; case RDMA_CM_EVENT_ROUTE_ERROR: ia->ri_async_rc = -ENETUNREACH; complete(&ia->ri_done); - break; + return 0; case RDMA_CM_EVENT_DEVICE_REMOVAL: #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) pr_info("rpcrdma: removing device %s for %s:%s\n", ia->ri_device->name, - rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt)); + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); #endif set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); ep->rep_connected = -ENODEV; - xprt_force_disconnect(&xprt->rx_xprt); + xprt_force_disconnect(xprt); wait_for_completion(&ia->ri_remove_done); ia->ri_id = NULL; @@ -258,41 +296,40 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) /* Return 1 to ensure the core destroys the id. */ return 1; case RDMA_CM_EVENT_ESTABLISHED: - ++xprt->rx_xprt.connect_cookie; - connstate = 1; - rpcrdma_update_connect_private(xprt, &event->param.conn); - goto connected; + ++xprt->connect_cookie; + ep->rep_connected = 1; + rpcrdma_update_connect_private(r_xprt, &event->param.conn); + wake_up_all(&ep->rep_connect_wait); + break; case RDMA_CM_EVENT_CONNECT_ERROR: - connstate = -ENOTCONN; - goto connected; + ep->rep_connected = -ENOTCONN; + goto disconnected; case RDMA_CM_EVENT_UNREACHABLE: - connstate = -ENETUNREACH; - goto connected; + ep->rep_connected = -ENETUNREACH; + goto disconnected; case RDMA_CM_EVENT_REJECTED: dprintk("rpcrdma: connection to %s:%s rejected: %s\n", - rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), rdma_reject_msg(id, event->status)); - connstate = -ECONNREFUSED; + ep->rep_connected = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) - connstate = -EAGAIN; - goto connected; + ep->rep_connected = -EAGAIN; + goto disconnected; case RDMA_CM_EVENT_DISCONNECTED: - ++xprt->rx_xprt.connect_cookie; - connstate = -ECONNABORTED; -connected: - ep->rep_connected = connstate; - rpcrdma_conn_func(ep); + ++xprt->connect_cookie; + ep->rep_connected = -ECONNABORTED; +disconnected: + xprt_force_disconnect(xprt); wake_up_all(&ep->rep_connect_wait); - /*FALLTHROUGH*/ + break; default: - dprintk("RPC: %s: %s:%s on %s/%s (ep 0x%p): %s\n", - __func__, - rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), - ia->ri_device->name, ia->ri_ops->ro_displayname, - ep, rdma_event_msg(event->event)); break; } + dprintk("RPC: %s: %s:%s on %s/%s: %s\n", __func__, + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), + ia->ri_device->name, ia->ri_ops->ro_displayname, + rdma_event_msg(event->event)); return 0; } @@ -308,7 +345,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) init_completion(&ia->ri_done); init_completion(&ia->ri_remove_done); - id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_conn_upcall, + id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler, xprt, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) { rc = PTR_ERR(id); @@ -519,7 +556,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, if (rc) return rc; - ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; + ep->rep_attr.event_handler = rpcrdma_qp_event_handler; ep->rep_attr.qp_context = ep; ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_sge = max_sge; @@ -542,7 +579,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, cdata->max_requests >> 2); ep->rep_send_count = ep->rep_send_batch; init_waitqueue_head(&ep->rep_connect_wait); - INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); + INIT_DELAYED_WORK(&ep->rep_disconnect_worker, + rpcrdma_disconnect_worker); sendcq = ib_alloc_cq(ia->ri_device, NULL, ep->rep_attr.cap.max_send_wr + 1, @@ -615,7 +653,7 @@ out1: void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - cancel_delayed_work_sync(&ep->rep_connect_worker); + cancel_delayed_work_sync(&ep->rep_disconnect_worker); if (ia->ri_id && ia->ri_id->qp) { rpcrdma_ep_disconnect(ep, ia); @@ -728,6 +766,7 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + struct rpc_xprt *xprt = &r_xprt->rx_xprt; int rc; retry: @@ -754,6 +793,8 @@ retry: } ep->rep_connected = 0; + xprt_clear_connected(xprt); + rpcrdma_post_recvs(r_xprt, true); rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); @@ -877,7 +918,6 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) sc->sc_xprt = r_xprt; buf->rb_sc_ctxs[i] = sc; } - buf->rb_flags = 0; return 0; @@ -977,39 +1017,6 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) } } -static void -rpcrdma_mr_recovery_worker(struct work_struct *work) -{ - struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, - rb_recovery_worker.work); - struct rpcrdma_mr *mr; - - spin_lock(&buf->rb_recovery_lock); - while (!list_empty(&buf->rb_stale_mrs)) { - mr = rpcrdma_mr_pop(&buf->rb_stale_mrs); - spin_unlock(&buf->rb_recovery_lock); - - trace_xprtrdma_recover_mr(mr); - mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr); - - spin_lock(&buf->rb_recovery_lock); - } - spin_unlock(&buf->rb_recovery_lock); -} - -void -rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr) -{ - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - - spin_lock(&buf->rb_recovery_lock); - rpcrdma_mr_push(mr, &buf->rb_stale_mrs); - spin_unlock(&buf->rb_recovery_lock); - - schedule_delayed_work(&buf->rb_recovery_worker, 0); -} - static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { @@ -1019,7 +1026,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) LIST_HEAD(free); LIST_HEAD(all); - for (count = 0; count < 3; count++) { + for (count = 0; count < ia->ri_max_segs; count++) { struct rpcrdma_mr *mr; int rc; @@ -1138,18 +1145,15 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; int i, rc; + buf->rb_flags = 0; buf->rb_max_requests = r_xprt->rx_data.max_requests; buf->rb_bc_srv_max_requests = 0; spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); - spin_lock_init(&buf->rb_recovery_lock); INIT_LIST_HEAD(&buf->rb_mrs); INIT_LIST_HEAD(&buf->rb_all); - INIT_LIST_HEAD(&buf->rb_stale_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); - INIT_DELAYED_WORK(&buf->rb_recovery_worker, - rpcrdma_mr_recovery_worker); rpcrdma_mrs_create(r_xprt); @@ -1233,7 +1237,6 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - cancel_delayed_work_sync(&buf->rb_recovery_worker); cancel_delayed_work_sync(&buf->rb_refresh_worker); rpcrdma_sendctxs_destroy(buf); @@ -1326,7 +1329,7 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - trace_xprtrdma_dma_unmap(mr); + trace_xprtrdma_mr_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); __rpcrdma_mr_put(&r_xprt->rx_buf, mr); @@ -1518,9 +1521,11 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) struct ib_recv_wr *wr, *bad_wr; int needed, count, rc; + rc = 0; + count = 0; needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); if (buf->rb_posted_receives > needed) - return; + goto out; needed -= buf->rb_posted_receives; count = 0; @@ -1556,7 +1561,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) --needed; } if (!count) - return; + goto out; rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); @@ -1570,5 +1575,6 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) } } buf->rb_posted_receives += count; +out: trace_xprtrdma_post_recvs(r_xprt, count, rc); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2ca14f7c2d51..a13ccb643ce0 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -101,7 +101,7 @@ struct rpcrdma_ep { wait_queue_head_t rep_connect_wait; struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; - struct delayed_work rep_connect_worker; + struct delayed_work rep_disconnect_worker; }; /* Pre-allocate extra Work Requests for handling backward receives @@ -280,6 +280,7 @@ struct rpcrdma_mr { u32 mr_handle; u32 mr_length; u64 mr_offset; + struct work_struct mr_recycle; struct list_head mr_all; }; @@ -411,9 +412,6 @@ struct rpcrdma_buffer { u32 rb_bc_max_requests; - spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */ - struct list_head rb_stale_mrs; - struct delayed_work rb_recovery_worker; struct delayed_work rb_refresh_worker; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -452,7 +450,7 @@ struct rpcrdma_stats { unsigned long hardway_register_count; unsigned long failed_marshal_count; unsigned long bad_reply_count; - unsigned long mrs_recovered; + unsigned long mrs_recycled; unsigned long mrs_orphaned; unsigned long mrs_allocated; unsigned long empty_sendctx_q; @@ -481,7 +479,6 @@ struct rpcrdma_memreg_ops { struct list_head *mrs); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct list_head *); - void (*ro_recover_mr)(struct rpcrdma_mr *mr); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_create_data_internal *); @@ -559,7 +556,6 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, struct rpcrdma_create_data_internal *); void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); -void rpcrdma_conn_func(struct rpcrdma_ep *ep); void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, @@ -578,7 +574,12 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); -void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr); + +static inline void +rpcrdma_mr_recycle(struct rpcrdma_mr *mr) +{ + schedule_work(&mr->mr_recycle); +} struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_req *); @@ -652,7 +653,6 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) extern unsigned int xprt_rdma_max_inline_read; void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); void xprt_rdma_free_addresses(struct rpc_xprt *xprt); -void rpcrdma_connect_worker(struct work_struct *work); void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 6b7539c0466e..ae77c71c1f64 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -47,13 +47,13 @@ #include #include #include +#include +#include #include #include "sunrpc.h" -#define RPC_TCP_READ_CHUNK_SZ (3*512*1024) - static void xs_close(struct rpc_xprt *xprt); static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock); @@ -129,7 +129,7 @@ static struct ctl_table xs_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &xprt_min_resvport_limit, - .extra2 = &xprt_max_resvport + .extra2 = &xprt_max_resvport_limit }, { .procname = "max_resvport", @@ -137,7 +137,7 @@ static struct ctl_table xs_tunables_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &xprt_min_resvport, + .extra1 = &xprt_min_resvport_limit, .extra2 = &xprt_max_resvport_limit }, { @@ -325,6 +325,362 @@ static void xs_free_peer_addresses(struct rpc_xprt *xprt) } } +static size_t +xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp) +{ + size_t i,n; + + if (!(buf->flags & XDRBUF_SPARSE_PAGES)) + return want; + if (want > buf->page_len) + want = buf->page_len; + n = (buf->page_base + want + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < n; i++) { + if (buf->pages[i]) + continue; + buf->bvec[i].bv_page = buf->pages[i] = alloc_page(gfp); + if (!buf->pages[i]) { + buf->page_len = (i * PAGE_SIZE) - buf->page_base; + return buf->page_len; + } + } + return want; +} + +static ssize_t +xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek) +{ + ssize_t ret; + if (seek != 0) + iov_iter_advance(&msg->msg_iter, seek); + ret = sock_recvmsg(sock, msg, flags); + return ret > 0 ? ret + seek : ret; +} + +static ssize_t +xs_read_kvec(struct socket *sock, struct msghdr *msg, int flags, + struct kvec *kvec, size_t count, size_t seek) +{ + iov_iter_kvec(&msg->msg_iter, READ, kvec, 1, count); + return xs_sock_recvmsg(sock, msg, flags, seek); +} + +static ssize_t +xs_read_bvec(struct socket *sock, struct msghdr *msg, int flags, + struct bio_vec *bvec, unsigned long nr, size_t count, + size_t seek) +{ + iov_iter_bvec(&msg->msg_iter, READ, bvec, nr, count); + return xs_sock_recvmsg(sock, msg, flags, seek); +} + +static ssize_t +xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, + size_t count) +{ + struct kvec kvec = { 0 }; + return xs_read_kvec(sock, msg, flags | MSG_TRUNC, &kvec, count, 0); +} + +static ssize_t +xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, + struct xdr_buf *buf, size_t count, size_t seek, size_t *read) +{ + size_t want, seek_init = seek, offset = 0; + ssize_t ret; + + if (seek < buf->head[0].iov_len) { + want = min_t(size_t, count, buf->head[0].iov_len); + ret = xs_read_kvec(sock, msg, flags, &buf->head[0], want, seek); + if (ret <= 0) + goto sock_err; + offset += ret; + if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + goto out; + if (ret != want) + goto eagain; + seek = 0; + } else { + seek -= buf->head[0].iov_len; + offset += buf->head[0].iov_len; + } + if (seek < buf->page_len) { + want = xs_alloc_sparse_pages(buf, + min_t(size_t, count - offset, buf->page_len), + GFP_NOWAIT); + ret = xs_read_bvec(sock, msg, flags, buf->bvec, + xdr_buf_pagecount(buf), + want + buf->page_base, + seek + buf->page_base); + if (ret <= 0) + goto sock_err; + offset += ret - buf->page_base; + if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + goto out; + if (ret != want) + goto eagain; + seek = 0; + } else { + seek -= buf->page_len; + offset += buf->page_len; + } + if (seek < buf->tail[0].iov_len) { + want = min_t(size_t, count - offset, buf->tail[0].iov_len); + ret = xs_read_kvec(sock, msg, flags, &buf->tail[0], want, seek); + if (ret <= 0) + goto sock_err; + offset += ret; + if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + goto out; + if (ret != want) + goto eagain; + } else + offset += buf->tail[0].iov_len; + ret = -EMSGSIZE; + msg->msg_flags |= MSG_TRUNC; +out: + *read = offset - seek_init; + return ret; +eagain: + ret = -EAGAIN; + goto out; +sock_err: + offset += seek; + goto out; +} + +static void +xs_read_header(struct sock_xprt *transport, struct xdr_buf *buf) +{ + if (!transport->recv.copied) { + if (buf->head[0].iov_len >= transport->recv.offset) + memcpy(buf->head[0].iov_base, + &transport->recv.xid, + transport->recv.offset); + transport->recv.copied = transport->recv.offset; + } +} + +static bool +xs_read_stream_request_done(struct sock_xprt *transport) +{ + return transport->recv.fraghdr & cpu_to_be32(RPC_LAST_STREAM_FRAGMENT); +} + +static ssize_t +xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg, + int flags, struct rpc_rqst *req) +{ + struct xdr_buf *buf = &req->rq_private_buf; + size_t want, read; + ssize_t ret; + + xs_read_header(transport, buf); + + want = transport->recv.len - transport->recv.offset; + ret = xs_read_xdr_buf(transport->sock, msg, flags, buf, + transport->recv.copied + want, transport->recv.copied, + &read); + transport->recv.offset += read; + transport->recv.copied += read; + if (transport->recv.offset == transport->recv.len) { + if (xs_read_stream_request_done(transport)) + msg->msg_flags |= MSG_EOR; + return transport->recv.copied; + } + + switch (ret) { + case -EMSGSIZE: + return transport->recv.copied; + case 0: + return -ESHUTDOWN; + default: + if (ret < 0) + return ret; + } + return -EAGAIN; +} + +static size_t +xs_read_stream_headersize(bool isfrag) +{ + if (isfrag) + return sizeof(__be32); + return 3 * sizeof(__be32); +} + +static ssize_t +xs_read_stream_header(struct sock_xprt *transport, struct msghdr *msg, + int flags, size_t want, size_t seek) +{ + struct kvec kvec = { + .iov_base = &transport->recv.fraghdr, + .iov_len = want, + }; + return xs_read_kvec(transport->sock, msg, flags, &kvec, want, seek); +} + +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +static ssize_t +xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) +{ + struct rpc_xprt *xprt = &transport->xprt; + struct rpc_rqst *req; + ssize_t ret; + + /* Look up and lock the request corresponding to the given XID */ + req = xprt_lookup_bc_request(xprt, transport->recv.xid); + if (!req) { + printk(KERN_WARNING "Callback slot table overflowed\n"); + return -ESHUTDOWN; + } + + ret = xs_read_stream_request(transport, msg, flags, req); + if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + xprt_complete_bc_request(req, ret); + + return ret; +} +#else /* CONFIG_SUNRPC_BACKCHANNEL */ +static ssize_t +xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) +{ + return -ESHUTDOWN; +} +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + +static ssize_t +xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) +{ + struct rpc_xprt *xprt = &transport->xprt; + struct rpc_rqst *req; + ssize_t ret = 0; + + /* Look up and lock the request corresponding to the given XID */ + spin_lock(&xprt->queue_lock); + req = xprt_lookup_rqst(xprt, transport->recv.xid); + if (!req) { + msg->msg_flags |= MSG_TRUNC; + goto out; + } + xprt_pin_rqst(req); + spin_unlock(&xprt->queue_lock); + + ret = xs_read_stream_request(transport, msg, flags, req); + + spin_lock(&xprt->queue_lock); + if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + xprt_complete_rqst(req->rq_task, ret); + xprt_unpin_rqst(req); +out: + spin_unlock(&xprt->queue_lock); + return ret; +} + +static ssize_t +xs_read_stream(struct sock_xprt *transport, int flags) +{ + struct msghdr msg = { 0 }; + size_t want, read = 0; + ssize_t ret = 0; + + if (transport->recv.len == 0) { + want = xs_read_stream_headersize(transport->recv.copied != 0); + ret = xs_read_stream_header(transport, &msg, flags, want, + transport->recv.offset); + if (ret <= 0) + goto out_err; + transport->recv.offset = ret; + if (ret != want) { + ret = -EAGAIN; + goto out_err; + } + transport->recv.len = be32_to_cpu(transport->recv.fraghdr) & + RPC_FRAGMENT_SIZE_MASK; + transport->recv.offset -= sizeof(transport->recv.fraghdr); + read = ret; + } + + switch (be32_to_cpu(transport->recv.calldir)) { + case RPC_CALL: + ret = xs_read_stream_call(transport, &msg, flags); + break; + case RPC_REPLY: + ret = xs_read_stream_reply(transport, &msg, flags); + } + if (msg.msg_flags & MSG_TRUNC) { + transport->recv.calldir = cpu_to_be32(-1); + transport->recv.copied = -1; + } + if (ret < 0) + goto out_err; + read += ret; + if (transport->recv.offset < transport->recv.len) { + ret = xs_read_discard(transport->sock, &msg, flags, + transport->recv.len - transport->recv.offset); + if (ret <= 0) + goto out_err; + transport->recv.offset += ret; + read += ret; + if (transport->recv.offset != transport->recv.len) + return -EAGAIN; + } + if (xs_read_stream_request_done(transport)) { + trace_xs_stream_read_request(transport); + transport->recv.copied = 0; + } + transport->recv.offset = 0; + transport->recv.len = 0; + return read; +out_err: + switch (ret) { + case 0: + case -ESHUTDOWN: + xprt_force_disconnect(&transport->xprt); + return -ESHUTDOWN; + } + return ret; +} + +static void xs_stream_data_receive(struct sock_xprt *transport) +{ + size_t read = 0; + ssize_t ret = 0; + + mutex_lock(&transport->recv_mutex); + if (transport->sock == NULL) + goto out; + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); + for (;;) { + ret = xs_read_stream(transport, MSG_DONTWAIT); + if (ret <= 0) + break; + read += ret; + cond_resched(); + } +out: + mutex_unlock(&transport->recv_mutex); + trace_xs_stream_read_data(&transport->xprt, ret, read); +} + +static void xs_stream_data_receive_workfn(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, recv_worker); + xs_stream_data_receive(transport); +} + +static void +xs_stream_reset_connect(struct sock_xprt *transport) +{ + transport->recv.offset = 0; + transport->recv.len = 0; + transport->recv.copied = 0; + transport->xmit.offset = 0; + transport->xprt.stat.connect_count++; + transport->xprt.stat.connect_start = jiffies; +} + #define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, struct kvec *vec, unsigned int base, int more) @@ -440,28 +796,21 @@ out: return err; } -static void xs_nospace_callback(struct rpc_task *task) -{ - struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt); - - transport->inet->sk_write_pending--; -} - /** - * xs_nospace - place task on wait queue if transmit was incomplete - * @task: task to put to sleep + * xs_nospace - handle transmit was incomplete + * @req: pointer to RPC request * */ -static int xs_nospace(struct rpc_task *task) +static int xs_nospace(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct sock *sk = transport->inet; int ret = -EAGAIN; dprintk("RPC: %5u xmit incomplete (%u left of %u)\n", - task->tk_pid, req->rq_slen - req->rq_bytes_sent, + req->rq_task->tk_pid, + req->rq_slen - transport->xmit.offset, req->rq_slen); /* Protect against races with write_space */ @@ -471,7 +820,7 @@ static int xs_nospace(struct rpc_task *task) if (xprt_connected(xprt)) { /* wait for more buffer space */ sk->sk_write_pending++; - xprt_wait_for_buffer_space(task, xs_nospace_callback); + xprt_wait_for_buffer_space(xprt); } else ret = -ENOTCONN; @@ -491,6 +840,22 @@ static int xs_nospace(struct rpc_task *task) return ret; } +static void +xs_stream_prepare_request(struct rpc_rqst *req) +{ + req->rq_task->tk_status = xdr_alloc_bvec(&req->rq_rcv_buf, GFP_NOIO); +} + +/* + * Determine if the previous message in the stream was aborted before it + * could complete transmission. + */ +static bool +xs_send_request_was_aborted(struct sock_xprt *transport, struct rpc_rqst *req) +{ + return transport->xmit.offset != 0 && req->rq_bytes_sent == 0; +} + /* * Construct a stream transport record marker in @buf. */ @@ -503,7 +868,7 @@ static inline void xs_encode_stream_record_marker(struct xdr_buf *buf) /** * xs_local_send_request - write an RPC request to an AF_LOCAL socket - * @task: RPC task that manages the state of an RPC request + * @req: pointer to RPC request * * Return values: * 0: The request has been sent @@ -512,9 +877,8 @@ static inline void xs_encode_stream_record_marker(struct xdr_buf *buf) * ENOTCONN: Caller needs to invoke connect logic then call again * other: Some other error occured, the request was not sent */ -static int xs_local_send_request(struct rpc_task *task) +static int xs_local_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -522,25 +886,34 @@ static int xs_local_send_request(struct rpc_task *task) int status; int sent = 0; + /* Close the stream if the previous transmission was incomplete */ + if (xs_send_request_was_aborted(transport, req)) { + xs_close(xprt); + return -ENOTCONN; + } + xs_encode_stream_record_marker(&req->rq_snd_buf); xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); req->rq_xtime = ktime_get(); - status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent, + status = xs_sendpages(transport->sock, NULL, 0, xdr, + transport->xmit.offset, true, &sent); dprintk("RPC: %s(%u) = %d\n", - __func__, xdr->len - req->rq_bytes_sent, status); + __func__, xdr->len - transport->xmit.offset, status); if (status == -EAGAIN && sock_writeable(transport->inet)) status = -ENOBUFS; if (likely(sent > 0) || status == 0) { - req->rq_bytes_sent += sent; - req->rq_xmit_bytes_sent += sent; + transport->xmit.offset += sent; + req->rq_bytes_sent = transport->xmit.offset; if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_xmit_bytes_sent += transport->xmit.offset; req->rq_bytes_sent = 0; + transport->xmit.offset = 0; return 0; } status = -EAGAIN; @@ -550,7 +923,7 @@ static int xs_local_send_request(struct rpc_task *task) case -ENOBUFS: break; case -EAGAIN: - status = xs_nospace(task); + status = xs_nospace(req); break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", @@ -566,7 +939,7 @@ static int xs_local_send_request(struct rpc_task *task) /** * xs_udp_send_request - write an RPC request to a UDP socket - * @task: address of RPC task that manages the state of an RPC request + * @req: pointer to RPC request * * Return values: * 0: The request has been sent @@ -575,9 +948,8 @@ static int xs_local_send_request(struct rpc_task *task) * ENOTCONN: Caller needs to invoke connect logic then call again * other: Some other error occurred, the request was not sent */ -static int xs_udp_send_request(struct rpc_task *task) +static int xs_udp_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; @@ -590,12 +962,16 @@ static int xs_udp_send_request(struct rpc_task *task) if (!xprt_bound(xprt)) return -ENOTCONN; + + if (!xprt_request_get_cong(xprt, req)) + return -EBADSLT; + req->rq_xtime = ktime_get(); status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen, - xdr, req->rq_bytes_sent, true, &sent); + xdr, 0, true, &sent); dprintk("RPC: xs_udp_send_request(%u) = %d\n", - xdr->len - req->rq_bytes_sent, status); + xdr->len, status); /* firewall is blocking us, don't return -EAGAIN or we end up looping */ if (status == -EPERM) @@ -619,7 +995,7 @@ process_status: /* Should we call xs_close() here? */ break; case -EAGAIN: - status = xs_nospace(task); + status = xs_nospace(req); break; case -ENETUNREACH: case -ENOBUFS: @@ -639,7 +1015,7 @@ process_status: /** * xs_tcp_send_request - write an RPC request to a TCP socket - * @task: address of RPC task that manages the state of an RPC request + * @req: pointer to RPC request * * Return values: * 0: The request has been sent @@ -651,9 +1027,8 @@ process_status: * XXX: In the case of soft timeouts, should we eventually give up * if sendmsg is not able to make progress? */ -static int xs_tcp_send_request(struct rpc_task *task) +static int xs_tcp_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; @@ -662,6 +1037,13 @@ static int xs_tcp_send_request(struct rpc_task *task) int status; int sent; + /* Close the stream if the previous transmission was incomplete */ + if (xs_send_request_was_aborted(transport, req)) { + if (transport->sock != NULL) + kernel_sock_shutdown(transport->sock, SHUT_RDWR); + return -ENOTCONN; + } + xs_encode_stream_record_marker(&req->rq_snd_buf); xs_pktdump("packet data:", @@ -671,7 +1053,7 @@ static int xs_tcp_send_request(struct rpc_task *task) * completes while the socket holds a reference to the pages, * then we may end up resending corrupted data. */ - if (task->tk_flags & RPC_TASK_SENT) + if (req->rq_task->tk_flags & RPC_TASK_SENT) zerocopy = false; if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state)) @@ -684,17 +1066,20 @@ static int xs_tcp_send_request(struct rpc_task *task) while (1) { sent = 0; status = xs_sendpages(transport->sock, NULL, 0, xdr, - req->rq_bytes_sent, zerocopy, &sent); + transport->xmit.offset, + zerocopy, &sent); dprintk("RPC: xs_tcp_send_request(%u) = %d\n", - xdr->len - req->rq_bytes_sent, status); + xdr->len - transport->xmit.offset, status); /* If we've sent the entire packet, immediately * reset the count of bytes sent. */ - req->rq_bytes_sent += sent; - req->rq_xmit_bytes_sent += sent; + transport->xmit.offset += sent; + req->rq_bytes_sent = transport->xmit.offset; if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_xmit_bytes_sent += transport->xmit.offset; req->rq_bytes_sent = 0; + transport->xmit.offset = 0; return 0; } @@ -732,7 +1117,7 @@ static int xs_tcp_send_request(struct rpc_task *task) /* Should we call xs_close() here? */ break; case -EAGAIN: - status = xs_nospace(task); + status = xs_nospace(req); break; case -ECONNRESET: case -ECONNREFUSED: @@ -749,35 +1134,6 @@ static int xs_tcp_send_request(struct rpc_task *task) return status; } -/** - * xs_tcp_release_xprt - clean up after a tcp transmission - * @xprt: transport - * @task: rpc task - * - * This cleans up if an error causes us to abort the transmission of a request. - * In this case, the socket may need to be reset in order to avoid confusing - * the server. - */ -static void xs_tcp_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) -{ - struct rpc_rqst *req; - - if (task != xprt->snd_task) - return; - if (task == NULL) - goto out_release; - req = task->tk_rqstp; - if (req == NULL) - goto out_release; - if (req->rq_bytes_sent == 0) - goto out_release; - if (req->rq_bytes_sent == req->rq_snd_buf.len) - goto out_release; - set_bit(XPRT_CLOSE_WAIT, &xprt->state); -out_release: - xprt_release_xprt(xprt, task); -} - static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk) { transport->old_data_ready = sk->sk_data_ready; @@ -921,114 +1277,6 @@ static void xs_destroy(struct rpc_xprt *xprt) module_put(THIS_MODULE); } -static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) -{ - struct xdr_skb_reader desc = { - .skb = skb, - .offset = sizeof(rpc_fraghdr), - .count = skb->len - sizeof(rpc_fraghdr), - }; - - if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0) - return -1; - if (desc.count) - return -1; - return 0; -} - -/** - * xs_local_data_read_skb - * @xprt: transport - * @sk: socket - * @skb: skbuff - * - * Currently this assumes we can read the whole reply in a single gulp. - */ -static void xs_local_data_read_skb(struct rpc_xprt *xprt, - struct sock *sk, - struct sk_buff *skb) -{ - struct rpc_task *task; - struct rpc_rqst *rovr; - int repsize, copied; - u32 _xid; - __be32 *xp; - - repsize = skb->len - sizeof(rpc_fraghdr); - if (repsize < 4) { - dprintk("RPC: impossible RPC reply size %d\n", repsize); - return; - } - - /* Copy the XID from the skb... */ - xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid); - if (xp == NULL) - return; - - /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->recv_lock); - rovr = xprt_lookup_rqst(xprt, *xp); - if (!rovr) - goto out_unlock; - xprt_pin_rqst(rovr); - spin_unlock(&xprt->recv_lock); - task = rovr->rq_task; - - copied = rovr->rq_private_buf.buflen; - if (copied > repsize) - copied = repsize; - - if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) { - dprintk("RPC: sk_buff copy failed\n"); - spin_lock(&xprt->recv_lock); - goto out_unpin; - } - - spin_lock(&xprt->recv_lock); - xprt_complete_rqst(task, copied); -out_unpin: - xprt_unpin_rqst(rovr); - out_unlock: - spin_unlock(&xprt->recv_lock); -} - -static void xs_local_data_receive(struct sock_xprt *transport) -{ - struct sk_buff *skb; - struct sock *sk; - int err; - -restart: - mutex_lock(&transport->recv_mutex); - sk = transport->inet; - if (sk == NULL) - goto out; - for (;;) { - skb = skb_recv_datagram(sk, 0, 1, &err); - if (skb != NULL) { - xs_local_data_read_skb(&transport->xprt, sk, skb); - skb_free_datagram(sk, skb); - continue; - } - if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) - break; - if (need_resched()) { - mutex_unlock(&transport->recv_mutex); - cond_resched(); - goto restart; - } - } -out: - mutex_unlock(&transport->recv_mutex); -} - -static void xs_local_data_receive_workfn(struct work_struct *work) -{ - struct sock_xprt *transport = - container_of(work, struct sock_xprt, recv_worker); - xs_local_data_receive(transport); -} - /** * xs_udp_data_read_skb - receive callback for UDP sockets * @xprt: transport @@ -1058,13 +1306,13 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, return; /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; xprt_pin_rqst(rovr); xprt_update_rtt(rovr->rq_task); - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); task = rovr->rq_task; if ((copied = rovr->rq_private_buf.buflen) > repsize) @@ -1072,7 +1320,7 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, /* Suck it into the iovec, verify checksum if not done by hw. */ if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); __UDPX_INC_STATS(sk, UDP_MIB_INERRORS); goto out_unpin; } @@ -1081,13 +1329,13 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, spin_lock_bh(&xprt->transport_lock); xprt_adjust_cwnd(xprt, task, copied); spin_unlock_bh(&xprt->transport_lock); - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); xprt_complete_rqst(task, copied); __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS); out_unpin: xprt_unpin_rqst(rovr); out_unlock: - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); } static void xs_udp_data_receive(struct sock_xprt *transport) @@ -1096,25 +1344,18 @@ static void xs_udp_data_receive(struct sock_xprt *transport) struct sock *sk; int err; -restart: mutex_lock(&transport->recv_mutex); sk = transport->inet; if (sk == NULL) goto out; + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); for (;;) { skb = skb_recv_udp(sk, 0, 1, &err); - if (skb != NULL) { - xs_udp_data_read_skb(&transport->xprt, sk, skb); - consume_skb(skb); - continue; - } - if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) + if (skb == NULL) break; - if (need_resched()) { - mutex_unlock(&transport->recv_mutex); - cond_resched(); - goto restart; - } + xs_udp_data_read_skb(&transport->xprt, sk, skb); + consume_skb(skb); + cond_resched(); } out: mutex_unlock(&transport->recv_mutex); @@ -1163,263 +1404,7 @@ static void xs_tcp_force_close(struct rpc_xprt *xprt) xprt_force_disconnect(xprt); } -static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - size_t len, used; - char *p; - - p = ((char *) &transport->tcp_fraghdr) + transport->tcp_offset; - len = sizeof(transport->tcp_fraghdr) - transport->tcp_offset; - used = xdr_skb_read_bits(desc, p, len); - transport->tcp_offset += used; - if (used != len) - return; - - transport->tcp_reclen = ntohl(transport->tcp_fraghdr); - if (transport->tcp_reclen & RPC_LAST_STREAM_FRAGMENT) - transport->tcp_flags |= TCP_RCV_LAST_FRAG; - else - transport->tcp_flags &= ~TCP_RCV_LAST_FRAG; - transport->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK; - - transport->tcp_flags &= ~TCP_RCV_COPY_FRAGHDR; - transport->tcp_offset = 0; - - /* Sanity check of the record length */ - if (unlikely(transport->tcp_reclen < 8)) { - dprintk("RPC: invalid TCP record fragment length\n"); - xs_tcp_force_close(xprt); - return; - } - dprintk("RPC: reading TCP record fragment of length %d\n", - transport->tcp_reclen); -} - -static void xs_tcp_check_fraghdr(struct sock_xprt *transport) -{ - if (transport->tcp_offset == transport->tcp_reclen) { - transport->tcp_flags |= TCP_RCV_COPY_FRAGHDR; - transport->tcp_offset = 0; - if (transport->tcp_flags & TCP_RCV_LAST_FRAG) { - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - transport->tcp_flags |= TCP_RCV_COPY_XID; - transport->tcp_copied = 0; - } - } -} - -static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_reader *desc) -{ - size_t len, used; - char *p; - - len = sizeof(transport->tcp_xid) - transport->tcp_offset; - dprintk("RPC: reading XID (%zu bytes)\n", len); - p = ((char *) &transport->tcp_xid) + transport->tcp_offset; - used = xdr_skb_read_bits(desc, p, len); - transport->tcp_offset += used; - if (used != len) - return; - transport->tcp_flags &= ~TCP_RCV_COPY_XID; - transport->tcp_flags |= TCP_RCV_READ_CALLDIR; - transport->tcp_copied = 4; - dprintk("RPC: reading %s XID %08x\n", - (transport->tcp_flags & TCP_RPC_REPLY) ? "reply for" - : "request with", - ntohl(transport->tcp_xid)); - xs_tcp_check_fraghdr(transport); -} - -static inline void xs_tcp_read_calldir(struct sock_xprt *transport, - struct xdr_skb_reader *desc) -{ - size_t len, used; - u32 offset; - char *p; - - /* - * We want transport->tcp_offset to be 8 at the end of this routine - * (4 bytes for the xid and 4 bytes for the call/reply flag). - * When this function is called for the first time, - * transport->tcp_offset is 4 (after having already read the xid). - */ - offset = transport->tcp_offset - sizeof(transport->tcp_xid); - len = sizeof(transport->tcp_calldir) - offset; - dprintk("RPC: reading CALL/REPLY flag (%zu bytes)\n", len); - p = ((char *) &transport->tcp_calldir) + offset; - used = xdr_skb_read_bits(desc, p, len); - transport->tcp_offset += used; - if (used != len) - return; - transport->tcp_flags &= ~TCP_RCV_READ_CALLDIR; - /* - * We don't yet have the XDR buffer, so we will write the calldir - * out after we get the buffer from the 'struct rpc_rqst' - */ - switch (ntohl(transport->tcp_calldir)) { - case RPC_REPLY: - transport->tcp_flags |= TCP_RCV_COPY_CALLDIR; - transport->tcp_flags |= TCP_RCV_COPY_DATA; - transport->tcp_flags |= TCP_RPC_REPLY; - break; - case RPC_CALL: - transport->tcp_flags |= TCP_RCV_COPY_CALLDIR; - transport->tcp_flags |= TCP_RCV_COPY_DATA; - transport->tcp_flags &= ~TCP_RPC_REPLY; - break; - default: - dprintk("RPC: invalid request message type\n"); - xs_tcp_force_close(&transport->xprt); - } - xs_tcp_check_fraghdr(transport); -} - -static inline void xs_tcp_read_common(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc, - struct rpc_rqst *req) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - struct xdr_buf *rcvbuf; - size_t len; - ssize_t r; - - rcvbuf = &req->rq_private_buf; - - if (transport->tcp_flags & TCP_RCV_COPY_CALLDIR) { - /* - * Save the RPC direction in the XDR buffer - */ - memcpy(rcvbuf->head[0].iov_base + transport->tcp_copied, - &transport->tcp_calldir, - sizeof(transport->tcp_calldir)); - transport->tcp_copied += sizeof(transport->tcp_calldir); - transport->tcp_flags &= ~TCP_RCV_COPY_CALLDIR; - } - - len = desc->count; - if (len > transport->tcp_reclen - transport->tcp_offset) - desc->count = transport->tcp_reclen - transport->tcp_offset; - r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, - desc, xdr_skb_read_bits); - - if (desc->count) { - /* Error when copying to the receive buffer, - * usually because we weren't able to allocate - * additional buffer pages. All we can do now - * is turn off TCP_RCV_COPY_DATA, so the request - * will not receive any additional updates, - * and time out. - * Any remaining data from this record will - * be discarded. - */ - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - dprintk("RPC: XID %08x truncated request\n", - ntohl(transport->tcp_xid)); - dprintk("RPC: xprt = %p, tcp_copied = %lu, " - "tcp_offset = %u, tcp_reclen = %u\n", - xprt, transport->tcp_copied, - transport->tcp_offset, transport->tcp_reclen); - return; - } - - transport->tcp_copied += r; - transport->tcp_offset += r; - desc->count = len - r; - - dprintk("RPC: XID %08x read %zd bytes\n", - ntohl(transport->tcp_xid), r); - dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, " - "tcp_reclen = %u\n", xprt, transport->tcp_copied, - transport->tcp_offset, transport->tcp_reclen); - - if (transport->tcp_copied == req->rq_private_buf.buflen) - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - else if (transport->tcp_offset == transport->tcp_reclen) { - if (transport->tcp_flags & TCP_RCV_LAST_FRAG) - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - } -} - -/* - * Finds the request corresponding to the RPC xid and invokes the common - * tcp read code to read the data. - */ -static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - struct rpc_rqst *req; - - dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid)); - - /* Find and lock the request corresponding to this xid */ - spin_lock(&xprt->recv_lock); - req = xprt_lookup_rqst(xprt, transport->tcp_xid); - if (!req) { - dprintk("RPC: XID %08x request not found!\n", - ntohl(transport->tcp_xid)); - spin_unlock(&xprt->recv_lock); - return -1; - } - xprt_pin_rqst(req); - spin_unlock(&xprt->recv_lock); - - xs_tcp_read_common(xprt, desc, req); - - spin_lock(&xprt->recv_lock); - if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) - xprt_complete_rqst(req->rq_task, transport->tcp_copied); - xprt_unpin_rqst(req); - spin_unlock(&xprt->recv_lock); - return 0; -} - #if defined(CONFIG_SUNRPC_BACKCHANNEL) -/* - * Obtains an rpc_rqst previously allocated and invokes the common - * tcp read code to read the data. The result is placed in the callback - * queue. - * If we're unable to obtain the rpc_rqst we schedule the closing of the - * connection and return -1. - */ -static int xs_tcp_read_callback(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - struct rpc_rqst *req; - - /* Look up the request corresponding to the given XID */ - req = xprt_lookup_bc_request(xprt, transport->tcp_xid); - if (req == NULL) { - printk(KERN_WARNING "Callback slot table overflowed\n"); - xprt_force_disconnect(xprt); - return -1; - } - - dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid)); - xs_tcp_read_common(xprt, desc, req); - - if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) - xprt_complete_bc_request(req, transport->tcp_copied); - - return 0; -} - -static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - - return (transport->tcp_flags & TCP_RPC_REPLY) ? - xs_tcp_read_reply(xprt, desc) : - xs_tcp_read_callback(xprt, desc); -} - static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net) { int ret; @@ -1435,145 +1420,8 @@ static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt) { return PAGE_SIZE; } -#else -static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - return xs_tcp_read_reply(xprt, desc); -} #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -/* - * Read data off the transport. This can be either an RPC_CALL or an - * RPC_REPLY. Relay the processing to helper functions. - */ -static void xs_tcp_read_data(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - - if (_xs_tcp_read_data(xprt, desc) == 0) - xs_tcp_check_fraghdr(transport); - else { - /* - * The transport_lock protects the request handling. - * There's no need to hold it to update the tcp_flags. - */ - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - } -} - -static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_skb_reader *desc) -{ - size_t len; - - len = transport->tcp_reclen - transport->tcp_offset; - if (len > desc->count) - len = desc->count; - desc->count -= len; - desc->offset += len; - transport->tcp_offset += len; - dprintk("RPC: discarded %zu bytes\n", len); - xs_tcp_check_fraghdr(transport); -} - -static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) -{ - struct rpc_xprt *xprt = rd_desc->arg.data; - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - struct xdr_skb_reader desc = { - .skb = skb, - .offset = offset, - .count = len, - }; - size_t ret; - - dprintk("RPC: xs_tcp_data_recv started\n"); - do { - trace_xs_tcp_data_recv(transport); - /* Read in a new fragment marker if necessary */ - /* Can we ever really expect to get completely empty fragments? */ - if (transport->tcp_flags & TCP_RCV_COPY_FRAGHDR) { - xs_tcp_read_fraghdr(xprt, &desc); - continue; - } - /* Read in the xid if necessary */ - if (transport->tcp_flags & TCP_RCV_COPY_XID) { - xs_tcp_read_xid(transport, &desc); - continue; - } - /* Read in the call/reply flag */ - if (transport->tcp_flags & TCP_RCV_READ_CALLDIR) { - xs_tcp_read_calldir(transport, &desc); - continue; - } - /* Read in the request data */ - if (transport->tcp_flags & TCP_RCV_COPY_DATA) { - xs_tcp_read_data(xprt, &desc); - continue; - } - /* Skip over any trailing bytes on short reads */ - xs_tcp_read_discard(transport, &desc); - } while (desc.count); - ret = len - desc.count; - if (ret < rd_desc->count) - rd_desc->count -= ret; - else - rd_desc->count = 0; - trace_xs_tcp_data_recv(transport); - dprintk("RPC: xs_tcp_data_recv done\n"); - return ret; -} - -static void xs_tcp_data_receive(struct sock_xprt *transport) -{ - struct rpc_xprt *xprt = &transport->xprt; - struct sock *sk; - read_descriptor_t rd_desc = { - .arg.data = xprt, - }; - unsigned long total = 0; - int read = 0; - -restart: - mutex_lock(&transport->recv_mutex); - sk = transport->inet; - if (sk == NULL) - goto out; - - /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ - for (;;) { - rd_desc.count = RPC_TCP_READ_CHUNK_SZ; - lock_sock(sk); - read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); - if (rd_desc.count != 0 || read < 0) { - clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); - release_sock(sk); - break; - } - release_sock(sk); - total += read; - if (need_resched()) { - mutex_unlock(&transport->recv_mutex); - cond_resched(); - goto restart; - } - } - if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) - queue_work(xprtiod_workqueue, &transport->recv_worker); -out: - mutex_unlock(&transport->recv_mutex); - trace_xs_tcp_data_ready(xprt, read, total); -} - -static void xs_tcp_data_receive_workfn(struct work_struct *work) -{ - struct sock_xprt *transport = - container_of(work, struct sock_xprt, recv_worker); - xs_tcp_data_receive(transport); -} - /** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed @@ -1600,17 +1448,13 @@ static void xs_tcp_state_change(struct sock *sk) case TCP_ESTABLISHED: spin_lock(&xprt->transport_lock); if (!xprt_test_and_set_connected(xprt)) { - - /* Reset TCP record info */ - transport->tcp_offset = 0; - transport->tcp_reclen = 0; - transport->tcp_copied = 0; - transport->tcp_flags = - TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID; xprt->connect_cookie++; clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); xprt_clear_connecting(xprt); + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; xprt_wake_pending_tasks(xprt, -EAGAIN); } spin_unlock(&xprt->transport_lock); @@ -1675,7 +1519,8 @@ static void xs_write_space(struct sock *sk) if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0) goto out; - xprt_write_space(xprt); + if (xprt_write_space(xprt)) + sk->sk_write_pending--; out: rcu_read_unlock(); } @@ -1773,11 +1618,17 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); } -static unsigned short xs_get_random_port(void) +static int xs_get_random_port(void) { - unsigned short range = xprt_max_resvport - xprt_min_resvport + 1; - unsigned short rand = (unsigned short) prandom_u32() % range; - return rand + xprt_min_resvport; + unsigned short min = xprt_min_resvport, max = xprt_max_resvport; + unsigned short range; + unsigned short rand; + + if (max < min) + return -EADDRINUSE; + range = max - min + 1; + rand = (unsigned short) prandom_u32() % range; + return rand + min; } /** @@ -1833,9 +1684,9 @@ static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock) transport->srcport = xs_sock_getport(sock); } -static unsigned short xs_get_srcport(struct sock_xprt *transport) +static int xs_get_srcport(struct sock_xprt *transport) { - unsigned short port = transport->srcport; + int port = transport->srcport; if (port == 0 && transport->xprt.resvport) port = xs_get_random_port(); @@ -1856,7 +1707,7 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) { struct sockaddr_storage myaddr; int err, nloop = 0; - unsigned short port = xs_get_srcport(transport); + int port = xs_get_srcport(transport); unsigned short last; /* @@ -1874,8 +1725,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) * transport->xprt.resvport == 1) xs_get_srcport above will * ensure that port is non-zero and we will bind as needed. */ - if (port == 0) - return 0; + if (port <= 0) + return port; memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); do { @@ -2028,9 +1879,8 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, write_unlock_bh(&sk->sk_callback_lock); } - /* Tell the socket layer to start connecting... */ - xprt->stat.connect_count++; - xprt->stat.connect_start = jiffies; + xs_stream_reset_connect(transport); + return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); } @@ -2062,6 +1912,9 @@ static int xs_local_setup_socket(struct sock_xprt *transport) case 0: dprintk("RPC: xprt %p connected to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; xprt_set_connected(xprt); case -ENOBUFS: break; @@ -2386,9 +2239,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) xs_set_memalloc(xprt); + /* Reset TCP record info */ + xs_stream_reset_connect(transport); + /* Tell the socket layer to start connecting... */ - xprt->stat.connect_count++; - xprt->stat.connect_start = jiffies; set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); switch (ret) { @@ -2561,7 +2415,7 @@ static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) "%llu %llu %lu %llu %llu\n", xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, @@ -2616,7 +2470,7 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) transport->srcport, xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, @@ -2704,9 +2558,8 @@ static int bc_sendto(struct rpc_rqst *req) /* * The send routine. Borrows from svc_send */ -static int bc_send_request(struct rpc_task *task) +static int bc_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct svc_xprt *xprt; int len; @@ -2720,12 +2573,7 @@ static int bc_send_request(struct rpc_task *task) * Grab the mutex to serialize data as the connection is shared * with the fore channel */ - if (!mutex_trylock(&xprt->xpt_mutex)) { - rpc_sleep_on(&xprt->xpt_bc_pending, task, NULL); - if (!mutex_trylock(&xprt->xpt_mutex)) - return -EAGAIN; - rpc_wake_up_queued_task(&xprt->xpt_bc_pending, task); - } + mutex_lock(&xprt->xpt_mutex); if (test_bit(XPT_DEAD, &xprt->xpt_flags)) len = -ENOTCONN; else @@ -2761,7 +2609,7 @@ static void bc_destroy(struct rpc_xprt *xprt) static const struct rpc_xprt_ops xs_local_ops = { .reserve_xprt = xprt_reserve_xprt, - .release_xprt = xs_tcp_release_xprt, + .release_xprt = xprt_release_xprt, .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .rpcbind = xs_local_rpcbind, @@ -2769,6 +2617,7 @@ static const struct rpc_xprt_ops xs_local_ops = { .connect = xs_local_connect, .buf_alloc = rpc_malloc, .buf_free = rpc_free, + .prepare_request = xs_stream_prepare_request, .send_request = xs_local_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_close, @@ -2803,14 +2652,15 @@ static const struct rpc_xprt_ops xs_udp_ops = { static const struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, - .release_xprt = xs_tcp_release_xprt, - .alloc_slot = xprt_lock_and_alloc_slot, + .release_xprt = xprt_release_xprt, + .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, .buf_alloc = rpc_malloc, .buf_free = rpc_free, + .prepare_request = xs_stream_prepare_request, .send_request = xs_tcp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_tcp_shutdown, @@ -2952,9 +2802,8 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) xprt->ops = &xs_local_ops; xprt->timeout = &xs_local_default_timeout; - INIT_WORK(&transport->recv_worker, xs_local_data_receive_workfn); - INIT_DELAYED_WORK(&transport->connect_worker, - xs_dummy_setup_socket); + INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); + INIT_DELAYED_WORK(&transport->connect_worker, xs_dummy_setup_socket); switch (sun->sun_family) { case AF_LOCAL: @@ -3106,7 +2955,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) xprt->connect_timeout = xprt->timeout->to_initval * (xprt->timeout->to_retries + 1); - INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn); + INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket); switch (addr->sa_family) { @@ -3317,12 +3166,8 @@ static int param_set_uint_minmax(const char *val, static int param_set_portnr(const char *val, const struct kernel_param *kp) { - if (kp->arg == &xprt_min_resvport) - return param_set_uint_minmax(val, kp, - RPC_MIN_RESVPORT, - xprt_max_resvport); return param_set_uint_minmax(val, kp, - xprt_min_resvport, + RPC_MIN_RESVPORT, RPC_MAX_RESVPORT); } diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 418f03d0be90..e65c3a8551e4 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -577,7 +577,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, rcu_dereference_rtnl(orig_dev->tipc_ptr); if (likely(b && test_bit(0, &b->up) && (skb->pkt_type <= PACKET_MULTICAST))) { - skb->next = NULL; + skb_mark_not_on_list(skb); tipc_rcv(dev_net(b->pt.dev), skb, b); rcu_read_unlock(); return NET_RX_SUCCESS; @@ -609,16 +609,18 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, switch (evt) { case NETDEV_CHANGE: - if (netif_carrier_ok(dev)) + if (netif_carrier_ok(dev) && netif_oper_up(dev)) { + test_and_set_bit_lock(0, &b->up); break; - /* else: fall through */ - case NETDEV_UP: - test_and_set_bit_lock(0, &b->up); - break; + } + /* fall through */ case NETDEV_GOING_DOWN: clear_bit_unlock(0, &b->up); tipc_reset_bearer(net, b); break; + case NETDEV_UP: + test_and_set_bit_lock(0, &b->up); + break; case NETDEV_CHANGEMTU: if (tipc_mtu_bad(dev, 0)) { bearer_disable(net, b); diff --git a/net/tipc/group.c b/net/tipc/group.c index e82f13cb2dc5..06fee142f09f 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -666,6 +666,7 @@ static void tipc_group_create_event(struct tipc_group *grp, struct sk_buff *skb; struct tipc_msg *hdr; + memset(&evt, 0, sizeof(evt)); evt.event = event; evt.found_lower = m->instance; evt.found_upper = m->instance; diff --git a/net/tipc/link.c b/net/tipc/link.c index b1f0bee54eac..201c3b5bc96b 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -410,6 +410,11 @@ char *tipc_link_name(struct tipc_link *l) return l->name; } +u32 tipc_link_state(struct tipc_link *l) +{ + return l->state; +} + /** * tipc_link_create - create a new link * @n: pointer to associated node @@ -472,6 +477,8 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, l->in_session = false; l->bearer_id = bearer_id; l->tolerance = tolerance; + if (bc_rcvlink) + bc_rcvlink->tolerance = tolerance; l->net_plane = net_plane; l->advertised_mtu = mtu; l->mtu = mtu; @@ -838,12 +845,24 @@ static void link_prepare_wakeup(struct tipc_link *l) void tipc_link_reset(struct tipc_link *l) { + struct sk_buff_head list; + + __skb_queue_head_init(&list); + l->in_session = false; l->session++; l->mtu = l->advertised_mtu; + + spin_lock_bh(&l->wakeupq.lock); + skb_queue_splice_init(&l->wakeupq, &list); + spin_unlock_bh(&l->wakeupq.lock); + + spin_lock_bh(&l->inputq->lock); + skb_queue_splice_init(&list, l->inputq); + spin_unlock_bh(&l->inputq->lock); + __skb_queue_purge(&l->transmq); __skb_queue_purge(&l->deferdq); - skb_queue_splice_init(&l->wakeupq, l->inputq); __skb_queue_purge(&l->backlogq); l->backlog[TIPC_LOW_IMPORTANCE].len = 0; l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0; @@ -1021,7 +1040,8 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, /* Detect repeated retransmit failures on same packet */ if (r->last_retransm != buf_seqno(skb)) { r->last_retransm = buf_seqno(skb); - r->stale_limit = jiffies + msecs_to_jiffies(l->tolerance); + r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance); + r->stale_cnt = 0; } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) { link_retransmit_failure(l, skb); if (link_is_bc_sndlink(l)) @@ -1380,6 +1400,36 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, __skb_queue_tail(xmitq, skb); } +void tipc_link_create_dummy_tnl_msg(struct tipc_link *l, + struct sk_buff_head *xmitq) +{ + u32 onode = tipc_own_addr(l->net); + struct tipc_msg *hdr, *ihdr; + struct sk_buff_head tnlq; + struct sk_buff *skb; + u32 dnode = l->addr; + + skb_queue_head_init(&tnlq); + skb = tipc_msg_create(TUNNEL_PROTOCOL, FAILOVER_MSG, + INT_H_SIZE, BASIC_H_SIZE, + dnode, onode, 0, 0, 0); + if (!skb) { + pr_warn("%sunable to create tunnel packet\n", link_co_err); + return; + } + + hdr = buf_msg(skb); + msg_set_msgcnt(hdr, 1); + msg_set_bearer_id(hdr, l->peer_bearer_id); + + ihdr = (struct tipc_msg *)msg_data(hdr); + tipc_msg_init(onode, ihdr, TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, + BASIC_H_SIZE, dnode); + msg_set_errcode(ihdr, TIPC_ERR_NO_PORT); + __skb_queue_tail(&tnlq, skb); + tipc_link_xmit(l, &tnlq, xmitq); +} + /* tipc_link_tnl_prepare(): prepare and return a list of tunnel packets * with contents of the link's transmit and backlog queues. */ @@ -1476,6 +1526,9 @@ bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr) return false; if (session != curr_session) return false; + /* Extra sanity check */ + if (!link_is_up(l) && msg_ack(hdr)) + return false; if (!(l->peer_caps & TIPC_LINK_PROTO_SEQNO)) return true; /* Accept only STATE with new sequence number */ @@ -1533,9 +1586,10 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, strncpy(if_name, data, TIPC_MAX_IF_NAME); /* Update own tolerance if peer indicates a non-zero value */ - if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) + if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) { l->tolerance = peers_tol; - + l->bc_rcvlink->tolerance = peers_tol; + } /* Update own priority if peer's priority is higher */ if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI)) l->priority = peers_prio; @@ -1561,9 +1615,10 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, l->rcv_nxt_state = msg_seqno(hdr) + 1; /* Update own tolerance if peer indicates a non-zero value */ - if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) + if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) { l->tolerance = peers_tol; - + l->bc_rcvlink->tolerance = peers_tol; + } /* Update own prio if peer indicates a different value */ if ((peers_prio != l->priority) && in_range(peers_prio, 1, TIPC_MAX_LINK_PRI)) { @@ -2180,6 +2235,8 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, struct sk_buff_head *xmitq) { l->tolerance = tol; + if (l->bc_rcvlink) + l->bc_rcvlink->tolerance = tol; if (link_is_up(l)) tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, tol, 0, xmitq); } diff --git a/net/tipc/link.h b/net/tipc/link.h index 7bc494a33fdf..90488c538a4e 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -88,6 +88,8 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, struct tipc_link **link); void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq); +void tipc_link_create_dummy_tnl_msg(struct tipc_link *tnl, + struct sk_buff_head *xmitq); void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq); int tipc_link_fsm_evt(struct tipc_link *l, int evt); bool tipc_link_is_up(struct tipc_link *l); @@ -107,6 +109,7 @@ u16 tipc_link_rcv_nxt(struct tipc_link *l); u16 tipc_link_acked(struct tipc_link *l); u32 tipc_link_id(struct tipc_link *l); char *tipc_link_name(struct tipc_link *l); +u32 tipc_link_state(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); int tipc_link_window(struct tipc_link *l); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index b61891054709..f48e5857210f 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -499,54 +499,56 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, /** * tipc_msg_reverse(): swap source and destination addresses and add error code * @own_node: originating node id for reversed message - * @skb: buffer containing message to be reversed; may be replaced. + * @skb: buffer containing message to be reversed; will be consumed * @err: error code to be set in message, if any - * Consumes buffer at failure + * Replaces consumed buffer with new one when successful * Returns true if success, otherwise false */ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) { struct sk_buff *_skb = *skb; - struct tipc_msg *hdr; - struct tipc_msg ohdr; - int dlen; + struct tipc_msg *_hdr, *hdr; + int hlen, dlen; if (skb_linearize(_skb)) goto exit; - hdr = buf_msg(_skb); - dlen = min_t(uint, msg_data_sz(hdr), MAX_FORWARD_SIZE); - if (msg_dest_droppable(hdr)) + _hdr = buf_msg(_skb); + dlen = min_t(uint, msg_data_sz(_hdr), MAX_FORWARD_SIZE); + hlen = msg_hdr_sz(_hdr); + + if (msg_dest_droppable(_hdr)) goto exit; - if (msg_errcode(hdr)) + if (msg_errcode(_hdr)) goto exit; - /* Take a copy of original header before altering message */ - memcpy(&ohdr, hdr, msg_hdr_sz(hdr)); - - /* Never return SHORT header; expand by replacing buffer if necessary */ - if (msg_short(hdr)) { - *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen, GFP_ATOMIC); - if (!*skb) - goto exit; - memcpy((*skb)->data + BASIC_H_SIZE, msg_data(hdr), dlen); - kfree_skb(_skb); - _skb = *skb; - hdr = buf_msg(_skb); - memcpy(hdr, &ohdr, BASIC_H_SIZE); - msg_set_hdr_sz(hdr, BASIC_H_SIZE); - } + /* Never return SHORT header */ + if (hlen == SHORT_H_SIZE) + hlen = BASIC_H_SIZE; + + /* Don't return data along with SYN+, - sender has a clone */ + if (msg_is_syn(_hdr) && err == TIPC_ERR_OVERLOAD) + dlen = 0; + + /* Allocate new buffer to return */ + *skb = tipc_buf_acquire(hlen + dlen, GFP_ATOMIC); + if (!*skb) + goto exit; + memcpy((*skb)->data, _skb->data, msg_hdr_sz(_hdr)); + memcpy((*skb)->data + hlen, msg_data(_hdr), dlen); - /* Now reverse the concerned fields */ + /* Build reverse header in new buffer */ + hdr = buf_msg(*skb); + msg_set_hdr_sz(hdr, hlen); msg_set_errcode(hdr, err); msg_set_non_seq(hdr, 0); - msg_set_origport(hdr, msg_destport(&ohdr)); - msg_set_destport(hdr, msg_origport(&ohdr)); - msg_set_destnode(hdr, msg_prevnode(&ohdr)); + msg_set_origport(hdr, msg_destport(_hdr)); + msg_set_destport(hdr, msg_origport(_hdr)); + msg_set_destnode(hdr, msg_prevnode(_hdr)); msg_set_prevnode(hdr, own_node); msg_set_orignode(hdr, own_node); - msg_set_size(hdr, msg_hdr_sz(hdr) + dlen); - skb_trim(_skb, msg_size(hdr)); + msg_set_size(hdr, hlen + dlen); skb_orphan(_skb); + kfree_skb(_skb); return true; exit: kfree_skb(_skb); @@ -554,6 +556,22 @@ exit: return false; } +bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy) +{ + struct sk_buff *skb, *_skb; + + skb_queue_walk(msg, skb) { + _skb = skb_clone(skb, GFP_ATOMIC); + if (!_skb) { + __skb_queue_purge(cpy); + pr_err_ratelimited("Failed to clone buffer chain\n"); + return false; + } + __skb_queue_tail(cpy, _skb); + } + return true; +} + /** * tipc_msg_lookup_dest(): try to find new destination for named message * @skb: the buffer containing the message. diff --git a/net/tipc/msg.h b/net/tipc/msg.h index a4e944d59394..a2879e6ec5b6 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -216,6 +216,16 @@ static inline void msg_set_non_seq(struct tipc_msg *m, u32 n) msg_set_bits(m, 0, 20, 1, n); } +static inline int msg_is_syn(struct tipc_msg *m) +{ + return msg_bits(m, 0, 17, 1); +} + +static inline void msg_set_syn(struct tipc_msg *m, u32 d) +{ + msg_set_bits(m, 0, 17, 1, d); +} + static inline int msg_dest_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 19, 1); @@ -970,6 +980,7 @@ bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, struct sk_buff_head *cpy); void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb); +bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy); static inline u16 buf_seqno(struct sk_buff *skb) { diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 51b4b96f89db..61219f0b9677 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -94,8 +94,9 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ) list_add_tail_rcu(&publ->binding_node, &nt->node_scope); return NULL; } - list_add_tail_rcu(&publ->binding_node, &nt->cluster_scope); - + write_lock_bh(&nt->cluster_scope_lock); + list_add_tail(&publ->binding_node, &nt->cluster_scope); + write_unlock_bh(&nt->cluster_scope_lock); skb = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0); if (!skb) { pr_warn("Publication distribution failure\n"); @@ -112,11 +113,13 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ) */ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) { + struct name_table *nt = tipc_name_table(net); struct sk_buff *buf; struct distr_item *item; + write_lock_bh(&nt->cluster_scope_lock); list_del(&publ->binding_node); - + write_unlock_bh(&nt->cluster_scope_lock); if (publ->scope == TIPC_NODE_SCOPE) return NULL; @@ -189,11 +192,10 @@ void tipc_named_node_up(struct net *net, u32 dnode) __skb_queue_head_init(&head); - rcu_read_lock(); + read_lock_bh(&nt->cluster_scope_lock); named_distribute(net, &head, dnode, &nt->cluster_scope); - rcu_read_unlock(); - tipc_node_xmit(net, &head, dnode, 0); + read_unlock_bh(&nt->cluster_scope_lock); } /** diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 66d5b2c5987a..bff241f03525 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -744,6 +744,7 @@ int tipc_nametbl_init(struct net *net) INIT_LIST_HEAD(&nt->node_scope); INIT_LIST_HEAD(&nt->cluster_scope); + rwlock_init(&nt->cluster_scope_lock); tn->nametbl = nt; spin_lock_init(&tn->nametbl_lock); return 0; diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 892bd750b85f..f79066334cc8 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -100,6 +100,7 @@ struct name_table { struct hlist_head services[TIPC_NAMETBL_SIZE]; struct list_head node_scope; struct list_head cluster_scope; + rwlock_t cluster_scope_lock; u32 local_publ_count; }; diff --git a/net/tipc/node.c b/net/tipc/node.c index 68014f1b6976..2afc4f8c37a7 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -111,6 +111,7 @@ struct tipc_node { int action_flags; struct list_head list; int state; + bool failover_sent; u16 sync_point; int link_cnt; u16 working_links; @@ -680,6 +681,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, *slot0 = bearer_id; *slot1 = bearer_id; tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); + n->failover_sent = false; n->action_flags |= TIPC_NOTIFY_NODE_UP; tipc_link_set_active(nl, true); tipc_bcast_add_peer(n->net, nl, xmitq); @@ -911,6 +913,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, bool reset = true; char *if_name; unsigned long intv; + u16 session; *dupl_addr = false; *respond = false; @@ -997,9 +1000,10 @@ void tipc_node_check_dest(struct net *net, u32 addr, goto exit; if_name = strchr(b->name, ':') + 1; + get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, - b->window, mod(tipc_net(net)->random), + b->window, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, @@ -1615,6 +1619,14 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), tipc_link_inputq(l)); } + /* If parallel link was already down, and this happened before + * the tunnel link came up, FAILOVER was never sent. Ensure that + * FAILOVER is sent to get peer out of NODE_FAILINGOVER state. + */ + if (n->state != NODE_FAILINGOVER && !n->failover_sent) { + tipc_link_create_dummy_tnl_msg(l, xmitq); + n->failover_sent = true; + } /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; diff --git a/net/tipc/node.h b/net/tipc/node.h index 48b3298a248d..03f5efb62cfb 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -45,6 +45,7 @@ /* Optional capabilities supported by this code version */ enum { + TIPC_SYN_BIT = (1), TIPC_BCAST_SYNCH = (1 << 1), TIPC_BCAST_STATE_NACK = (1 << 2), TIPC_BLOCK_FLOWCTL = (1 << 3), @@ -53,11 +54,12 @@ enum { TIPC_LINK_PROTO_SEQNO = (1 << 6) }; -#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ - TIPC_BCAST_STATE_NACK | \ - TIPC_BCAST_RCAST | \ - TIPC_BLOCK_FLOWCTL | \ - TIPC_NODE_ID128 | \ +#define TIPC_NODE_CAPABILITIES (TIPC_SYN_BIT | \ + TIPC_BCAST_SYNCH | \ + TIPC_BCAST_STATE_NACK | \ + TIPC_BCAST_RCAST | \ + TIPC_BLOCK_FLOWCTL | \ + TIPC_NODE_ID128 | \ TIPC_LINK_PROTO_SEQNO) #define INVALID_BEARER_ID -1 diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3f03ddd0e35b..636e6131769d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -47,7 +47,7 @@ #include "netlink.h" #include "group.h" -#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ +#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ #define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */ #define TIPC_FWD_MSG 1 #define TIPC_MAX_PORT 0xffffffff @@ -80,7 +80,6 @@ struct sockaddr_pair { * @publications: list of publications for port * @blocking_link: address of the congested link we are currently sleeping on * @pub_count: total # of publications port has made during its lifetime - * @probing_state: * @conn_timeout: the time we can wait for an unresponded setup request * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue * @cong_link_cnt: number of congested links @@ -102,8 +101,8 @@ struct tipc_sock { struct list_head cong_links; struct list_head publications; u32 pub_count; - uint conn_timeout; atomic_t dupl_rcvcnt; + u16 conn_timeout; bool probe_unacked; u16 cong_link_cnt; u16 snt_unacked; @@ -507,6 +506,9 @@ static void __tipc_shutdown(struct socket *sock, int error) tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))); + /* Remove any pending SYN message */ + __skb_queue_purge(&sk->sk_write_queue); + /* Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer). */ @@ -715,7 +717,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, struct tipc_sock *tsk = tipc_sk(sk); __poll_t revents = 0; - sock_poll_wait(file, wait); + sock_poll_wait(file, sock, wait); if (sk->sk_shutdown & RCV_SHUTDOWN) revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; @@ -1196,6 +1198,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, * @skb: pointer to message buffer. */ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, + struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb); @@ -1213,7 +1216,16 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk), tsk_peer_port(tsk)); sk->sk_state_change(sk); - goto exit; + + /* State change is ignored if socket already awake, + * - convert msg to abort msg and add to inqueue + */ + msg_set_user(hdr, TIPC_CRITICAL_IMPORTANCE); + msg_set_type(hdr, TIPC_CONN_MSG); + msg_set_size(hdr, BASIC_H_SIZE); + msg_set_hdr_sz(hdr, BASIC_H_SIZE); + __skb_queue_tail(inputq, skb); + return; } tsk->probe_unacked = false; @@ -1319,6 +1331,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) tsk->conn_type = dest->addr.name.name.type; tsk->conn_instance = dest->addr.name.name.instance; } + msg_set_syn(hdr, 1); } seq = &dest->addr.nameseq; @@ -1361,6 +1374,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; + if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue))) + return -ENOMEM; rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { @@ -1419,8 +1434,10 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) /* Handle implicit connection setup */ if (unlikely(dest)) { rc = __tipc_sendmsg(sock, m, dlen); - if (dlen && (dlen == rc)) + if (dlen && dlen == rc) { + tsk->peer_caps = tipc_node_get_capabilities(net, dnode); tsk->snt_unacked = tsk_inc(tsk, dlen + msg_hdr_sz(hdr)); + } return rc; } @@ -1478,6 +1495,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, struct net *net = sock_net(sk); struct tipc_msg *msg = &tsk->phdr; + msg_set_syn(msg, 0); msg_set_destnode(msg, peer_node); msg_set_destport(msg, peer_port); msg_set_type(msg, TIPC_CONN_MSG); @@ -1489,6 +1507,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); tsk->peer_caps = tipc_node_get_capabilities(net, peer_node); + __skb_queue_purge(&sk->sk_write_queue); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) return; @@ -1934,7 +1953,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, switch (msg_user(hdr)) { case CONN_MANAGER: - tipc_sk_conn_proto_rcv(tsk, skb, xmitq); + tipc_sk_conn_proto_rcv(tsk, skb, inputq, xmitq); return; case SOCK_WAKEUP: tipc_dest_del(&tsk->cong_links, msg_orignode(hdr), 0); @@ -1959,91 +1978,90 @@ static void tipc_sk_proto_rcv(struct sock *sk, } /** - * tipc_filter_connect - Handle incoming message for a connection-based socket + * tipc_sk_filter_connect - check incoming message for a connection-based socket * @tsk: TIPC socket - * @skb: pointer to message buffer. Set to NULL if buffer is consumed - * - * Returns true if everything ok, false otherwise + * @skb: pointer to message buffer. + * Returns true if message should be added to receive queue, false otherwise */ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); struct tipc_msg *hdr = buf_msg(skb); - u32 pport = msg_origport(hdr); - u32 pnode = msg_orignode(hdr); + bool con_msg = msg_connected(hdr); + u32 pport = tsk_peer_port(tsk); + u32 pnode = tsk_peer_node(tsk); + u32 oport = msg_origport(hdr); + u32 onode = msg_orignode(hdr); + int err = msg_errcode(hdr); + unsigned long delay; if (unlikely(msg_mcast(hdr))) return false; switch (sk->sk_state) { case TIPC_CONNECTING: - /* Accept only ACK or NACK message */ - if (unlikely(!msg_connected(hdr))) { - if (pport != tsk_peer_port(tsk) || - pnode != tsk_peer_node(tsk)) - return false; - - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - sk->sk_err = ECONNREFUSED; - sk->sk_state_change(sk); - return true; - } - - if (unlikely(msg_errcode(hdr))) { - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - sk->sk_err = ECONNREFUSED; - sk->sk_state_change(sk); - return true; - } - - if (unlikely(!msg_isdata(hdr))) { - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - sk->sk_err = EINVAL; - sk->sk_state_change(sk); - return true; + /* Setup ACK */ + if (likely(con_msg)) { + if (err) + break; + tipc_sk_finish_conn(tsk, oport, onode); + msg_set_importance(&tsk->phdr, msg_importance(hdr)); + /* ACK+ message with data is added to receive queue */ + if (msg_data_sz(hdr)) + return true; + /* Empty ACK-, - wake up sleeping connect() and drop */ + sk->sk_data_ready(sk); + msg_set_dest_droppable(hdr, 1); + return false; } + /* Ignore connectionless message if not from listening socket */ + if (oport != pport || onode != pnode) + return false; - tipc_sk_finish_conn(tsk, msg_origport(hdr), msg_orignode(hdr)); - msg_set_importance(&tsk->phdr, msg_importance(hdr)); - - /* If 'ACK+' message, add to socket receive queue */ - if (msg_data_sz(hdr)) - return true; - - /* If empty 'ACK-' message, wake up sleeping connect() */ - sk->sk_data_ready(sk); + /* Rejected SYN */ + if (err != TIPC_ERR_OVERLOAD) + break; - /* 'ACK-' message is neither accepted nor rejected: */ - msg_set_dest_droppable(hdr, 1); + /* Prepare for new setup attempt if we have a SYN clone */ + if (skb_queue_empty(&sk->sk_write_queue)) + break; + get_random_bytes(&delay, 2); + delay %= (tsk->conn_timeout / 4); + delay = msecs_to_jiffies(delay + 100); + sk_reset_timer(sk, &sk->sk_timer, jiffies + delay); return false; - case TIPC_OPEN: case TIPC_DISCONNECTING: - break; + return false; case TIPC_LISTEN: /* Accept only SYN message */ - if (!msg_connected(hdr) && !(msg_errcode(hdr))) + if (!msg_is_syn(hdr) && + tipc_node_get_capabilities(net, onode) & TIPC_SYN_BIT) + return false; + if (!con_msg && !err) return true; - break; + return false; case TIPC_ESTABLISHED: /* Accept only connection-based messages sent by peer */ - if (unlikely(!tsk_peer_msg(tsk, hdr))) + if (likely(con_msg && !err && pport == oport && pnode == onode)) + return true; + if (!tsk_peer_msg(tsk, hdr)) return false; - - if (unlikely(msg_errcode(hdr))) { - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - /* Let timer expire on it's own */ - tipc_node_remove_conn(net, tsk_peer_node(tsk), - tsk->portid); - sk->sk_state_change(sk); - } + if (!err) + return true; + tipc_set_sk_state(sk, TIPC_DISCONNECTING); + tipc_node_remove_conn(net, pnode, tsk->portid); + sk->sk_state_change(sk); return true; default: pr_err("Unknown sk_state %u\n", sk->sk_state); } - - return false; + /* Abort connection setup attempt */ + tipc_set_sk_state(sk, TIPC_DISCONNECTING); + sk->sk_err = ECONNREFUSED; + sk->sk_state_change(sk); + return true; } /** @@ -2545,43 +2563,78 @@ static int tipc_shutdown(struct socket *sock, int how) return res; } +static void tipc_sk_check_probing_state(struct sock *sk, + struct sk_buff_head *list) +{ + struct tipc_sock *tsk = tipc_sk(sk); + u32 pnode = tsk_peer_node(tsk); + u32 pport = tsk_peer_port(tsk); + u32 self = tsk_own_node(tsk); + u32 oport = tsk->portid; + struct sk_buff *skb; + + if (tsk->probe_unacked) { + tipc_set_sk_state(sk, TIPC_DISCONNECTING); + sk->sk_err = ECONNABORTED; + tipc_node_remove_conn(sock_net(sk), pnode, pport); + sk->sk_state_change(sk); + return; + } + /* Prepare new probe */ + skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, 0, + pnode, self, pport, oport, TIPC_OK); + if (skb) + __skb_queue_tail(list, skb); + tsk->probe_unacked = true; + sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); +} + +static void tipc_sk_retry_connect(struct sock *sk, struct sk_buff_head *list) +{ + struct tipc_sock *tsk = tipc_sk(sk); + + /* Try again later if dest link is congested */ + if (tsk->cong_link_cnt) { + sk_reset_timer(sk, &sk->sk_timer, msecs_to_jiffies(100)); + return; + } + /* Prepare SYN for retransmit */ + tipc_msg_skb_clone(&sk->sk_write_queue, list); +} + static void tipc_sk_timeout(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); struct tipc_sock *tsk = tipc_sk(sk); - u32 peer_port = tsk_peer_port(tsk); - u32 peer_node = tsk_peer_node(tsk); - u32 own_node = tsk_own_node(tsk); - u32 own_port = tsk->portid; - struct net *net = sock_net(sk); - struct sk_buff *skb = NULL; + u32 pnode = tsk_peer_node(tsk); + struct sk_buff_head list; + int rc = 0; + skb_queue_head_init(&list); bh_lock_sock(sk); - if (!tipc_sk_connected(sk)) - goto exit; /* Try again later if socket is busy */ if (sock_owned_by_user(sk)) { sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20); - goto exit; + bh_unlock_sock(sk); + return; } - if (tsk->probe_unacked) { - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - tipc_node_remove_conn(net, peer_node, peer_port); - sk->sk_state_change(sk); - goto exit; - } - /* Send new probe */ - skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, 0, - peer_node, own_node, peer_port, own_port, - TIPC_OK); - tsk->probe_unacked = true; - sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); -exit: + if (sk->sk_state == TIPC_ESTABLISHED) + tipc_sk_check_probing_state(sk, &list); + else if (sk->sk_state == TIPC_CONNECTING) + tipc_sk_retry_connect(sk, &list); + bh_unlock_sock(sk); - if (skb) - tipc_node_xmit_skb(net, skb, peer_node, own_port); + + if (!skb_queue_empty(&list)) + rc = tipc_node_xmit(sock_net(sk), &list, pnode, tsk->portid); + + /* SYN messages may cause link congestion */ + if (rc == -ELINKCONG) { + tipc_dest_push(&tsk->cong_links, pnode, 0); + tsk->cong_link_cnt = 1; + } sock_put(sk); } diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 2627b5d812e9..efb16f69bd2c 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -57,16 +57,12 @@ * @idr_lock: protect the connection identifier set * @idr_in_use: amount of allocated identifier entry * @net: network namspace instance - * @rcvbuf_cache: memory cache of server receive buffer + * @awork: accept work item * @rcv_wq: receive workqueue * @send_wq: send workqueue * @max_rcvbuf_size: maximum permitted receive message length - * @tipc_conn_new: callback will be called when new connection is incoming - * @tipc_conn_release: callback will be called before releasing the connection - * @tipc_conn_recvmsg: callback will be called when message arrives + * @listener: topsrv listener socket * @name: server name - * @imp: message importance - * @type: socket type */ struct tipc_topsrv { struct idr conn_idr; @@ -90,9 +86,7 @@ struct tipc_topsrv { * @server: pointer to connected server * @sub_list: lsit to all pertaing subscriptions * @sub_lock: lock protecting the subscription list - * @outqueue_lock: control access to the outqueue * @rwork: receive work item - * @rx_action: what to do when connection socket is active * @outqueue: pointer to first outbound message in queue * @outqueue_lock: control access to the outqueue * @swork: send work item @@ -400,7 +394,7 @@ static int tipc_conn_rcv_from_sock(struct tipc_conn *con) iov.iov_base = &s; iov.iov_len = sizeof(s); msg.msg_name = NULL; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, iov.iov_len); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, iov.iov_len); ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT); if (ret == -EWOULDBLOCK) return -EWOULDBLOCK; @@ -657,7 +651,7 @@ int tipc_topsrv_start(struct net *net) srv->max_rcvbuf_size = sizeof(struct tipc_subscr); INIT_WORK(&srv->awork, tipc_topsrv_accept); - strncpy(srv->name, name, strlen(name) + 1); + strscpy(srv->name, name, sizeof(srv->name)); tn->topsrv = srv; atomic_set(&tn->subscription_count, 0); diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 9783101bc4a9..10dc59ce9c82 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -650,6 +650,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, struct udp_tunnel_sock_cfg tuncfg = {NULL}; struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; u8 node_id[NODE_ID_LEN] = {0,}; + int rmcast = 0; ub = kzalloc(sizeof(*ub), GFP_ATOMIC); if (!ub) @@ -680,6 +681,9 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, if (err) goto err; + /* Checking remote ip address */ + rmcast = tipc_udp_is_mcast_addr(&remote); + /* Autoconfigure own node identity if needed */ if (!tipc_own_id(net)) { memcpy(node_id, local.ipv6.in6_u.u6_addr8, 16); @@ -705,7 +709,12 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, goto err; } udp_conf.family = AF_INET; - udp_conf.local_ip.s_addr = htonl(INADDR_ANY); + + /* Switch to use ANY to receive packets from group */ + if (rmcast) + udp_conf.local_ip.s_addr = htonl(INADDR_ANY); + else + udp_conf.local_ip.s_addr = local.ipv4.s_addr; udp_conf.use_udp_checksums = false; ub->ifindex = dev->ifindex; if (tipc_mtu_bad(dev, sizeof(struct iphdr) + @@ -719,7 +728,10 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, udp_conf.family = AF_INET6; udp_conf.use_udp6_tx_checksums = true; udp_conf.use_udp6_rx_checksums = true; - udp_conf.local_ip6 = in6addr_any; + if (rmcast) + udp_conf.local_ip6 = in6addr_any; + else + udp_conf.local_ip6 = local.ipv6; b->mtu = 1280; #endif } else { @@ -741,7 +753,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, * is used if it's a multicast address. */ memcpy(&b->bcast_addr.value, &remote, sizeof(remote)); - if (tipc_udp_is_mcast_addr(&remote)) + if (rmcast) err = enable_mcast(ub, &remote); else err = tipc_udp_rcast_add(b, &remote); diff --git a/net/tls/Kconfig b/net/tls/Kconfig index 73f05ece53d0..99c1a19c17b1 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -8,6 +8,7 @@ config TLS select CRYPTO_AES select CRYPTO_GCM select STREAM_PARSER + select NET_SOCK_MSG default n ---help--- Enable kernel support for TLS protocol. This allows symmetric diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 961b07d4d41c..d753e362d2d9 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -421,7 +421,7 @@ last_record: tls_push_record_flags = flags; if (more) { tls_ctx->pending_open_record_frags = - record->num_frags; + !!record->num_frags; break; } @@ -489,7 +489,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, iov.iov_base = kaddr + offset; iov.iov_len = size; - iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, &iov, 1, size); + iov_iter_kvec(&msg_iter, WRITE, &iov, 1, size); rc = tls_push_data(sk, &msg_iter, size, flags, TLS_RECORD_TYPE_DATA); kunmap(page); @@ -538,7 +538,7 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) { struct iov_iter msg_iter; - iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, NULL, 0, 0); + iov_iter_kvec(&msg_iter, WRITE, NULL, 0, 0); return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 523622dc74f8..311cec8e533d 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -141,7 +141,6 @@ retry: size = sg->length; } - clear_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags); ctx->in_tcp_sendpages = false; ctx->sk_write_space(sk); @@ -193,15 +192,12 @@ int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, return rc; } -int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx, - int flags, long *timeo) +int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, + int flags) { struct scatterlist *sg; u16 offset; - if (!tls_is_partially_sent_record(ctx)) - return ctx->push_pending_record(sk, flags); - sg = ctx->partially_sent_record; offset = ctx->partially_sent_offset; @@ -209,9 +205,23 @@ int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx, return tls_push_sg(sk, ctx, sg, offset, flags); } +int tls_push_pending_closed_record(struct sock *sk, + struct tls_context *tls_ctx, + int flags, long *timeo) +{ + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + + if (tls_is_partially_sent_record(tls_ctx) || + !list_empty(&ctx->tx_list)) + return tls_tx_records(sk, flags); + else + return tls_ctx->push_pending_record(sk, flags); +} + static void tls_write_space(struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *tx_ctx = tls_sw_ctx_tx(ctx); /* If in_tcp_sendpages call lower protocol write space handler * to ensure we wake up any waiting operations there. For example @@ -222,20 +232,11 @@ static void tls_write_space(struct sock *sk) return; } - if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) { - gfp_t sk_allocation = sk->sk_allocation; - int rc; - long timeo = 0; - - sk->sk_allocation = GFP_ATOMIC; - rc = tls_push_pending_closed_record(sk, ctx, - MSG_DONTWAIT | - MSG_NOSIGNAL, - &timeo); - sk->sk_allocation = sk_allocation; - - if (rc < 0) - return; + /* Schedule the transmission if tx list is ready */ + if (is_tx_ready(tx_ctx) && !sk->sk_write_pending) { + /* Schedule the transmission */ + if (!test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask)) + schedule_delayed_work(&tx_ctx->tx_work.work, 0); } ctx->sk_write_space(sk); @@ -270,19 +271,6 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) if (!tls_complete_pending_work(sk, ctx, 0, &timeo)) tls_handle_open_record(sk, 0); - if (ctx->partially_sent_record) { - struct scatterlist *sg = ctx->partially_sent_record; - - while (1) { - put_page(sg_page(sg)); - sk_mem_uncharge(sk, sg->length); - - if (sg_is_last(sg)) - break; - sg++; - } - } - /* We need these for tls_sw_fallback handling of other packets */ if (ctx->tx_conf == TLS_SW) { kfree(ctx->tx.rec_seq); @@ -632,12 +620,14 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage; prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE]; - prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg; - prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close; + prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg; + prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read; + prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close; prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE]; - prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg; - prot[TLS_SW][TLS_SW].close = tls_sk_proto_close; + prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg; + prot[TLS_SW][TLS_SW].stream_memory_read = tls_sw_stream_read; + prot[TLS_SW][TLS_SW].close = tls_sk_proto_close; #ifdef CONFIG_TLS_DEVICE prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; @@ -725,8 +715,6 @@ EXPORT_SYMBOL(tls_unregister_device); static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .name = "tls", - .uid = TCP_ULP_TLS, - .user_visible = true, .owner = THIS_MODULE, .init = tls_init, }; @@ -736,7 +724,6 @@ static int __init tls_register(void) build_protos(tls_prots[TLSV4], &tcp_prot); tls_sw_proto_ops = inet_stream_ops; - tls_sw_proto_ops.poll = tls_sw_poll; tls_sw_proto_ops.splice_read = tls_sw_splice_read; #ifdef CONFIG_TLS_DEVICE diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index b9c6ecfbcfea..7b1af8b59cd2 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -4,6 +4,7 @@ * Copyright (c) 2016-2017, Lance Chao . All rights reserved. * Copyright (c) 2016, Fridolin Pokorny . All rights reserved. * Copyright (c) 2016, Nikos Mavrogiannopoulos . All rights reserved. + * Copyright (c) 2018, Covalent IO, Inc. http://covalent.io * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -43,12 +44,133 @@ #define MAX_IV_SIZE TLS_CIPHER_AES_GCM_128_IV_SIZE +static int __skb_nsg(struct sk_buff *skb, int offset, int len, + unsigned int recursion_level) +{ + int start = skb_headlen(skb); + int i, chunk = start - offset; + struct sk_buff *frag_iter; + int elt = 0; + + if (unlikely(recursion_level >= 24)) + return -EMSGSIZE; + + if (chunk > 0) { + if (chunk > len) + chunk = len; + elt++; + len -= chunk; + if (len == 0) + return elt; + offset += chunk; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); + chunk = end - offset; + if (chunk > 0) { + if (chunk > len) + chunk = len; + elt++; + len -= chunk; + if (len == 0) + return elt; + offset += chunk; + } + start = end; + } + + if (unlikely(skb_has_frag_list(skb))) { + skb_walk_frags(skb, frag_iter) { + int end, ret; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + chunk = end - offset; + if (chunk > 0) { + if (chunk > len) + chunk = len; + ret = __skb_nsg(frag_iter, offset - start, chunk, + recursion_level + 1); + if (unlikely(ret < 0)) + return ret; + elt += ret; + len -= chunk; + if (len == 0) + return elt; + offset += chunk; + } + start = end; + } + } + BUG_ON(len); + return elt; +} + +/* Return the number of scatterlist elements required to completely map the + * skb, or -EMSGSIZE if the recursion depth is exceeded. + */ +static int skb_nsg(struct sk_buff *skb, int offset, int len) +{ + return __skb_nsg(skb, offset, len, 0); +} + +static void tls_decrypt_done(struct crypto_async_request *req, int err) +{ + struct aead_request *aead_req = (struct aead_request *)req; + struct scatterlist *sgout = aead_req->dst; + struct tls_sw_context_rx *ctx; + struct tls_context *tls_ctx; + struct scatterlist *sg; + struct sk_buff *skb; + unsigned int pages; + int pending; + + skb = (struct sk_buff *)req->data; + tls_ctx = tls_get_ctx(skb->sk); + ctx = tls_sw_ctx_rx(tls_ctx); + pending = atomic_dec_return(&ctx->decrypt_pending); + + /* Propagate if there was an err */ + if (err) { + ctx->async_wait.err = err; + tls_err_abort(skb->sk, err); + } + + /* After using skb->sk to propagate sk through crypto async callback + * we need to NULL it again. + */ + skb->sk = NULL; + + /* Release the skb, pages and memory allocated for crypto req */ + kfree_skb(skb); + + /* Skip the first S/G entry as it points to AAD */ + for_each_sg(sg_next(sgout), sg, UINT_MAX, pages) { + if (!sg) + break; + put_page(sg_page(sg)); + } + + kfree(aead_req); + + if (!pending && READ_ONCE(ctx->async_notify)) + complete(&ctx->async_wait.completion); +} + static int tls_do_decryption(struct sock *sk, + struct sk_buff *skb, struct scatterlist *sgin, struct scatterlist *sgout, char *iv_recv, size_t data_len, - struct aead_request *aead_req) + struct aead_request *aead_req, + bool async) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -59,324 +181,657 @@ static int tls_do_decryption(struct sock *sk, aead_request_set_crypt(aead_req, sgin, sgout, data_len + tls_ctx->rx.tag_size, (u8 *)iv_recv); - aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &ctx->async_wait); - - ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait); - return ret; -} - -static void trim_sg(struct sock *sk, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size, int target_size) -{ - int i = *sg_num_elem - 1; - int trim = *sg_size - target_size; - if (trim <= 0) { - WARN_ON(trim < 0); - return; + if (async) { + /* Using skb->sk to push sk through to crypto async callback + * handler. This allows propagating errors up to the socket + * if needed. It _must_ be cleared in the async handler + * before kfree_skb is called. We _know_ skb->sk is NULL + * because it is a clone from strparser. + */ + skb->sk = sk; + aead_request_set_callback(aead_req, + CRYPTO_TFM_REQ_MAY_BACKLOG, + tls_decrypt_done, skb); + atomic_inc(&ctx->decrypt_pending); + } else { + aead_request_set_callback(aead_req, + CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &ctx->async_wait); } - *sg_size = target_size; - while (trim >= sg[i].length) { - trim -= sg[i].length; - sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); - i--; + ret = crypto_aead_decrypt(aead_req); + if (ret == -EINPROGRESS) { + if (async) + return ret; - if (i < 0) - goto out; + ret = crypto_wait_req(ret, &ctx->async_wait); } - sg[i].length -= trim; - sk_mem_uncharge(sk, trim); + if (async) + atomic_dec(&ctx->decrypt_pending); -out: - *sg_num_elem = i + 1; + return ret; } -static void trim_both_sgl(struct sock *sk, int target_size) +static void tls_trim_both_msgs(struct sock *sk, int target_size) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; - trim_sg(sk, ctx->sg_plaintext_data, - &ctx->sg_plaintext_num_elem, - &ctx->sg_plaintext_size, - target_size); - + sk_msg_trim(sk, &rec->msg_plaintext, target_size); if (target_size > 0) target_size += tls_ctx->tx.overhead_size; - - trim_sg(sk, ctx->sg_encrypted_data, - &ctx->sg_encrypted_num_elem, - &ctx->sg_encrypted_size, - target_size); + sk_msg_trim(sk, &rec->msg_encrypted, target_size); } -static int alloc_encrypted_sg(struct sock *sk, int len) +static int tls_alloc_encrypted_msg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - int rc = 0; + struct tls_rec *rec = ctx->open_rec; + struct sk_msg *msg_en = &rec->msg_encrypted; - rc = sk_alloc_sg(sk, len, - ctx->sg_encrypted_data, 0, - &ctx->sg_encrypted_num_elem, - &ctx->sg_encrypted_size, 0); + return sk_msg_alloc(sk, msg_en, len, 0); +} - if (rc == -ENOSPC) - ctx->sg_encrypted_num_elem = ARRAY_SIZE(ctx->sg_encrypted_data); +static int tls_clone_plaintext_msg(struct sock *sk, int required) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; + struct sk_msg *msg_pl = &rec->msg_plaintext; + struct sk_msg *msg_en = &rec->msg_encrypted; + int skip, len; + + /* We add page references worth len bytes from encrypted sg + * at the end of plaintext sg. It is guaranteed that msg_en + * has enough required room (ensured by caller). + */ + len = required - msg_pl->sg.size; - return rc; + /* Skip initial bytes in msg_en's data to be able to use + * same offset of both plain and encrypted data. + */ + skip = tls_ctx->tx.prepend_size + msg_pl->sg.size; + + return sk_msg_clone(sk, msg_pl, msg_en, skip, len); } -static int alloc_plaintext_sg(struct sock *sk, int len) +static struct tls_rec *tls_get_rec(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - int rc = 0; + struct sk_msg *msg_pl, *msg_en; + struct tls_rec *rec; + int mem_size; - rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0, - &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, - tls_ctx->pending_open_record_frags); + mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send); - if (rc == -ENOSPC) - ctx->sg_plaintext_num_elem = ARRAY_SIZE(ctx->sg_plaintext_data); + rec = kzalloc(mem_size, sk->sk_allocation); + if (!rec) + return NULL; - return rc; + msg_pl = &rec->msg_plaintext; + msg_en = &rec->msg_encrypted; + + sk_msg_init(msg_pl); + sk_msg_init(msg_en); + + sg_init_table(rec->sg_aead_in, 2); + sg_set_buf(&rec->sg_aead_in[0], rec->aad_space, + sizeof(rec->aad_space)); + sg_unmark_end(&rec->sg_aead_in[1]); + + sg_init_table(rec->sg_aead_out, 2); + sg_set_buf(&rec->sg_aead_out[0], rec->aad_space, + sizeof(rec->aad_space)); + sg_unmark_end(&rec->sg_aead_out[1]); + + return rec; } -static void free_sg(struct sock *sk, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size) +static void tls_free_rec(struct sock *sk, struct tls_rec *rec) { - int i, n = *sg_num_elem; + sk_msg_free(sk, &rec->msg_encrypted); + sk_msg_free(sk, &rec->msg_plaintext); + kfree(rec); +} + +static void tls_free_open_rec(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; - for (i = 0; i < n; ++i) { - sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); + if (rec) { + tls_free_rec(sk, rec); + ctx->open_rec = NULL; } - *sg_num_elem = 0; - *sg_size = 0; } -static void tls_free_both_sg(struct sock *sk) +int tls_tx_records(struct sock *sk, int flags) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec, *tmp; + struct sk_msg *msg_en; + int tx_flags, rc = 0; + + if (tls_is_partially_sent_record(tls_ctx)) { + rec = list_first_entry(&ctx->tx_list, + struct tls_rec, list); + + if (flags == -1) + tx_flags = rec->tx_flags; + else + tx_flags = flags; + + rc = tls_push_partial_record(sk, tls_ctx, tx_flags); + if (rc) + goto tx_err; + + /* Full record has been transmitted. + * Remove the head of tx_list + */ + list_del(&rec->list); + sk_msg_free(sk, &rec->msg_plaintext); + kfree(rec); + } + + /* Tx all ready records */ + list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) { + if (READ_ONCE(rec->tx_ready)) { + if (flags == -1) + tx_flags = rec->tx_flags; + else + tx_flags = flags; + + msg_en = &rec->msg_encrypted; + rc = tls_push_sg(sk, tls_ctx, + &msg_en->sg.data[msg_en->sg.curr], + 0, tx_flags); + if (rc) + goto tx_err; + + list_del(&rec->list); + sk_msg_free(sk, &rec->msg_plaintext); + kfree(rec); + } else { + break; + } + } + +tx_err: + if (rc < 0 && rc != -EAGAIN) + tls_err_abort(sk, EBADMSG); + + return rc; +} + +static void tls_encrypt_done(struct crypto_async_request *req, int err) { + struct aead_request *aead_req = (struct aead_request *)req; + struct sock *sk = req->data; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct scatterlist *sge; + struct sk_msg *msg_en; + struct tls_rec *rec; + bool ready = false; + int pending; + + rec = container_of(aead_req, struct tls_rec, aead_req); + msg_en = &rec->msg_encrypted; + + sge = sk_msg_elem(msg_en, msg_en->sg.curr); + sge->offset -= tls_ctx->tx.prepend_size; + sge->length += tls_ctx->tx.prepend_size; + + /* Check if error is previously set on socket */ + if (err || sk->sk_err) { + rec = NULL; + + /* If err is already set on socket, return the same code */ + if (sk->sk_err) { + ctx->async_wait.err = sk->sk_err; + } else { + ctx->async_wait.err = err; + tls_err_abort(sk, err); + } + } + + if (rec) { + struct tls_rec *first_rec; - free_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem, - &ctx->sg_encrypted_size); + /* Mark the record as ready for transmission */ + smp_store_mb(rec->tx_ready, true); - free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, - &ctx->sg_plaintext_size); + /* If received record is at head of tx_list, schedule tx */ + first_rec = list_first_entry(&ctx->tx_list, + struct tls_rec, list); + if (rec == first_rec) + ready = true; + } + + pending = atomic_dec_return(&ctx->encrypt_pending); + + if (!pending && READ_ONCE(ctx->async_notify)) + complete(&ctx->async_wait.completion); + + if (!ready) + return; + + /* Schedule the transmission */ + if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) + schedule_delayed_work(&ctx->tx_work.work, 1); } -static int tls_do_encryption(struct tls_context *tls_ctx, +static int tls_do_encryption(struct sock *sk, + struct tls_context *tls_ctx, struct tls_sw_context_tx *ctx, struct aead_request *aead_req, - size_t data_len) + size_t data_len, u32 start) { + struct tls_rec *rec = ctx->open_rec; + struct sk_msg *msg_en = &rec->msg_encrypted; + struct scatterlist *sge = sk_msg_elem(msg_en, start); int rc; - ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size; - ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size; + sge->offset += tls_ctx->tx.prepend_size; + sge->length -= tls_ctx->tx.prepend_size; + + msg_en->sg.curr = start; aead_request_set_tfm(aead_req, ctx->aead_send); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); - aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out, + aead_request_set_crypt(aead_req, rec->sg_aead_in, + rec->sg_aead_out, data_len, tls_ctx->tx.iv); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &ctx->async_wait); + tls_encrypt_done, sk); + + /* Add the record in tx_list */ + list_add_tail((struct list_head *)&rec->list, &ctx->tx_list); + atomic_inc(&ctx->encrypt_pending); - rc = crypto_wait_req(crypto_aead_encrypt(aead_req), &ctx->async_wait); + rc = crypto_aead_encrypt(aead_req); + if (!rc || rc != -EINPROGRESS) { + atomic_dec(&ctx->encrypt_pending); + sge->offset -= tls_ctx->tx.prepend_size; + sge->length += tls_ctx->tx.prepend_size; + } - ctx->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; - ctx->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; + if (!rc) { + WRITE_ONCE(rec->tx_ready, true); + } else if (rc != -EINPROGRESS) { + list_del(&rec->list); + return rc; + } + /* Unhook the record from context if encryption is not failure */ + ctx->open_rec = NULL; + tls_advance_record_sn(sk, &tls_ctx->tx); return rc; } -static int tls_push_record(struct sock *sk, int flags, - unsigned char record_type) +static int tls_split_open_record(struct sock *sk, struct tls_rec *from, + struct tls_rec **to, struct sk_msg *msg_opl, + struct sk_msg *msg_oen, u32 split_point, + u32 tx_overhead_size, u32 *orig_end) { - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - struct aead_request *req; - int rc; + u32 i, j, bytes = 0, apply = msg_opl->apply_bytes; + struct scatterlist *sge, *osge, *nsge; + u32 orig_size = msg_opl->sg.size; + struct scatterlist tmp = { }; + struct sk_msg *msg_npl; + struct tls_rec *new; + int ret; - req = aead_request_alloc(ctx->aead_send, sk->sk_allocation); - if (!req) + new = tls_get_rec(sk); + if (!new) return -ENOMEM; + ret = sk_msg_alloc(sk, &new->msg_encrypted, msg_opl->sg.size + + tx_overhead_size, 0); + if (ret < 0) { + tls_free_rec(sk, new); + return ret; + } - sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); - sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); - - tls_make_aad(ctx->aad_space, ctx->sg_plaintext_size, - tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size, - record_type); - - tls_fill_prepend(tls_ctx, - page_address(sg_page(&ctx->sg_encrypted_data[0])) + - ctx->sg_encrypted_data[0].offset, - ctx->sg_plaintext_size, record_type); - - tls_ctx->pending_open_record_frags = 0; - set_bit(TLS_PENDING_CLOSED_RECORD, &tls_ctx->flags); + *orig_end = msg_opl->sg.end; + i = msg_opl->sg.start; + sge = sk_msg_elem(msg_opl, i); + while (apply && sge->length) { + if (sge->length > apply) { + u32 len = sge->length - apply; + + get_page(sg_page(sge)); + sg_set_page(&tmp, sg_page(sge), len, + sge->offset + apply); + sge->length = apply; + bytes += apply; + apply = 0; + } else { + apply -= sge->length; + bytes += sge->length; + } - rc = tls_do_encryption(tls_ctx, ctx, req, ctx->sg_plaintext_size); - if (rc < 0) { - /* If we are called from write_space and - * we fail, we need to set this SOCK_NOSPACE - * to trigger another write_space in the future. - */ - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - goto out_req; + sk_msg_iter_var_next(i); + if (i == msg_opl->sg.end) + break; + sge = sk_msg_elem(msg_opl, i); } - free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, - &ctx->sg_plaintext_size); + msg_opl->sg.end = i; + msg_opl->sg.curr = i; + msg_opl->sg.copybreak = 0; + msg_opl->apply_bytes = 0; + msg_opl->sg.size = bytes; + + msg_npl = &new->msg_plaintext; + msg_npl->apply_bytes = apply; + msg_npl->sg.size = orig_size - bytes; + + j = msg_npl->sg.start; + nsge = sk_msg_elem(msg_npl, j); + if (tmp.length) { + memcpy(nsge, &tmp, sizeof(*nsge)); + sk_msg_iter_var_next(j); + nsge = sk_msg_elem(msg_npl, j); + } - ctx->sg_encrypted_num_elem = 0; - ctx->sg_encrypted_size = 0; + osge = sk_msg_elem(msg_opl, i); + while (osge->length) { + memcpy(nsge, osge, sizeof(*nsge)); + sg_unmark_end(nsge); + sk_msg_iter_var_next(i); + sk_msg_iter_var_next(j); + if (i == *orig_end) + break; + osge = sk_msg_elem(msg_opl, i); + nsge = sk_msg_elem(msg_npl, j); + } - /* Only pass through MSG_DONTWAIT and MSG_NOSIGNAL flags */ - rc = tls_push_sg(sk, tls_ctx, ctx->sg_encrypted_data, 0, flags); - if (rc < 0 && rc != -EAGAIN) - tls_err_abort(sk, EBADMSG); + msg_npl->sg.end = j; + msg_npl->sg.curr = j; + msg_npl->sg.copybreak = 0; - tls_advance_record_sn(sk, &tls_ctx->tx); -out_req: - aead_request_free(req); - return rc; + *to = new; + return 0; } -static int tls_sw_push_pending_record(struct sock *sk, int flags) +static void tls_merge_open_record(struct sock *sk, struct tls_rec *to, + struct tls_rec *from, u32 orig_end) { - return tls_push_record(sk, flags, TLS_RECORD_TYPE_DATA); + struct sk_msg *msg_npl = &from->msg_plaintext; + struct sk_msg *msg_opl = &to->msg_plaintext; + struct scatterlist *osge, *nsge; + u32 i, j; + + i = msg_opl->sg.end; + sk_msg_iter_var_prev(i); + j = msg_npl->sg.start; + + osge = sk_msg_elem(msg_opl, i); + nsge = sk_msg_elem(msg_npl, j); + + if (sg_page(osge) == sg_page(nsge) && + osge->offset + osge->length == nsge->offset) { + osge->length += nsge->length; + put_page(sg_page(nsge)); + } + + msg_opl->sg.end = orig_end; + msg_opl->sg.curr = orig_end; + msg_opl->sg.copybreak = 0; + msg_opl->apply_bytes = msg_opl->sg.size + msg_npl->sg.size; + msg_opl->sg.size += msg_npl->sg.size; + + sk_msg_free(sk, &to->msg_encrypted); + sk_msg_xfer_full(&to->msg_encrypted, &from->msg_encrypted); + + kfree(from); } -static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, - int length, int *pages_used, - unsigned int *size_used, - struct scatterlist *to, int to_max_pages, - bool charge) +static int tls_push_record(struct sock *sk, int flags, + unsigned char record_type) { - struct page *pages[MAX_SKB_FRAGS]; + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec, *tmp = NULL; + u32 i, split_point, uninitialized_var(orig_end); + struct sk_msg *msg_pl, *msg_en; + struct aead_request *req; + bool split; + int rc; - size_t offset; - ssize_t copied, use; - int i = 0; - unsigned int size = *size_used; - int num_elem = *pages_used; - int rc = 0; - int maxpages; + if (!rec) + return 0; - while (length > 0) { - i = 0; - maxpages = to_max_pages - num_elem; - if (maxpages == 0) { - rc = -EFAULT; - goto out; - } - copied = iov_iter_get_pages(from, pages, - length, - maxpages, &offset); - if (copied <= 0) { - rc = -EFAULT; - goto out; - } + msg_pl = &rec->msg_plaintext; + msg_en = &rec->msg_encrypted; + + split_point = msg_pl->apply_bytes; + split = split_point && split_point < msg_pl->sg.size; + if (split) { + rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en, + split_point, tls_ctx->tx.overhead_size, + &orig_end); + if (rc < 0) + return rc; + sk_msg_trim(sk, msg_en, msg_pl->sg.size + + tls_ctx->tx.overhead_size); + } - iov_iter_advance(from, copied); + rec->tx_flags = flags; + req = &rec->aead_req; - length -= copied; - size += copied; - while (copied) { - use = min_t(int, copied, PAGE_SIZE - offset); + i = msg_pl->sg.end; + sk_msg_iter_var_prev(i); + sg_mark_end(sk_msg_elem(msg_pl, i)); - sg_set_page(&to[num_elem], - pages[i], use, offset); - sg_unmark_end(&to[num_elem]); - if (charge) - sk_mem_charge(sk, use); + i = msg_pl->sg.start; + sg_chain(rec->sg_aead_in, 2, rec->inplace_crypto ? + &msg_en->sg.data[i] : &msg_pl->sg.data[i]); - offset = 0; - copied -= use; + i = msg_en->sg.end; + sk_msg_iter_var_prev(i); + sg_mark_end(sk_msg_elem(msg_en, i)); + + i = msg_en->sg.start; + sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]); + + tls_make_aad(rec->aad_space, msg_pl->sg.size, + tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size, + record_type); + + tls_fill_prepend(tls_ctx, + page_address(sg_page(&msg_en->sg.data[i])) + + msg_en->sg.data[i].offset, msg_pl->sg.size, + record_type); + + tls_ctx->pending_open_record_frags = false; - ++i; - ++num_elem; + rc = tls_do_encryption(sk, tls_ctx, ctx, req, msg_pl->sg.size, i); + if (rc < 0) { + if (rc != -EINPROGRESS) { + tls_err_abort(sk, EBADMSG); + if (split) { + tls_ctx->pending_open_record_frags = true; + tls_merge_open_record(sk, rec, tmp, orig_end); + } } + return rc; + } else if (split) { + msg_pl = &tmp->msg_plaintext; + msg_en = &tmp->msg_encrypted; + sk_msg_trim(sk, msg_en, msg_pl->sg.size + + tls_ctx->tx.overhead_size); + tls_ctx->pending_open_record_frags = true; + ctx->open_rec = tmp; } - /* Mark the end in the last sg entry if newly added */ - if (num_elem > *pages_used) - sg_mark_end(&to[num_elem - 1]); -out: - if (rc) - iov_iter_revert(from, size - *size_used); - *size_used = size; - *pages_used = num_elem; - - return rc; + return tls_tx_records(sk, flags); } -static int memcopy_from_iter(struct sock *sk, struct iov_iter *from, - int bytes) +static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, + bool full_record, u8 record_type, + size_t *copied, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - struct scatterlist *sg = ctx->sg_plaintext_data; - int copy, i, rc = 0; - - for (i = tls_ctx->pending_open_record_frags; - i < ctx->sg_plaintext_num_elem; ++i) { - copy = sg[i].length; - if (copy_from_iter( - page_address(sg_page(&sg[i])) + sg[i].offset, - copy, from) != copy) { - rc = -EFAULT; - goto out; + struct sk_msg msg_redir = { }; + struct sk_psock *psock; + struct sock *sk_redir; + struct tls_rec *rec; + int err = 0, send; + bool enospc; + + psock = sk_psock_get(sk); + if (!psock) + return tls_push_record(sk, flags, record_type); +more_data: + enospc = sk_msg_full(msg); + if (psock->eval == __SK_NONE) + psock->eval = sk_psock_msg_verdict(sk, psock, msg); + if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && + !enospc && !full_record) { + err = -ENOSPC; + goto out_err; + } + msg->cork_bytes = 0; + send = msg->sg.size; + if (msg->apply_bytes && msg->apply_bytes < send) + send = msg->apply_bytes; + + switch (psock->eval) { + case __SK_PASS: + err = tls_push_record(sk, flags, record_type); + if (err < 0) { + *copied -= sk_msg_free(sk, msg); + tls_free_open_rec(sk); + goto out_err; + } + break; + case __SK_REDIRECT: + sk_redir = psock->sk_redir; + memcpy(&msg_redir, msg, sizeof(*msg)); + if (msg->apply_bytes < send) + msg->apply_bytes = 0; + else + msg->apply_bytes -= send; + sk_msg_return_zero(sk, msg, send); + msg->sg.size -= send; + release_sock(sk); + err = tcp_bpf_sendmsg_redir(sk_redir, &msg_redir, send, flags); + lock_sock(sk); + if (err < 0) { + *copied -= sk_msg_free_nocharge(sk, &msg_redir); + msg->sg.size = 0; } - bytes -= copy; + if (msg->sg.size == 0) + tls_free_open_rec(sk); + break; + case __SK_DROP: + default: + sk_msg_free_partial(sk, msg, send); + if (msg->apply_bytes < send) + msg->apply_bytes = 0; + else + msg->apply_bytes -= send; + if (msg->sg.size == 0) + tls_free_open_rec(sk); + *copied -= send; + err = -EACCES; + } - ++tls_ctx->pending_open_record_frags; + if (likely(!err)) { + bool reset_eval = !ctx->open_rec; - if (!bytes) - break; + rec = ctx->open_rec; + if (rec) { + msg = &rec->msg_plaintext; + if (!msg->apply_bytes) + reset_eval = true; + } + if (reset_eval) { + psock->eval = __SK_NONE; + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + } + if (rec) + goto more_data; } + out_err: + sk_psock_put(sk, psock); + return err; +} -out: - return rc; +static int tls_sw_push_pending_record(struct sock *sk, int flags) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; + struct sk_msg *msg_pl; + size_t copied; + + if (!rec) + return 0; + + msg_pl = &rec->msg_plaintext; + copied = msg_pl->sg.size; + if (!copied) + return 0; + + return bpf_exec_tx_verdict(msg_pl, sk, true, TLS_RECORD_TYPE_DATA, + &copied, flags); } int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { + long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - int ret = 0; - int required_size; - long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + struct crypto_tfm *tfm = crypto_aead_tfm(ctx->aead_send); + bool async_capable = tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC; + unsigned char record_type = TLS_RECORD_TYPE_DATA; + bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool eor = !(msg->msg_flags & MSG_MORE); size_t try_to_copy, copied = 0; - unsigned char record_type = TLS_RECORD_TYPE_DATA; - int record_room; + struct sk_msg *msg_pl, *msg_en; + struct tls_rec *rec; + int required_size; + int num_async = 0; bool full_record; + int record_room; + int num_zc = 0; int orig_size; - bool is_kvec = msg->msg_iter.type & ITER_KVEC; + int ret = 0; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -ENOTSUPP; lock_sock(sk); - if (tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo)) - goto send_end; + /* Wait till there is any pending write on socket */ + if (unlikely(sk->sk_write_pending)) { + ret = wait_on_pending_writer(sk, &timeo); + if (unlikely(ret)) + goto send_end; + } if (unlikely(msg->msg_controllen)) { ret = tls_proccess_cmsg(sk, msg, &record_type); - if (ret) - goto send_end; + if (ret) { + if (ret == -EINPROGRESS) + num_async++; + else if (ret != -EAGAIN) + goto send_end; + } } while (msg_data_left(msg)) { @@ -385,22 +840,35 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto send_end; } - orig_size = ctx->sg_plaintext_size; + if (ctx->open_rec) + rec = ctx->open_rec; + else + rec = ctx->open_rec = tls_get_rec(sk); + if (!rec) { + ret = -ENOMEM; + goto send_end; + } + + msg_pl = &rec->msg_plaintext; + msg_en = &rec->msg_encrypted; + + orig_size = msg_pl->sg.size; full_record = false; try_to_copy = msg_data_left(msg); - record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size; + record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; if (try_to_copy >= record_room) { try_to_copy = record_room; full_record = true; } - required_size = ctx->sg_plaintext_size + try_to_copy + + required_size = msg_pl->sg.size + try_to_copy + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; + alloc_encrypted: - ret = alloc_encrypted_sg(sk, required_size); + ret = tls_alloc_encrypted_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) goto wait_for_memory; @@ -409,66 +877,88 @@ alloc_encrypted: * actually allocated. The difference is due * to max sg elements limit */ - try_to_copy -= required_size - ctx->sg_encrypted_size; + try_to_copy -= required_size - msg_en->sg.size; full_record = true; } - if (!is_kvec && (full_record || eor)) { - ret = zerocopy_from_iter(sk, &msg->msg_iter, - try_to_copy, &ctx->sg_plaintext_num_elem, - &ctx->sg_plaintext_size, - ctx->sg_plaintext_data, - ARRAY_SIZE(ctx->sg_plaintext_data), - true); + + if (!is_kvec && (full_record || eor) && !async_capable) { + u32 first = msg_pl->sg.end; + + ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter, + msg_pl, try_to_copy); if (ret) goto fallback_to_reg_send; + rec->inplace_crypto = 0; + + num_zc++; copied += try_to_copy; - ret = tls_push_record(sk, msg->msg_flags, record_type); - if (ret) - goto send_end; - continue; + sk_msg_sg_copy_set(msg_pl, first); + ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, + record_type, &copied, + msg->msg_flags); + if (ret) { + if (ret == -EINPROGRESS) + num_async++; + else if (ret == -ENOMEM) + goto wait_for_memory; + else if (ret == -ENOSPC) + goto rollback_iter; + else if (ret != -EAGAIN) + goto send_end; + } + continue; +rollback_iter: + copied -= try_to_copy; + sk_msg_sg_copy_clear(msg_pl, first); + iov_iter_revert(&msg->msg_iter, + msg_pl->sg.size - orig_size); fallback_to_reg_send: - trim_sg(sk, ctx->sg_plaintext_data, - &ctx->sg_plaintext_num_elem, - &ctx->sg_plaintext_size, - orig_size); + sk_msg_trim(sk, msg_pl, orig_size); } - required_size = ctx->sg_plaintext_size + try_to_copy; -alloc_plaintext: - ret = alloc_plaintext_sg(sk, required_size); + required_size = msg_pl->sg.size + try_to_copy; + + ret = tls_clone_plaintext_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) - goto wait_for_memory; + goto send_end; /* Adjust try_to_copy according to the amount that was * actually allocated. The difference is due * to max sg elements limit */ - try_to_copy -= required_size - ctx->sg_plaintext_size; + try_to_copy -= required_size - msg_pl->sg.size; full_record = true; - - trim_sg(sk, ctx->sg_encrypted_data, - &ctx->sg_encrypted_num_elem, - &ctx->sg_encrypted_size, - ctx->sg_plaintext_size + - tls_ctx->tx.overhead_size); + sk_msg_trim(sk, msg_en, msg_pl->sg.size + + tls_ctx->tx.overhead_size); } - ret = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy); - if (ret) + ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_pl, + try_to_copy); + if (ret < 0) goto trim_sgl; + /* Open records defined only if successfully copied, otherwise + * we would trim the sg but not reset the open record frags. + */ + tls_ctx->pending_open_record_frags = true; copied += try_to_copy; if (full_record || eor) { -push_record: - ret = tls_push_record(sk, msg->msg_flags, record_type); + ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, + record_type, &copied, + msg->msg_flags); if (ret) { - if (ret == -ENOMEM) + if (ret == -EINPROGRESS) + num_async++; + else if (ret == -ENOMEM) goto wait_for_memory; - - goto send_end; + else if (ret != -EAGAIN) { + if (ret == -ENOSPC) + ret = 0; + goto send_end; + } } } @@ -480,17 +970,37 @@ wait_for_memory: ret = sk_stream_wait_memory(sk, &timeo); if (ret) { trim_sgl: - trim_both_sgl(sk, orig_size); + tls_trim_both_msgs(sk, orig_size); goto send_end; } - if (tls_is_pending_closed_record(tls_ctx)) - goto push_record; - - if (ctx->sg_encrypted_size < required_size) + if (msg_en->sg.size < required_size) goto alloc_encrypted; + } - goto alloc_plaintext; + if (!num_async) { + goto send_end; + } else if (num_zc) { + /* Wait for pending encryptions to get completed */ + smp_store_mb(ctx->async_notify, true); + + if (atomic_read(&ctx->encrypt_pending)) + crypto_wait_req(-EINPROGRESS, &ctx->async_wait); + else + reinit_completion(&ctx->async_wait.completion); + + WRITE_ONCE(ctx->async_notify, false); + + if (ctx->async_wait.err) { + ret = ctx->async_wait.err; + copied = 0; + } + } + + /* Transmit if any encryptions have completed */ + if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { + cancel_delayed_work(&ctx->tx_work.work); + tls_tx_records(sk, msg->msg_flags); } send_end: @@ -503,16 +1013,18 @@ send_end: int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { + long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - int ret = 0; - long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - bool eor; - size_t orig_size = size; unsigned char record_type = TLS_RECORD_TYPE_DATA; - struct scatterlist *sg; + struct sk_msg *msg_pl; + struct tls_rec *rec; + int num_async = 0; + size_t copied = 0; bool full_record; int record_room; + int ret = 0; + bool eor; if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST)) @@ -525,8 +1037,12 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - if (tls_complete_pending_work(sk, tls_ctx, flags, &timeo)) - goto sendpage_end; + /* Wait till there is any pending write on socket */ + if (unlikely(sk->sk_write_pending)) { + ret = wait_on_pending_writer(sk, &timeo); + if (unlikely(ret)) + goto sendpage_end; + } /* Call the sk_stream functions to manage the sndbuf mem. */ while (size > 0) { @@ -537,20 +1053,33 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, goto sendpage_end; } + if (ctx->open_rec) + rec = ctx->open_rec; + else + rec = ctx->open_rec = tls_get_rec(sk); + if (!rec) { + ret = -ENOMEM; + goto sendpage_end; + } + + msg_pl = &rec->msg_plaintext; + full_record = false; - record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size; + record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; + copied = 0; copy = size; if (copy >= record_room) { copy = record_room; full_record = true; } - required_size = ctx->sg_plaintext_size + copy + - tls_ctx->tx.overhead_size; + + required_size = msg_pl->sg.size + copy + + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; alloc_payload: - ret = alloc_encrypted_sg(sk, required_size); + ret = tls_alloc_encrypted_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) goto wait_for_memory; @@ -559,33 +1088,32 @@ alloc_payload: * actually allocated. The difference is due * to max sg elements limit */ - copy -= required_size - ctx->sg_plaintext_size; + copy -= required_size - msg_pl->sg.size; full_record = true; } - get_page(page); - sg = ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem; - sg_set_page(sg, page, copy, offset); - sg_unmark_end(sg); - - ctx->sg_plaintext_num_elem++; - + sk_msg_page_add(msg_pl, page, copy, offset); sk_mem_charge(sk, copy); + offset += copy; size -= copy; - ctx->sg_plaintext_size += copy; - tls_ctx->pending_open_record_frags = ctx->sg_plaintext_num_elem; - - if (full_record || eor || - ctx->sg_plaintext_num_elem == - ARRAY_SIZE(ctx->sg_plaintext_data)) { -push_record: - ret = tls_push_record(sk, flags, record_type); + copied += copy; + + tls_ctx->pending_open_record_frags = true; + if (full_record || eor || sk_msg_full(msg_pl)) { + rec->inplace_crypto = 0; + ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, + record_type, &copied, flags); if (ret) { - if (ret == -ENOMEM) + if (ret == -EINPROGRESS) + num_async++; + else if (ret == -ENOMEM) goto wait_for_memory; - - goto sendpage_end; + else if (ret != -EAGAIN) { + if (ret == -ENOSPC) + ret = 0; + goto sendpage_end; + } } } continue; @@ -594,35 +1122,35 @@ wait_for_sndbuf: wait_for_memory: ret = sk_stream_wait_memory(sk, &timeo); if (ret) { - trim_both_sgl(sk, ctx->sg_plaintext_size); + tls_trim_both_msgs(sk, msg_pl->sg.size); goto sendpage_end; } - if (tls_is_pending_closed_record(tls_ctx)) - goto push_record; - goto alloc_payload; } + if (num_async) { + /* Transmit if any encryptions have completed */ + if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { + cancel_delayed_work(&ctx->tx_work.work); + tls_tx_records(sk, flags); + } + } sendpage_end: - if (orig_size > size) - ret = orig_size - size; - else - ret = sk_stream_error(sk, flags, ret); - + ret = sk_stream_error(sk, flags, ret); release_sock(sk); - return ret; + return copied ? copied : ret; } -static struct sk_buff *tls_wait_data(struct sock *sk, int flags, - long timeo, int *err) +static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock, + int flags, long timeo, int *err) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct sk_buff *skb; DEFINE_WAIT_FUNC(wait, woken_wake_function); - while (!(skb = ctx->recv_pkt)) { + while (!(skb = ctx->recv_pkt) && sk_psock_queue_empty(psock)) { if (sk->sk_err) { *err = sock_error(sk); return NULL; @@ -641,7 +1169,10 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - sk_wait_event(sk, &timeo, ctx->recv_pkt != skb, &wait); + sk_wait_event(sk, &timeo, + ctx->recv_pkt != skb || + !sk_psock_queue_empty(psock), + &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); @@ -655,6 +1186,64 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, return skb; } +static int tls_setup_from_iter(struct sock *sk, struct iov_iter *from, + int length, int *pages_used, + unsigned int *size_used, + struct scatterlist *to, + int to_max_pages) +{ + int rc = 0, i = 0, num_elem = *pages_used, maxpages; + struct page *pages[MAX_SKB_FRAGS]; + unsigned int size = *size_used; + ssize_t copied, use; + size_t offset; + + while (length > 0) { + i = 0; + maxpages = to_max_pages - num_elem; + if (maxpages == 0) { + rc = -EFAULT; + goto out; + } + copied = iov_iter_get_pages(from, pages, + length, + maxpages, &offset); + if (copied <= 0) { + rc = -EFAULT; + goto out; + } + + iov_iter_advance(from, copied); + + length -= copied; + size += copied; + while (copied) { + use = min_t(int, copied, PAGE_SIZE - offset); + + sg_set_page(&to[num_elem], + pages[i], use, offset); + sg_unmark_end(&to[num_elem]); + /* We do not uncharge memory from this API */ + + offset = 0; + copied -= use; + + i++; + num_elem++; + } + } + /* Mark the end in the last sg entry if newly added */ + if (num_elem > *pages_used) + sg_mark_end(&to[num_elem - 1]); +out: + if (rc) + iov_iter_revert(from, size - *size_used); + *size_used = size; + *pages_used = num_elem; + + return rc; +} + /* This function decrypts the input skb into either out_iov or in out_sg * or in skb buffers itself. The input parameter 'zc' indicates if * zero-copy mode needs to be tried or not. With zero-copy mode, either @@ -684,12 +1273,14 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, n_sgout = iov_iter_npages(out_iov, INT_MAX) + 1; else n_sgout = sg_nents(out_sg); + n_sgin = skb_nsg(skb, rxm->offset + tls_ctx->rx.prepend_size, + rxm->full_len - tls_ctx->rx.prepend_size); } else { n_sgout = 0; *zc = false; + n_sgin = skb_cow_data(skb, 0, &unused); } - n_sgin = skb_cow_data(skb, 0, &unused); if (n_sgin < 1) return -EBADMSG; @@ -750,9 +1341,9 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, sg_set_buf(&sgout[0], aad, TLS_AAD_SPACE_SIZE); *chunk = 0; - err = zerocopy_from_iter(sk, out_iov, data_len, &pages, - chunk, &sgout[1], - (n_sgout - 1), false); + err = tls_setup_from_iter(sk, out_iov, data_len, + &pages, chunk, &sgout[1], + (n_sgout - 1)); if (err < 0) goto fallback_to_reg_recv; } else if (out_sg) { @@ -769,7 +1360,10 @@ fallback_to_reg_recv: } /* Prepare and submit AEAD request */ - err = tls_do_decryption(sk, sgin, sgout, iv, data_len, aead_req); + err = tls_do_decryption(sk, skb, sgin, sgout, iv, + data_len, aead_req, *zc); + if (err == -EINPROGRESS) + return err; /* Release the pages in case iov was mapped to pages */ for (; pages > 0; pages--) @@ -794,8 +1388,12 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, #endif if (!ctx->decrypted) { err = decrypt_internal(sk, skb, dest, NULL, chunk, zc); - if (err < 0) + if (err < 0) { + if (err == -EINPROGRESS) + tls_advance_record_sn(sk, &tls_ctx->rx); + return err; + } } else { *zc = false; } @@ -823,18 +1421,20 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - struct strp_msg *rxm = strp_msg(skb); - if (len < rxm->full_len) { - rxm->offset += len; - rxm->full_len -= len; + if (skb) { + struct strp_msg *rxm = strp_msg(skb); - return false; + if (len < rxm->full_len) { + rxm->offset += len; + rxm->full_len -= len; + return false; + } + kfree_skb(skb); } /* Finished with message */ ctx->recv_pkt = NULL; - kfree_skb(skb); __strp_unpause(&ctx->strp); return true; @@ -849,6 +1449,7 @@ int tls_sw_recvmsg(struct sock *sk, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct sk_psock *psock; unsigned char control; struct strp_msg *rxm; struct sk_buff *skb; @@ -856,26 +1457,41 @@ int tls_sw_recvmsg(struct sock *sk, bool cmsg = false; int target, err = 0; long timeo; - bool is_kvec = msg->msg_iter.type & ITER_KVEC; + bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); + int num_async = 0; flags |= nonblock; if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR); + psock = sk_psock_get(sk); lock_sock(sk); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { bool zc = false; + bool async = false; int chunk = 0; - skb = tls_wait_data(sk, flags, timeo, &err); - if (!skb) + skb = tls_wait_data(sk, psock, flags, timeo, &err); + if (!skb) { + if (psock) { + int ret = __tcp_bpf_recvmsg(sk, psock, + msg, len, flags); + + if (ret > 0) { + copied += ret; + len -= ret; + continue; + } + } goto recv_end; + } rxm = strp_msg(skb); + if (!cmsg) { int cerr; @@ -902,26 +1518,39 @@ int tls_sw_recvmsg(struct sock *sk, err = decrypt_skb_update(sk, skb, &msg->msg_iter, &chunk, &zc); - if (err < 0) { + if (err < 0 && err != -EINPROGRESS) { tls_err_abort(sk, EBADMSG); goto recv_end; } + + if (err == -EINPROGRESS) { + async = true; + num_async++; + goto pick_next_record; + } + ctx->decrypted = true; } if (!zc) { chunk = min_t(unsigned int, rxm->full_len, len); + err = skb_copy_datagram_msg(skb, rxm->offset, msg, chunk); if (err < 0) goto recv_end; } +pick_next_record: copied += chunk; len -= chunk; if (likely(!(flags & MSG_PEEK))) { u8 control = ctx->control; + /* For async, drop current skb reference */ + if (async) + skb = NULL; + if (tls_sw_advance_skb(sk, skb, chunk)) { /* Return full control message to * userspace before trying to parse @@ -930,6 +1559,8 @@ int tls_sw_recvmsg(struct sock *sk, msg->msg_flags |= MSG_EOR; if (control != TLS_RECORD_TYPE_DATA) goto recv_end; + } else { + break; } } else { /* MSG_PEEK right now cannot look beyond current skb @@ -946,7 +1577,25 @@ int tls_sw_recvmsg(struct sock *sk, } while (len); recv_end: + if (num_async) { + /* Wait for all previously submitted records to be decrypted */ + smp_store_mb(ctx->async_notify, true); + if (atomic_read(&ctx->decrypt_pending)) { + err = crypto_wait_req(-EINPROGRESS, &ctx->async_wait); + if (err) { + /* one of async decrypt failed */ + tls_err_abort(sk, err); + copied = 0; + } + } else { + reinit_completion(&ctx->async_wait.completion); + } + WRITE_ONCE(ctx->async_notify, false); + } + release_sock(sk); + if (psock) + sk_psock_put(sk, psock); return copied ? : err; } @@ -969,7 +1618,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - skb = tls_wait_data(sk, flags, timeo, &err); + skb = tls_wait_data(sk, NULL, flags, timeo, &err); if (!skb) goto splice_read_end; @@ -1003,23 +1652,20 @@ splice_read_end: return copied ? : err; } -unsigned int tls_sw_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait) +bool tls_sw_stream_read(const struct sock *sk) { - unsigned int ret; - struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + bool ingress_empty = true; + struct sk_psock *psock; - /* Grab POLLOUT and POLLHUP from the underlying socket */ - ret = ctx->sk_poll(file, sock, wait); - - /* Clear POLLIN bits, and set based on recv_pkt */ - ret &= ~(POLLIN | POLLRDNORM); - if (ctx->recv_pkt) - ret |= POLLIN | POLLRDNORM; + rcu_read_lock(); + psock = sk_psock(sk); + if (psock) + ingress_empty = list_empty(&psock->ingress_msg); + rcu_read_unlock(); - return ret; + return !ingress_empty || ctx->recv_pkt; } static int tls_read_size(struct strparser *strp, struct sk_buff *skb) @@ -1098,17 +1744,66 @@ static void tls_data_ready(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct sk_psock *psock; strp_data_ready(&ctx->strp); + + psock = sk_psock_get(sk); + if (psock && !list_empty(&psock->ingress_msg)) { + ctx->saved_data_ready(sk); + sk_psock_put(sk, psock); + } } void tls_sw_free_resources_tx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec, *tmp; + + /* Wait for any pending async encryptions to complete */ + smp_store_mb(ctx->async_notify, true); + if (atomic_read(&ctx->encrypt_pending)) + crypto_wait_req(-EINPROGRESS, &ctx->async_wait); + + cancel_delayed_work_sync(&ctx->tx_work.work); + + /* Tx whatever records we can transmit and abandon the rest */ + tls_tx_records(sk, -1); + + /* Free up un-sent records in tx_list. First, free + * the partially sent record if any at head of tx_list. + */ + if (tls_ctx->partially_sent_record) { + struct scatterlist *sg = tls_ctx->partially_sent_record; + + while (1) { + put_page(sg_page(sg)); + sk_mem_uncharge(sk, sg->length); + + if (sg_is_last(sg)) + break; + sg++; + } + + tls_ctx->partially_sent_record = NULL; + + rec = list_first_entry(&ctx->tx_list, + struct tls_rec, list); + list_del(&rec->list); + sk_msg_free(sk, &rec->msg_plaintext); + kfree(rec); + } + + list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) { + list_del(&rec->list); + sk_msg_free(sk, &rec->msg_encrypted); + sk_msg_free(sk, &rec->msg_plaintext); + kfree(rec); + } crypto_free_aead(ctx->aead_send); - tls_free_both_sg(sk); + tls_free_open_rec(sk); kfree(ctx); } @@ -1142,6 +1837,24 @@ void tls_sw_free_resources_rx(struct sock *sk) kfree(ctx); } +/* The work handler to transmitt the encrypted records in tx_list */ +static void tx_work_handler(struct work_struct *work) +{ + struct delayed_work *delayed_work = to_delayed_work(work); + struct tx_work *tx_work = container_of(delayed_work, + struct tx_work, work); + struct sock *sk = tx_work->sk; + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + + if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) + return; + + lock_sock(sk); + tls_tx_records(sk, -1); + release_sock(sk); +} + int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) { struct tls_crypto_info *crypto_info; @@ -1191,6 +1904,9 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) crypto_info = &ctx->crypto_send.info; cctx = &ctx->tx; aead = &sw_ctx_tx->aead_send; + INIT_LIST_HEAD(&sw_ctx_tx->tx_list); + INIT_DELAYED_WORK(&sw_ctx_tx->tx_work.work, tx_work_handler); + sw_ctx_tx->tx_work.sk = sk; } else { crypto_init_wait(&sw_ctx_rx->async_wait); crypto_info = &ctx->crypto_recv.info; @@ -1241,26 +1957,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) goto free_iv; } - if (sw_ctx_tx) { - sg_init_table(sw_ctx_tx->sg_encrypted_data, - ARRAY_SIZE(sw_ctx_tx->sg_encrypted_data)); - sg_init_table(sw_ctx_tx->sg_plaintext_data, - ARRAY_SIZE(sw_ctx_tx->sg_plaintext_data)); - - sg_init_table(sw_ctx_tx->sg_aead_in, 2); - sg_set_buf(&sw_ctx_tx->sg_aead_in[0], sw_ctx_tx->aad_space, - sizeof(sw_ctx_tx->aad_space)); - sg_unmark_end(&sw_ctx_tx->sg_aead_in[1]); - sg_chain(sw_ctx_tx->sg_aead_in, 2, - sw_ctx_tx->sg_plaintext_data); - sg_init_table(sw_ctx_tx->sg_aead_out, 2); - sg_set_buf(&sw_ctx_tx->sg_aead_out[0], sw_ctx_tx->aad_space, - sizeof(sw_ctx_tx->aad_space)); - sg_unmark_end(&sw_ctx_tx->sg_aead_out[1]); - sg_chain(sw_ctx_tx->sg_aead_out, 2, - sw_ctx_tx->sg_encrypted_data); - } - if (!*aead) { *aead = crypto_alloc_aead("gcm(aes)", 0, 0); if (IS_ERR(*aead)) { @@ -1294,8 +1990,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) sk->sk_data_ready = tls_data_ready; write_unlock_bh(&sk->sk_callback_lock); - sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll; - strp_check_rcv(&sw_ctx_rx->strp); } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d1edfa3cad61..74d1eed7cbd4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -225,6 +225,8 @@ static inline void unix_release_addr(struct unix_address *addr) static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp) { + *hashp = 0; + if (len <= sizeof(short) || len > sizeof(*sunaddr)) return -EINVAL; if (!sunaddr || sunaddr->sun_family != AF_UNIX) @@ -2640,7 +2642,7 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa struct sock *sk = sock->sk; __poll_t mask; - sock_poll_wait(file, wait); + sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ @@ -2677,7 +2679,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, unsigned int writable; __poll_t mask; - sock_poll_wait(file, wait); + sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ diff --git a/net/wireless/core.c b/net/wireless/core.c index a88551f3bc43..5bd01058b9e6 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1019,36 +1019,49 @@ void cfg80211_cqm_config_free(struct wireless_dev *wdev) wdev->cqm_config = NULL; } -void cfg80211_unregister_wdev(struct wireless_dev *wdev) +static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); ASSERT_RTNL(); - if (WARN_ON(wdev->netdev)) - return; - nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); list_del_rcu(&wdev->list); - synchronize_rcu(); + if (sync) + synchronize_rcu(); rdev->devlist_generation++; + cfg80211_mlme_purge_registrations(wdev); + switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: - cfg80211_mlme_purge_registrations(wdev); cfg80211_stop_p2p_device(rdev, wdev); break; case NL80211_IFTYPE_NAN: cfg80211_stop_nan(rdev, wdev); break; default: - WARN_ON_ONCE(1); break; } +#ifdef CONFIG_CFG80211_WEXT + kzfree(wdev->wext.keys); +#endif + /* only initialized if we have a netdev */ + if (wdev->netdev) + flush_work(&wdev->disconnect_wk); + cfg80211_cqm_config_free(wdev); } + +void cfg80211_unregister_wdev(struct wireless_dev *wdev) +{ + if (WARN_ON(wdev->netdev)) + return; + + __cfg80211_unregister_wdev(wdev, true); +} EXPORT_SYMBOL(cfg80211_unregister_wdev); static const struct device_type wiphy_type = { @@ -1153,6 +1166,30 @@ void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev, } EXPORT_SYMBOL(cfg80211_stop_iface); +void cfg80211_init_wdev(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ + mutex_init(&wdev->mtx); + INIT_LIST_HEAD(&wdev->event_list); + spin_lock_init(&wdev->event_lock); + INIT_LIST_HEAD(&wdev->mgmt_registrations); + spin_lock_init(&wdev->mgmt_registrations_lock); + + /* + * We get here also when the interface changes network namespaces, + * as it's registered into the new one, but we don't want it to + * change ID in that case. Checking if the ID is already assigned + * works, because 0 isn't considered a valid ID and the memory is + * 0-initialized. + */ + if (!wdev->identifier) + wdev->identifier = ++rdev->wdev_id; + list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list); + rdev->devlist_generation++; + + nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); +} + static int cfg80211_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) { @@ -1178,23 +1215,6 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, * called within code protected by it when interfaces * are added with nl80211. */ - mutex_init(&wdev->mtx); - INIT_LIST_HEAD(&wdev->event_list); - spin_lock_init(&wdev->event_lock); - INIT_LIST_HEAD(&wdev->mgmt_registrations); - spin_lock_init(&wdev->mgmt_registrations_lock); - - /* - * We get here also when the interface changes network namespaces, - * as it's registered into the new one, but we don't want it to - * change ID in that case. Checking if the ID is already assigned - * works, because 0 isn't considered a valid ID and the memory is - * 0-initialized. - */ - if (!wdev->identifier) - wdev->identifier = ++rdev->wdev_id; - list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list); - rdev->devlist_generation++; /* can only change netns with wiphy */ dev->features |= NETIF_F_NETNS_LOCAL; @@ -1223,7 +1243,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk); - nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); + cfg80211_init_wdev(rdev, wdev); break; case NETDEV_GOING_DOWN: cfg80211_leave(rdev, wdev); @@ -1238,7 +1258,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, list_for_each_entry_safe(pos, tmp, &rdev->sched_scan_req_list, list) { - if (WARN_ON(pos && pos->dev == wdev->netdev)) + if (WARN_ON(pos->dev == wdev->netdev)) cfg80211_stop_sched_scan_req(rdev, pos, false); } @@ -1302,17 +1322,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, * remove and clean it up. */ if (!list_empty(&wdev->list)) { - nl80211_notify_iface(rdev, wdev, - NL80211_CMD_DEL_INTERFACE); + __cfg80211_unregister_wdev(wdev, false); sysfs_remove_link(&dev->dev.kobj, "phy80211"); - list_del_rcu(&wdev->list); - rdev->devlist_generation++; - cfg80211_mlme_purge_registrations(wdev); -#ifdef CONFIG_CFG80211_WEXT - kzfree(wdev->wext.keys); -#endif - flush_work(&wdev->disconnect_wk); - cfg80211_cqm_config_free(wdev); } /* * synchronise (so that we won't find this netdev diff --git a/net/wireless/core.h b/net/wireless/core.h index 7f52ef569320..c61dbba8bf47 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -66,6 +66,7 @@ struct cfg80211_registered_device { /* protected by RTNL only */ int num_running_ifaces; int num_running_monitor_ifaces; + u64 cookie_counter; /* BSSes/scanning */ spinlock_t bss_lock; @@ -133,6 +134,16 @@ cfg80211_rdev_free_wowlan(struct cfg80211_registered_device *rdev) #endif } +static inline u64 cfg80211_assign_cookie(struct cfg80211_registered_device *rdev) +{ + u64 r = ++rdev->cookie_counter; + + if (WARN_ON(r == 0)) + r = ++rdev->cookie_counter; + + return r; +} + extern struct workqueue_struct *cfg80211_wq; extern struct list_head cfg80211_rdev_list; extern int cfg80211_rdev_list_generation; @@ -187,6 +198,9 @@ struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx); int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, struct net *net); +void cfg80211_init_wdev(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev); + static inline void wdev_lock(struct wireless_dev *wdev) __acquires(wdev) { diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c index e6bce1f130c9..b5e235573c8a 100644 --- a/net/wireless/lib80211_crypt_tkip.c +++ b/net/wireless/lib80211_crypt_tkip.c @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include @@ -64,9 +64,9 @@ struct lib80211_tkip_data { int key_idx; - struct crypto_skcipher *rx_tfm_arc4; + struct crypto_cipher *rx_tfm_arc4; struct crypto_shash *rx_tfm_michael; - struct crypto_skcipher *tx_tfm_arc4; + struct crypto_cipher *tx_tfm_arc4; struct crypto_shash *tx_tfm_michael; /* scratch buffers for virt_to_page() (crypto API) */ @@ -99,8 +99,7 @@ static void *lib80211_tkip_init(int key_idx) priv->key_idx = key_idx; - priv->tx_tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0, - CRYPTO_ALG_ASYNC); + priv->tx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(priv->tx_tfm_arc4)) { priv->tx_tfm_arc4 = NULL; goto fail; @@ -112,8 +111,7 @@ static void *lib80211_tkip_init(int key_idx) goto fail; } - priv->rx_tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0, - CRYPTO_ALG_ASYNC); + priv->rx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(priv->rx_tfm_arc4)) { priv->rx_tfm_arc4 = NULL; goto fail; @@ -130,9 +128,9 @@ static void *lib80211_tkip_init(int key_idx) fail: if (priv) { crypto_free_shash(priv->tx_tfm_michael); - crypto_free_skcipher(priv->tx_tfm_arc4); + crypto_free_cipher(priv->tx_tfm_arc4); crypto_free_shash(priv->rx_tfm_michael); - crypto_free_skcipher(priv->rx_tfm_arc4); + crypto_free_cipher(priv->rx_tfm_arc4); kfree(priv); } @@ -144,9 +142,9 @@ static void lib80211_tkip_deinit(void *priv) struct lib80211_tkip_data *_priv = priv; if (_priv) { crypto_free_shash(_priv->tx_tfm_michael); - crypto_free_skcipher(_priv->tx_tfm_arc4); + crypto_free_cipher(_priv->tx_tfm_arc4); crypto_free_shash(_priv->rx_tfm_michael); - crypto_free_skcipher(_priv->rx_tfm_arc4); + crypto_free_cipher(_priv->rx_tfm_arc4); } kfree(priv); } @@ -344,12 +342,10 @@ static int lib80211_tkip_hdr(struct sk_buff *skb, int hdr_len, static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv) { struct lib80211_tkip_data *tkey = priv; - SKCIPHER_REQUEST_ON_STACK(req, tkey->tx_tfm_arc4); int len; u8 rc4key[16], *pos, *icv; u32 crc; - struct scatterlist sg; - int err; + int i; if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; @@ -374,14 +370,10 @@ static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv) icv[2] = crc >> 16; icv[3] = crc >> 24; - crypto_skcipher_setkey(tkey->tx_tfm_arc4, rc4key, 16); - sg_init_one(&sg, pos, len + 4); - skcipher_request_set_tfm(req, tkey->tx_tfm_arc4); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg, &sg, len + 4, NULL); - err = crypto_skcipher_encrypt(req); - skcipher_request_zero(req); - return err; + crypto_cipher_setkey(tkey->tx_tfm_arc4, rc4key, 16); + for (i = 0; i < len + 4; i++) + crypto_cipher_encrypt_one(tkey->tx_tfm_arc4, pos + i, pos + i); + return 0; } /* @@ -400,7 +392,6 @@ static inline int tkip_replay_check(u32 iv32_n, u16 iv16_n, static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) { struct lib80211_tkip_data *tkey = priv; - SKCIPHER_REQUEST_ON_STACK(req, tkey->rx_tfm_arc4); u8 rc4key[16]; u8 keyidx, *pos; u32 iv32; @@ -408,9 +399,8 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) struct ieee80211_hdr *hdr; u8 icv[4]; u32 crc; - struct scatterlist sg; int plen; - int err; + int i; hdr = (struct ieee80211_hdr *)skb->data; @@ -463,18 +453,9 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) plen = skb->len - hdr_len - 12; - crypto_skcipher_setkey(tkey->rx_tfm_arc4, rc4key, 16); - sg_init_one(&sg, pos, plen + 4); - skcipher_request_set_tfm(req, tkey->rx_tfm_arc4); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg, &sg, plen + 4, NULL); - err = crypto_skcipher_decrypt(req); - skcipher_request_zero(req); - if (err) { - net_dbg_ratelimited("TKIP: failed to decrypt received packet from %pM\n", - hdr->addr2); - return -7; - } + crypto_cipher_setkey(tkey->rx_tfm_arc4, rc4key, 16); + for (i = 0; i < plen + 4; i++) + crypto_cipher_decrypt_one(tkey->rx_tfm_arc4, pos + i, pos + i); crc = ~crc32_le(~0, pos, plen); icv[0] = crc; @@ -660,9 +641,9 @@ static int lib80211_tkip_set_key(void *key, int len, u8 * seq, void *priv) struct lib80211_tkip_data *tkey = priv; int keyidx; struct crypto_shash *tfm = tkey->tx_tfm_michael; - struct crypto_skcipher *tfm2 = tkey->tx_tfm_arc4; + struct crypto_cipher *tfm2 = tkey->tx_tfm_arc4; struct crypto_shash *tfm3 = tkey->rx_tfm_michael; - struct crypto_skcipher *tfm4 = tkey->rx_tfm_arc4; + struct crypto_cipher *tfm4 = tkey->rx_tfm_arc4; keyidx = tkey->key_idx; memset(tkey, 0, sizeof(*tkey)); diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c index d05f58b0fd04..6015f6b542a6 100644 --- a/net/wireless/lib80211_crypt_wep.c +++ b/net/wireless/lib80211_crypt_wep.c @@ -22,7 +22,7 @@ #include -#include +#include #include MODULE_AUTHOR("Jouni Malinen"); @@ -35,8 +35,8 @@ struct lib80211_wep_data { u8 key[WEP_KEY_LEN + 1]; u8 key_len; u8 key_idx; - struct crypto_skcipher *tx_tfm; - struct crypto_skcipher *rx_tfm; + struct crypto_cipher *tx_tfm; + struct crypto_cipher *rx_tfm; }; static void *lib80211_wep_init(int keyidx) @@ -48,13 +48,13 @@ static void *lib80211_wep_init(int keyidx) goto fail; priv->key_idx = keyidx; - priv->tx_tfm = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); + priv->tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(priv->tx_tfm)) { priv->tx_tfm = NULL; goto fail; } - priv->rx_tfm = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); + priv->rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(priv->rx_tfm)) { priv->rx_tfm = NULL; goto fail; @@ -66,8 +66,8 @@ static void *lib80211_wep_init(int keyidx) fail: if (priv) { - crypto_free_skcipher(priv->tx_tfm); - crypto_free_skcipher(priv->rx_tfm); + crypto_free_cipher(priv->tx_tfm); + crypto_free_cipher(priv->rx_tfm); kfree(priv); } return NULL; @@ -77,8 +77,8 @@ static void lib80211_wep_deinit(void *priv) { struct lib80211_wep_data *_priv = priv; if (_priv) { - crypto_free_skcipher(_priv->tx_tfm); - crypto_free_skcipher(_priv->rx_tfm); + crypto_free_cipher(_priv->tx_tfm); + crypto_free_cipher(_priv->rx_tfm); } kfree(priv); } @@ -129,12 +129,10 @@ static int lib80211_wep_build_iv(struct sk_buff *skb, int hdr_len, static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) { struct lib80211_wep_data *wep = priv; - SKCIPHER_REQUEST_ON_STACK(req, wep->tx_tfm); u32 crc, klen, len; u8 *pos, *icv; - struct scatterlist sg; u8 key[WEP_KEY_LEN + 3]; - int err; + int i; /* other checks are in lib80211_wep_build_iv */ if (skb_tailroom(skb) < 4) @@ -162,14 +160,12 @@ static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) icv[2] = crc >> 16; icv[3] = crc >> 24; - crypto_skcipher_setkey(wep->tx_tfm, key, klen); - sg_init_one(&sg, pos, len + 4); - skcipher_request_set_tfm(req, wep->tx_tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg, &sg, len + 4, NULL); - err = crypto_skcipher_encrypt(req); - skcipher_request_zero(req); - return err; + crypto_cipher_setkey(wep->tx_tfm, key, klen); + + for (i = 0; i < len + 4; i++) + crypto_cipher_encrypt_one(wep->tx_tfm, pos + i, pos + i); + + return 0; } /* Perform WEP decryption on given buffer. Buffer includes whole WEP part of @@ -182,12 +178,10 @@ static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv) { struct lib80211_wep_data *wep = priv; - SKCIPHER_REQUEST_ON_STACK(req, wep->rx_tfm); u32 crc, klen, plen; u8 key[WEP_KEY_LEN + 3]; u8 keyidx, *pos, icv[4]; - struct scatterlist sg; - int err; + int i; if (skb->len < hdr_len + 8) return -1; @@ -208,15 +202,9 @@ static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv) /* Apply RC4 to data and compute CRC32 over decrypted data */ plen = skb->len - hdr_len - 8; - crypto_skcipher_setkey(wep->rx_tfm, key, klen); - sg_init_one(&sg, pos, plen + 4); - skcipher_request_set_tfm(req, wep->rx_tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg, &sg, plen + 4, NULL); - err = crypto_skcipher_decrypt(req); - skcipher_request_zero(req); - if (err) - return -7; + crypto_cipher_setkey(wep->rx_tfm, key, klen); + for (i = 0; i < plen + 4; i++) + crypto_cipher_decrypt_one(wep->rx_tfm, pos + i, pos + i); crc = ~crc32_le(~0, pos, plen); icv[0] = crc; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4b8ec659e797..744b5851bbf9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -200,7 +200,46 @@ cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info) return __cfg80211_rdev_from_attrs(netns, info->attrs); } +static int validate_ie_attr(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *pos; + int len; + + pos = nla_data(attr); + len = nla_len(attr); + + while (len) { + u8 elemlen; + + if (len < 2) + goto error; + len -= 2; + + elemlen = pos[1]; + if (elemlen > len) + goto error; + + len -= elemlen; + pos += 2 + elemlen; + } + + return 0; +error: + NL_SET_ERR_MSG_ATTR(extack, attr, "malformed information elements"); + return -EINVAL; +} + /* policy for the attributes */ +static const struct nla_policy +nl80211_ftm_responder_policy[NL80211_FTM_RESP_ATTR_MAX + 1] = { + [NL80211_FTM_RESP_ATTR_ENABLED] = { .type = NLA_FLAG, }, + [NL80211_FTM_RESP_ATTR_LCI] = { .type = NLA_BINARY, + .len = U8_MAX }, + [NL80211_FTM_RESP_ATTR_CIVICLOC] = { .type = NLA_BINARY, + .len = U8_MAX }, +}; + static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING, @@ -213,14 +252,14 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CENTER_FREQ1] = { .type = NLA_U32 }, [NL80211_ATTR_CENTER_FREQ2] = { .type = NLA_U32 }, - [NL80211_ATTR_WIPHY_RETRY_SHORT] = { .type = NLA_U8 }, - [NL80211_ATTR_WIPHY_RETRY_LONG] = { .type = NLA_U8 }, + [NL80211_ATTR_WIPHY_RETRY_SHORT] = NLA_POLICY_MIN(NLA_U8, 1), + [NL80211_ATTR_WIPHY_RETRY_LONG] = NLA_POLICY_MIN(NLA_U8, 1), [NL80211_ATTR_WIPHY_FRAG_THRESHOLD] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_RTS_THRESHOLD] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_COVERAGE_CLASS] = { .type = NLA_U8 }, [NL80211_ATTR_WIPHY_DYN_ACK] = { .type = NLA_FLAG }, - [NL80211_ATTR_IFTYPE] = { .type = NLA_U32 }, + [NL80211_ATTR_IFTYPE] = NLA_POLICY_MAX(NLA_U32, NL80211_IFTYPE_MAX), [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 }, [NL80211_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 }, @@ -230,24 +269,28 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_KEY] = { .type = NLA_NESTED, }, [NL80211_ATTR_KEY_DATA] = { .type = NLA_BINARY, .len = WLAN_MAX_KEY_LEN }, - [NL80211_ATTR_KEY_IDX] = { .type = NLA_U8 }, + [NL80211_ATTR_KEY_IDX] = NLA_POLICY_MAX(NLA_U8, 5), [NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 }, [NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG }, [NL80211_ATTR_KEY_SEQ] = { .type = NLA_BINARY, .len = 16 }, - [NL80211_ATTR_KEY_TYPE] = { .type = NLA_U32 }, + [NL80211_ATTR_KEY_TYPE] = + NLA_POLICY_MAX(NLA_U32, NUM_NL80211_KEYTYPES), [NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 }, [NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 }, [NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, - [NL80211_ATTR_BEACON_TAIL] = { .type = NLA_BINARY, - .len = IEEE80211_MAX_DATA_LEN }, - [NL80211_ATTR_STA_AID] = { .type = NLA_U16 }, + [NL80211_ATTR_BEACON_TAIL] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, + IEEE80211_MAX_DATA_LEN), + [NL80211_ATTR_STA_AID] = + NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID), [NL80211_ATTR_STA_FLAGS] = { .type = NLA_NESTED }, [NL80211_ATTR_STA_LISTEN_INTERVAL] = { .type = NLA_U16 }, [NL80211_ATTR_STA_SUPPORTED_RATES] = { .type = NLA_BINARY, .len = NL80211_MAX_SUPP_RATES }, - [NL80211_ATTR_STA_PLINK_ACTION] = { .type = NLA_U8 }, + [NL80211_ATTR_STA_PLINK_ACTION] = + NLA_POLICY_MAX(NLA_U8, NUM_NL80211_PLINK_ACTIONS - 1), [NL80211_ATTR_STA_VLAN] = { .type = NLA_U32 }, [NL80211_ATTR_MNTR_FLAGS] = { /* NLA_NESTED can't be empty */ }, [NL80211_ATTR_MESH_ID] = { .type = NLA_BINARY, @@ -270,8 +313,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_HT_CAPABILITY] = { .len = NL80211_HT_CAPABILITY_LEN }, [NL80211_ATTR_MGMT_SUBTYPE] = { .type = NLA_U8 }, - [NL80211_ATTR_IE] = { .type = NLA_BINARY, - .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_IE] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, + validate_ie_attr, + IEEE80211_MAX_DATA_LEN), [NL80211_ATTR_SCAN_FREQUENCIES] = { .type = NLA_NESTED }, [NL80211_ATTR_SCAN_SSIDS] = { .type = NLA_NESTED }, @@ -281,7 +325,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_REASON_CODE] = { .type = NLA_U16 }, [NL80211_ATTR_FREQ_FIXED] = { .type = NLA_FLAG }, [NL80211_ATTR_TIMED_OUT] = { .type = NLA_FLAG }, - [NL80211_ATTR_USE_MFP] = { .type = NLA_U32 }, + [NL80211_ATTR_USE_MFP] = NLA_POLICY_RANGE(NLA_U32, + NL80211_MFP_NO, + NL80211_MFP_OPTIONAL), [NL80211_ATTR_STA_FLAGS2] = { .len = sizeof(struct nl80211_sta_flag_update), }, @@ -301,7 +347,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_FRAME] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, [NL80211_ATTR_FRAME_MATCH] = { .type = NLA_BINARY, }, - [NL80211_ATTR_PS_STATE] = { .type = NLA_U32 }, + [NL80211_ATTR_PS_STATE] = NLA_POLICY_RANGE(NLA_U32, + NL80211_PS_DISABLED, + NL80211_PS_ENABLED), [NL80211_ATTR_CQM] = { .type = NLA_NESTED, }, [NL80211_ATTR_LOCAL_STATE_CHANGE] = { .type = NLA_FLAG }, [NL80211_ATTR_AP_ISOLATE] = { .type = NLA_U8 }, @@ -314,15 +362,23 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_OFFCHANNEL_TX_OK] = { .type = NLA_FLAG }, [NL80211_ATTR_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED }, [NL80211_ATTR_WOWLAN_TRIGGERS] = { .type = NLA_NESTED }, - [NL80211_ATTR_STA_PLINK_STATE] = { .type = NLA_U8 }, + [NL80211_ATTR_STA_PLINK_STATE] = + NLA_POLICY_MAX(NLA_U8, NUM_NL80211_PLINK_STATES - 1), + [NL80211_ATTR_MESH_PEER_AID] = + NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID), [NL80211_ATTR_SCHED_SCAN_INTERVAL] = { .type = NLA_U32 }, [NL80211_ATTR_REKEY_DATA] = { .type = NLA_NESTED }, [NL80211_ATTR_SCAN_SUPP_RATES] = { .type = NLA_NESTED }, - [NL80211_ATTR_HIDDEN_SSID] = { .type = NLA_U32 }, - [NL80211_ATTR_IE_PROBE_RESP] = { .type = NLA_BINARY, - .len = IEEE80211_MAX_DATA_LEN }, - [NL80211_ATTR_IE_ASSOC_RESP] = { .type = NLA_BINARY, - .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_HIDDEN_SSID] = + NLA_POLICY_RANGE(NLA_U32, + NL80211_HIDDEN_SSID_NOT_IN_USE, + NL80211_HIDDEN_SSID_ZERO_CONTENTS), + [NL80211_ATTR_IE_PROBE_RESP] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, + IEEE80211_MAX_DATA_LEN), + [NL80211_ATTR_IE_ASSOC_RESP] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, + IEEE80211_MAX_DATA_LEN), [NL80211_ATTR_ROAM_SUPPORT] = { .type = NLA_FLAG }, [NL80211_ATTR_SCHED_SCAN_MATCH] = { .type = NLA_NESTED }, [NL80211_ATTR_TX_NO_CCK_RATE] = { .type = NLA_FLAG }, @@ -348,9 +404,12 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_AUTH_DATA] = { .type = NLA_BINARY, }, [NL80211_ATTR_VHT_CAPABILITY] = { .len = NL80211_VHT_CAPABILITY_LEN }, [NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 }, - [NL80211_ATTR_P2P_CTWINDOW] = { .type = NLA_U8 }, - [NL80211_ATTR_P2P_OPPPS] = { .type = NLA_U8 }, - [NL80211_ATTR_LOCAL_MESH_POWER_MODE] = {. type = NLA_U32 }, + [NL80211_ATTR_P2P_CTWINDOW] = NLA_POLICY_MAX(NLA_U8, 127), + [NL80211_ATTR_P2P_OPPPS] = NLA_POLICY_MAX(NLA_U8, 1), + [NL80211_ATTR_LOCAL_MESH_POWER_MODE] = + NLA_POLICY_RANGE(NLA_U32, + NL80211_MESH_POWER_UNKNOWN + 1, + NL80211_MESH_POWER_MAX), [NL80211_ATTR_ACL_POLICY] = {. type = NLA_U32 }, [NL80211_ATTR_MAC_ADDRS] = { .type = NLA_NESTED }, [NL80211_ATTR_STA_CAPABILITY] = { .type = NLA_U16 }, @@ -363,7 +422,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MDID] = { .type = NLA_U16 }, [NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, - [NL80211_ATTR_PEER_AID] = { .type = NLA_U16 }, + [NL80211_ATTR_PEER_AID] = + NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID), [NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 }, [NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG }, [NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED }, @@ -384,8 +444,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_SOCKET_OWNER] = { .type = NLA_FLAG }, [NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY }, [NL80211_ATTR_USE_RRM] = { .type = NLA_FLAG }, - [NL80211_ATTR_TSID] = { .type = NLA_U8 }, - [NL80211_ATTR_USER_PRIO] = { .type = NLA_U8 }, + [NL80211_ATTR_TSID] = NLA_POLICY_MAX(NLA_U8, IEEE80211_NUM_TIDS - 1), + [NL80211_ATTR_USER_PRIO] = + NLA_POLICY_MAX(NLA_U8, IEEE80211_NUM_UPS - 1), [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 }, [NL80211_ATTR_MAC_MASK] = { .len = ETH_ALEN }, @@ -395,12 +456,13 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG }, [NL80211_ATTR_PBSS] = { .type = NLA_FLAG }, [NL80211_ATTR_BSS_SELECT] = { .type = NLA_NESTED }, - [NL80211_ATTR_STA_SUPPORT_P2P_PS] = { .type = NLA_U8 }, + [NL80211_ATTR_STA_SUPPORT_P2P_PS] = + NLA_POLICY_MAX(NLA_U8, NUM_NL80211_P2P_PS_STATUS - 1), [NL80211_ATTR_MU_MIMO_GROUP_DATA] = { .len = VHT_MUMIMO_GROUPS_DATA_LEN }, [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN }, - [NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 }, + [NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1), [NL80211_ATTR_BANDS] = { .type = NLA_U32 }, [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, @@ -430,6 +492,11 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 }, [NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY, .len = NL80211_HE_MAX_CAPABILITY_LEN }, + + [NL80211_ATTR_FTM_RESPONDER] = { + .type = NLA_NESTED, + .validation_data = nl80211_ftm_responder_policy, + }, }; /* policy for the key attributes */ @@ -440,7 +507,7 @@ static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = { [NL80211_KEY_SEQ] = { .type = NLA_BINARY, .len = 16 }, [NL80211_KEY_DEFAULT] = { .type = NLA_FLAG }, [NL80211_KEY_DEFAULT_MGMT] = { .type = NLA_FLAG }, - [NL80211_KEY_TYPE] = { .type = NLA_U32 }, + [NL80211_KEY_TYPE] = NLA_POLICY_MAX(NLA_U32, NUM_NL80211_KEYTYPES - 1), [NL80211_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED }, }; @@ -491,7 +558,10 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = { static const struct nla_policy nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = { [NL80211_ATTR_COALESCE_RULE_DELAY] = { .type = NLA_U32 }, - [NL80211_ATTR_COALESCE_RULE_CONDITION] = { .type = NLA_U32 }, + [NL80211_ATTR_COALESCE_RULE_CONDITION] = + NLA_POLICY_RANGE(NLA_U32, + NL80211_COALESCE_CONDITION_MATCH, + NL80211_COALESCE_CONDITION_NO_MATCH), [NL80211_ATTR_COALESCE_RULE_PKT_PATTERN] = { .type = NLA_NESTED }, }; @@ -567,8 +637,7 @@ nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = { [NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 }, }; -static int nl80211_prepare_wdev_dump(struct sk_buff *skb, - struct netlink_callback *cb, +static int nl80211_prepare_wdev_dump(struct netlink_callback *cb, struct cfg80211_registered_device **rdev, struct wireless_dev **wdev) { @@ -582,7 +651,7 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb, return err; *wdev = __cfg80211_wdev_from_attrs( - sock_net(skb->sk), + sock_net(cb->skb->sk), genl_family_attrbuf(&nl80211_fam)); if (IS_ERR(*wdev)) return PTR_ERR(*wdev); @@ -614,36 +683,6 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb, return 0; } -/* IE validation */ -static bool is_valid_ie_attr(const struct nlattr *attr) -{ - const u8 *pos; - int len; - - if (!attr) - return true; - - pos = nla_data(attr); - len = nla_len(attr); - - while (len) { - u8 elemlen; - - if (len < 2) - return false; - len -= 2; - - elemlen = pos[1]; - if (elemlen > len) - return false; - - len -= elemlen; - pos += 2 + elemlen; - } - - return true; -} - /* message building helper */ static inline void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd) @@ -858,12 +897,8 @@ static int nl80211_parse_key_new(struct genl_info *info, struct nlattr *key, if (tb[NL80211_KEY_CIPHER]) k->p.cipher = nla_get_u32(tb[NL80211_KEY_CIPHER]); - if (tb[NL80211_KEY_TYPE]) { + if (tb[NL80211_KEY_TYPE]) k->type = nla_get_u32(tb[NL80211_KEY_TYPE]); - if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES) - return genl_err_attr(info, -EINVAL, - tb[NL80211_KEY_TYPE]); - } if (tb[NL80211_KEY_DEFAULT_TYPES]) { struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES]; @@ -910,13 +945,8 @@ static int nl80211_parse_key_old(struct genl_info *info, struct key_parse *k) if (k->defmgmt) k->def_multi = true; - if (info->attrs[NL80211_ATTR_KEY_TYPE]) { + if (info->attrs[NL80211_ATTR_KEY_TYPE]) k->type = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]); - if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES) { - GENL_SET_ERR_MSG(info, "key type out of range"); - return -EINVAL; - } - } if (info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES]) { struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES]; @@ -2292,12 +2322,14 @@ static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, struct genl_info *info, struct cfg80211_chan_def *chandef) { + struct netlink_ext_ack *extack = info->extack; + struct nlattr **attrs = info->attrs; u32 control_freq; - if (!info->attrs[NL80211_ATTR_WIPHY_FREQ]) + if (!attrs[NL80211_ATTR_WIPHY_FREQ]) return -EINVAL; - control_freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]); + control_freq = nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]); chandef->chan = ieee80211_get_channel(&rdev->wiphy, control_freq); chandef->width = NL80211_CHAN_WIDTH_20_NOHT; @@ -2305,14 +2337,16 @@ static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, chandef->center_freq2 = 0; /* Primary channel not allowed */ - if (!chandef->chan || chandef->chan->flags & IEEE80211_CHAN_DISABLED) + if (!chandef->chan || chandef->chan->flags & IEEE80211_CHAN_DISABLED) { + NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ], + "Channel is disabled"); return -EINVAL; + } - if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { + if (attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { enum nl80211_channel_type chantype; - chantype = nla_get_u32( - info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]); + chantype = nla_get_u32(attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]); switch (chantype) { case NL80211_CHAN_NO_HT: @@ -2322,42 +2356,56 @@ static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, cfg80211_chandef_create(chandef, chandef->chan, chantype); /* user input for center_freq is incorrect */ - if (info->attrs[NL80211_ATTR_CENTER_FREQ1] && - chandef->center_freq1 != nla_get_u32( - info->attrs[NL80211_ATTR_CENTER_FREQ1])) + if (attrs[NL80211_ATTR_CENTER_FREQ1] && + chandef->center_freq1 != nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1])) { + NL_SET_ERR_MSG_ATTR(extack, + attrs[NL80211_ATTR_CENTER_FREQ1], + "bad center frequency 1"); return -EINVAL; + } /* center_freq2 must be zero */ - if (info->attrs[NL80211_ATTR_CENTER_FREQ2] && - nla_get_u32(info->attrs[NL80211_ATTR_CENTER_FREQ2])) + if (attrs[NL80211_ATTR_CENTER_FREQ2] && + nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2])) { + NL_SET_ERR_MSG_ATTR(extack, + attrs[NL80211_ATTR_CENTER_FREQ2], + "center frequency 2 can't be used"); return -EINVAL; + } break; default: + NL_SET_ERR_MSG_ATTR(extack, + attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE], + "invalid channel type"); return -EINVAL; } - } else if (info->attrs[NL80211_ATTR_CHANNEL_WIDTH]) { + } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) { chandef->width = - nla_get_u32(info->attrs[NL80211_ATTR_CHANNEL_WIDTH]); - if (info->attrs[NL80211_ATTR_CENTER_FREQ1]) + nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); + if (attrs[NL80211_ATTR_CENTER_FREQ1]) chandef->center_freq1 = - nla_get_u32( - info->attrs[NL80211_ATTR_CENTER_FREQ1]); - if (info->attrs[NL80211_ATTR_CENTER_FREQ2]) + nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); + if (attrs[NL80211_ATTR_CENTER_FREQ2]) chandef->center_freq2 = - nla_get_u32( - info->attrs[NL80211_ATTR_CENTER_FREQ2]); + nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]); } - if (!cfg80211_chandef_valid(chandef)) + if (!cfg80211_chandef_valid(chandef)) { + NL_SET_ERR_MSG(extack, "invalid channel definition"); return -EINVAL; + } if (!cfg80211_chandef_usable(&rdev->wiphy, chandef, - IEEE80211_CHAN_DISABLED)) + IEEE80211_CHAN_DISABLED)) { + NL_SET_ERR_MSG(extack, "(extension) channel is disabled"); return -EINVAL; + } if ((chandef->width == NL80211_CHAN_WIDTH_5 || chandef->width == NL80211_CHAN_WIDTH_10) && - !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ)) + !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ)) { + NL_SET_ERR_MSG(extack, "5/10 MHz not supported"); return -EINVAL; + } return 0; } @@ -2617,8 +2665,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]) { retry_short = nla_get_u8( info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]); - if (retry_short == 0) - return -EINVAL; changed |= WIPHY_PARAM_RETRY_SHORT; } @@ -2626,8 +2672,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]) { retry_long = nla_get_u8( info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]); - if (retry_long == 0) - return -EINVAL; changed |= WIPHY_PARAM_RETRY_LONG; } @@ -3119,8 +3163,6 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) ntype = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]); if (otype != ntype) change = true; - if (ntype > NL80211_IFTYPE_MAX) - return -EINVAL; } if (info->attrs[NL80211_ATTR_MESH_ID]) { @@ -3185,11 +3227,8 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL80211_ATTR_IFNAME]) return -EINVAL; - if (info->attrs[NL80211_ATTR_IFTYPE]) { + if (info->attrs[NL80211_ATTR_IFTYPE]) type = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]); - if (type > NL80211_IFTYPE_MAX) - return -EINVAL; - } if (!rdev->ops->add_virtual_intf || !(rdev->wiphy.interface_modes & (1 << type))) @@ -3252,15 +3291,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) * P2P Device and NAN do not have a netdev, so don't go * through the netdev notifier and must be added here */ - mutex_init(&wdev->mtx); - INIT_LIST_HEAD(&wdev->event_list); - spin_lock_init(&wdev->event_lock); - INIT_LIST_HEAD(&wdev->mgmt_registrations); - spin_lock_init(&wdev->mgmt_registrations_lock); - - wdev->identifier = ++rdev->wdev_id; - list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list); - rdev->devlist_generation++; + cfg80211_init_wdev(rdev, wdev); break; default: break; @@ -3272,15 +3303,6 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) return -ENOBUFS; } - /* - * For wdevs which have no associated netdev object (e.g. of type - * NL80211_IFTYPE_P2P_DEVICE), emit the NEW_INTERFACE event here. - * For all other types, the event will be generated from the - * netdev notifier - */ - if (!wdev->netdev) - nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); - return genlmsg_reply(msg, info); } @@ -3359,7 +3381,7 @@ static void get_key_callback(void *c, struct key_params *params) params->cipher))) goto nla_put_failure; - if (nla_put_u8(cookie->msg, NL80211_ATTR_KEY_IDX, cookie->idx)) + if (nla_put_u8(cookie->msg, NL80211_KEY_IDX, cookie->idx)) goto nla_put_failure; nla_nest_end(cookie->msg, key); @@ -3386,9 +3408,6 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_KEY_IDX]) key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); - if (key_idx > 5) - return -EINVAL; - if (info->attrs[NL80211_ATTR_MAC]) mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); @@ -3396,8 +3415,6 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_KEY_TYPE]) { u32 kt = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]); - if (kt >= NUM_NL80211_KEYTYPES) - return -EINVAL; if (kt != NL80211_KEYTYPE_GROUP && kt != NL80211_KEYTYPE_PAIRWISE) return -EINVAL; @@ -3756,6 +3773,7 @@ static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband, return false; /* check availability */ + ridx = array_index_nospec(ridx, IEEE80211_HT_MCS_MASK_LEN); if (sband->ht_cap.mcs.rx_mask[ridx] & rbit) mcs[ridx] |= rbit; else @@ -3997,16 +4015,12 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, return 0; } -static int nl80211_parse_beacon(struct nlattr *attrs[], +static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev, + struct nlattr *attrs[], struct cfg80211_beacon_data *bcn) { bool haveinfo = false; - - if (!is_valid_ie_attr(attrs[NL80211_ATTR_BEACON_TAIL]) || - !is_valid_ie_attr(attrs[NL80211_ATTR_IE]) || - !is_valid_ie_attr(attrs[NL80211_ATTR_IE_PROBE_RESP]) || - !is_valid_ie_attr(attrs[NL80211_ATTR_IE_ASSOC_RESP])) - return -EINVAL; + int err; memset(bcn, 0, sizeof(*bcn)); @@ -4051,6 +4065,35 @@ static int nl80211_parse_beacon(struct nlattr *attrs[], bcn->probe_resp_len = nla_len(attrs[NL80211_ATTR_PROBE_RESP]); } + if (attrs[NL80211_ATTR_FTM_RESPONDER]) { + struct nlattr *tb[NL80211_FTM_RESP_ATTR_MAX + 1]; + + err = nla_parse_nested(tb, NL80211_FTM_RESP_ATTR_MAX, + attrs[NL80211_ATTR_FTM_RESPONDER], + NULL, NULL); + if (err) + return err; + + if (tb[NL80211_FTM_RESP_ATTR_ENABLED] && + wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER)) + bcn->ftm_responder = 1; + else + return -EOPNOTSUPP; + + if (tb[NL80211_FTM_RESP_ATTR_LCI]) { + bcn->lci = nla_data(tb[NL80211_FTM_RESP_ATTR_LCI]); + bcn->lci_len = nla_len(tb[NL80211_FTM_RESP_ATTR_LCI]); + } + + if (tb[NL80211_FTM_RESP_ATTR_CIVICLOC]) { + bcn->civicloc = nla_data(tb[NL80211_FTM_RESP_ATTR_CIVICLOC]); + bcn->civicloc_len = nla_len(tb[NL80211_FTM_RESP_ATTR_CIVICLOC]); + } + } else { + bcn->ftm_responder = -1; + } + return 0; } @@ -4095,6 +4138,9 @@ static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params) cap = cfg80211_find_ie(WLAN_EID_VHT_CAPABILITY, ies, ies_len); if (cap && cap[1] >= sizeof(*params->vht_cap)) params->vht_cap = (void *)(cap + 2); + cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_CAPABILITY, ies, ies_len); + if (cap && cap[1] >= sizeof(*params->he_cap) + 1) + params->he_cap = (void *)(cap + 3); } static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev, @@ -4194,7 +4240,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) !info->attrs[NL80211_ATTR_BEACON_HEAD]) return -EINVAL; - err = nl80211_parse_beacon(info->attrs, ¶ms.beacon); + err = nl80211_parse_beacon(rdev, info->attrs, ¶ms.beacon); if (err) return err; @@ -4224,14 +4270,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - if (info->attrs[NL80211_ATTR_HIDDEN_SSID]) { + if (info->attrs[NL80211_ATTR_HIDDEN_SSID]) params.hidden_ssid = nla_get_u32( info->attrs[NL80211_ATTR_HIDDEN_SSID]); - if (params.hidden_ssid != NL80211_HIDDEN_SSID_NOT_IN_USE && - params.hidden_ssid != NL80211_HIDDEN_SSID_ZERO_LEN && - params.hidden_ssid != NL80211_HIDDEN_SSID_ZERO_CONTENTS) - return -EINVAL; - } params.privacy = !!info->attrs[NL80211_ATTR_PRIVACY]; @@ -4261,8 +4302,6 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return -EINVAL; params.p2p_ctwindow = nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]); - if (params.p2p_ctwindow > 127) - return -EINVAL; if (params.p2p_ctwindow != 0 && !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) return -EINVAL; @@ -4274,8 +4313,6 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EINVAL; tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); - if (tmp > 1) - return -EINVAL; params.p2p_opp_ps = tmp; if (params.p2p_opp_ps != 0 && !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) @@ -4378,7 +4415,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info) if (!wdev->beacon_interval) return -EINVAL; - err = nl80211_parse_beacon(info->attrs, ¶ms); + err = nl80211_parse_beacon(rdev, info->attrs, ¶ms); if (err) return err; @@ -4724,10 +4761,13 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc); PUT_SINFO_U64(BEACON_RX, rx_beacon); PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8); - PUT_SINFO(ACK_SIGNAL, ack_signal, u8); + PUT_SINFO(RX_MPDUS, rx_mpdu_count, u32); + PUT_SINFO(FCS_ERROR_COUNT, fcs_err_count, u32); if (wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT)) - PUT_SINFO(DATA_ACK_SIGNAL_AVG, avg_ack_signal, s8); + NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT)) { + PUT_SINFO(ACK_SIGNAL, ack_signal, u8); + PUT_SINFO(ACK_SIGNAL_AVG, avg_ack_signal, s8); + } #undef PUT_SINFO #undef PUT_SINFO_U64 @@ -4806,7 +4846,7 @@ static int nl80211_dump_station(struct sk_buff *skb, int err; rtnl_lock(); - err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); + err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); if (err) goto out_err; @@ -5211,17 +5251,11 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) else params.listen_interval = -1; - if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) { - u8 tmp; - - tmp = nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]); - if (tmp >= NUM_NL80211_P2P_PS_STATUS) - return -EINVAL; - - params.support_p2p_ps = tmp; - } else { + if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) + params.support_p2p_ps = + nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]); + else params.support_p2p_ps = -1; - } if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; @@ -5251,38 +5285,23 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) if (parse_station_flags(info, dev->ieee80211_ptr->iftype, ¶ms)) return -EINVAL; - if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) { + if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) params.plink_action = nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]); - if (params.plink_action >= NUM_NL80211_PLINK_ACTIONS) - return -EINVAL; - } if (info->attrs[NL80211_ATTR_STA_PLINK_STATE]) { params.plink_state = nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_STATE]); - if (params.plink_state >= NUM_NL80211_PLINK_STATES) - return -EINVAL; - if (info->attrs[NL80211_ATTR_MESH_PEER_AID]) { + if (info->attrs[NL80211_ATTR_MESH_PEER_AID]) params.peer_aid = nla_get_u16( info->attrs[NL80211_ATTR_MESH_PEER_AID]); - if (params.peer_aid > IEEE80211_MAX_AID) - return -EINVAL; - } params.sta_modify_mask |= STATION_PARAM_APPLY_PLINK_STATE; } - if (info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE]) { - enum nl80211_mesh_power_mode pm = nla_get_u32( + if (info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE]) + params.local_pm = nla_get_u32( info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE]); - if (pm <= NL80211_MESH_POWER_UNKNOWN || - pm > NL80211_MESH_POWER_MAX) - return -EINVAL; - - params.local_pm = pm; - } - if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { params.opmode_notif_used = true; params.opmode_notif = @@ -5359,13 +5378,8 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) { - u8 tmp; - - tmp = nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]); - if (tmp >= NUM_NL80211_P2P_PS_STATUS) - return -EINVAL; - - params.support_p2p_ps = tmp; + params.support_p2p_ps = + nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]); } else { /* * if not specified, assume it's supported for P2P GO interface, @@ -5379,8 +5393,6 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]); else params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]); - if (!params.aid || params.aid > IEEE80211_MAX_AID) - return -EINVAL; if (info->attrs[NL80211_ATTR_STA_CAPABILITY]) { params.capability = @@ -5420,12 +5432,9 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]); } - if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) { + if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) params.plink_action = nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]); - if (params.plink_action >= NUM_NL80211_PLINK_ACTIONS) - return -EINVAL; - } err = nl80211_parse_sta_channel_info(info, ¶ms); if (err) @@ -5657,7 +5666,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb, int err; rtnl_lock(); - err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); + err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); if (err) goto out_err; @@ -5853,7 +5862,7 @@ static int nl80211_dump_mpp(struct sk_buff *skb, int err; rtnl_lock(); - err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); + err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); if (err) goto out_err; @@ -5935,9 +5944,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EINVAL; params.p2p_ctwindow = - nla_get_s8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]); - if (params.p2p_ctwindow < 0) - return -EINVAL; + nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]); if (params.p2p_ctwindow != 0 && !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) return -EINVAL; @@ -5949,8 +5956,6 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EINVAL; tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); - if (tmp > 1) - return -EINVAL; params.p2p_opp_ps = tmp; if (params.p2p_opp_ps && !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) @@ -6129,33 +6134,49 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, return -ENOBUFS; } -static const struct nla_policy nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = { - [NL80211_MESHCONF_RETRY_TIMEOUT] = { .type = NLA_U16 }, - [NL80211_MESHCONF_CONFIRM_TIMEOUT] = { .type = NLA_U16 }, - [NL80211_MESHCONF_HOLDING_TIMEOUT] = { .type = NLA_U16 }, - [NL80211_MESHCONF_MAX_PEER_LINKS] = { .type = NLA_U16 }, - [NL80211_MESHCONF_MAX_RETRIES] = { .type = NLA_U8 }, - [NL80211_MESHCONF_TTL] = { .type = NLA_U8 }, - [NL80211_MESHCONF_ELEMENT_TTL] = { .type = NLA_U8 }, - [NL80211_MESHCONF_AUTO_OPEN_PLINKS] = { .type = NLA_U8 }, - [NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR] = { .type = NLA_U32 }, +static const struct nla_policy +nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = { + [NL80211_MESHCONF_RETRY_TIMEOUT] = + NLA_POLICY_RANGE(NLA_U16, 1, 255), + [NL80211_MESHCONF_CONFIRM_TIMEOUT] = + NLA_POLICY_RANGE(NLA_U16, 1, 255), + [NL80211_MESHCONF_HOLDING_TIMEOUT] = + NLA_POLICY_RANGE(NLA_U16, 1, 255), + [NL80211_MESHCONF_MAX_PEER_LINKS] = + NLA_POLICY_RANGE(NLA_U16, 0, 255), + [NL80211_MESHCONF_MAX_RETRIES] = NLA_POLICY_MAX(NLA_U8, 16), + [NL80211_MESHCONF_TTL] = NLA_POLICY_MIN(NLA_U8, 1), + [NL80211_MESHCONF_ELEMENT_TTL] = NLA_POLICY_MIN(NLA_U8, 1), + [NL80211_MESHCONF_AUTO_OPEN_PLINKS] = NLA_POLICY_MAX(NLA_U8, 1), + [NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR] = + NLA_POLICY_RANGE(NLA_U32, 1, 255), [NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES] = { .type = NLA_U8 }, [NL80211_MESHCONF_PATH_REFRESH_TIME] = { .type = NLA_U32 }, - [NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT] = { .type = NLA_U16 }, + [NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT] = NLA_POLICY_MIN(NLA_U16, 1), [NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT] = { .type = NLA_U32 }, - [NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL] = { .type = NLA_U16 }, - [NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL] = { .type = NLA_U16 }, - [NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME] = { .type = NLA_U16 }, - [NL80211_MESHCONF_HWMP_ROOTMODE] = { .type = NLA_U8 }, - [NL80211_MESHCONF_HWMP_RANN_INTERVAL] = { .type = NLA_U16 }, - [NL80211_MESHCONF_GATE_ANNOUNCEMENTS] = { .type = NLA_U8 }, - [NL80211_MESHCONF_FORWARDING] = { .type = NLA_U8 }, - [NL80211_MESHCONF_RSSI_THRESHOLD] = { .type = NLA_U32 }, + [NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL] = + NLA_POLICY_MIN(NLA_U16, 1), + [NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL] = + NLA_POLICY_MIN(NLA_U16, 1), + [NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME] = + NLA_POLICY_MIN(NLA_U16, 1), + [NL80211_MESHCONF_HWMP_ROOTMODE] = NLA_POLICY_MAX(NLA_U8, 4), + [NL80211_MESHCONF_HWMP_RANN_INTERVAL] = + NLA_POLICY_MIN(NLA_U16, 1), + [NL80211_MESHCONF_GATE_ANNOUNCEMENTS] = NLA_POLICY_MAX(NLA_U8, 1), + [NL80211_MESHCONF_FORWARDING] = NLA_POLICY_MAX(NLA_U8, 1), + [NL80211_MESHCONF_RSSI_THRESHOLD] = + NLA_POLICY_RANGE(NLA_S32, -255, 0), [NL80211_MESHCONF_HT_OPMODE] = { .type = NLA_U16 }, [NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT] = { .type = NLA_U32 }, - [NL80211_MESHCONF_HWMP_ROOT_INTERVAL] = { .type = NLA_U16 }, - [NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL] = { .type = NLA_U16 }, - [NL80211_MESHCONF_POWER_MODE] = { .type = NLA_U32 }, + [NL80211_MESHCONF_HWMP_ROOT_INTERVAL] = + NLA_POLICY_MIN(NLA_U16, 1), + [NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL] = + NLA_POLICY_MIN(NLA_U16, 1), + [NL80211_MESHCONF_POWER_MODE] = + NLA_POLICY_RANGE(NLA_U32, + NL80211_MESH_POWER_ACTIVE, + NL80211_MESH_POWER_MAX), [NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 }, [NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 }, }; @@ -6168,68 +6189,12 @@ static const struct nla_policy [NL80211_MESH_SETUP_USERSPACE_AUTH] = { .type = NLA_FLAG }, [NL80211_MESH_SETUP_AUTH_PROTOCOL] = { .type = NLA_U8 }, [NL80211_MESH_SETUP_USERSPACE_MPM] = { .type = NLA_FLAG }, - [NL80211_MESH_SETUP_IE] = { .type = NLA_BINARY, - .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_MESH_SETUP_IE] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, + IEEE80211_MAX_DATA_LEN), [NL80211_MESH_SETUP_USERSPACE_AMPE] = { .type = NLA_FLAG }, }; -static int nl80211_check_bool(const struct nlattr *nla, u8 min, u8 max, bool *out) -{ - u8 val = nla_get_u8(nla); - if (val < min || val > max) - return -EINVAL; - *out = val; - return 0; -} - -static int nl80211_check_u8(const struct nlattr *nla, u8 min, u8 max, u8 *out) -{ - u8 val = nla_get_u8(nla); - if (val < min || val > max) - return -EINVAL; - *out = val; - return 0; -} - -static int nl80211_check_u16(const struct nlattr *nla, u16 min, u16 max, u16 *out) -{ - u16 val = nla_get_u16(nla); - if (val < min || val > max) - return -EINVAL; - *out = val; - return 0; -} - -static int nl80211_check_u32(const struct nlattr *nla, u32 min, u32 max, u32 *out) -{ - u32 val = nla_get_u32(nla); - if (val < min || val > max) - return -EINVAL; - *out = val; - return 0; -} - -static int nl80211_check_s32(const struct nlattr *nla, s32 min, s32 max, s32 *out) -{ - s32 val = nla_get_s32(nla); - if (val < min || val > max) - return -EINVAL; - *out = val; - return 0; -} - -static int nl80211_check_power_mode(const struct nlattr *nla, - enum nl80211_mesh_power_mode min, - enum nl80211_mesh_power_mode max, - enum nl80211_mesh_power_mode *out) -{ - u32 val = nla_get_u32(nla); - if (val < min || val > max) - return -EINVAL; - *out = val; - return 0; -} - static int nl80211_parse_mesh_config(struct genl_info *info, struct mesh_config *cfg, u32 *mask_out) @@ -6238,13 +6203,12 @@ static int nl80211_parse_mesh_config(struct genl_info *info, u32 mask = 0; u16 ht_opmode; -#define FILL_IN_MESH_PARAM_IF_SET(tb, cfg, param, min, max, mask, attr, fn) \ -do { \ - if (tb[attr]) { \ - if (fn(tb[attr], min, max, &cfg->param)) \ - return -EINVAL; \ - mask |= (1 << (attr - 1)); \ - } \ +#define FILL_IN_MESH_PARAM_IF_SET(tb, cfg, param, mask, attr, fn) \ +do { \ + if (tb[attr]) { \ + cfg->param = fn(tb[attr]); \ + mask |= BIT((attr) - 1); \ + } \ } while (0) if (!info->attrs[NL80211_ATTR_MESH_CONFIG]) @@ -6259,75 +6223,73 @@ do { \ BUILD_BUG_ON(NL80211_MESHCONF_ATTR_MAX > 32); /* Fill in the params struct */ - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout, 1, 255, - mask, NL80211_MESHCONF_RETRY_TIMEOUT, - nl80211_check_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout, 1, 255, - mask, NL80211_MESHCONF_CONFIRM_TIMEOUT, - nl80211_check_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout, 1, 255, - mask, NL80211_MESHCONF_HOLDING_TIMEOUT, - nl80211_check_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks, 0, 255, - mask, NL80211_MESHCONF_MAX_PEER_LINKS, - nl80211_check_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries, 0, 16, - mask, NL80211_MESHCONF_MAX_RETRIES, - nl80211_check_u8); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL, 1, 255, - mask, NL80211_MESHCONF_TTL, nl80211_check_u8); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, element_ttl, 1, 255, - mask, NL80211_MESHCONF_ELEMENT_TTL, - nl80211_check_u8); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks, 0, 1, - mask, NL80211_MESHCONF_AUTO_OPEN_PLINKS, - nl80211_check_bool); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout, mask, + NL80211_MESHCONF_RETRY_TIMEOUT, nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout, mask, + NL80211_MESHCONF_CONFIRM_TIMEOUT, + nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout, mask, + NL80211_MESHCONF_HOLDING_TIMEOUT, + nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks, mask, + NL80211_MESHCONF_MAX_PEER_LINKS, + nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries, mask, + NL80211_MESHCONF_MAX_RETRIES, nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL, mask, + NL80211_MESHCONF_TTL, nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, element_ttl, mask, + NL80211_MESHCONF_ELEMENT_TTL, nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks, mask, + NL80211_MESHCONF_AUTO_OPEN_PLINKS, + nla_get_u8); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNbrOffsetMaxNeighbor, - 1, 255, mask, + mask, NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR, - nl80211_check_u32); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries, 0, 255, - mask, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, - nl80211_check_u8); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time, 1, 65535, - mask, NL80211_MESHCONF_PATH_REFRESH_TIME, - nl80211_check_u32); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout, 1, 65535, - mask, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, - nl80211_check_u16); + nla_get_u32); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries, mask, + NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, + nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time, mask, + NL80211_MESHCONF_PATH_REFRESH_TIME, + nla_get_u32); + if (mask & BIT(NL80211_MESHCONF_PATH_REFRESH_TIME) && + (cfg->path_refresh_time < 1 || cfg->path_refresh_time > 65535)) + return -EINVAL; + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout, mask, + NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, + nla_get_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathTimeout, - 1, 65535, mask, + mask, NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, - nl80211_check_u32); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPpreqMinInterval, - 1, 65535, mask, + nla_get_u32); + if (mask & BIT(NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT) && + (cfg->dot11MeshHWMPactivePathTimeout < 1 || + cfg->dot11MeshHWMPactivePathTimeout > 65535)) + return -EINVAL; + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPpreqMinInterval, mask, NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, - nl80211_check_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPperrMinInterval, - 1, 65535, mask, + nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPperrMinInterval, mask, NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL, - nl80211_check_u16); + nla_get_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, - dot11MeshHWMPnetDiameterTraversalTime, - 1, 65535, mask, + dot11MeshHWMPnetDiameterTraversalTime, mask, NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME, - nl80211_check_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRootMode, 0, 4, - mask, NL80211_MESHCONF_HWMP_ROOTMODE, - nl80211_check_u8); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRannInterval, 1, 65535, - mask, NL80211_MESHCONF_HWMP_RANN_INTERVAL, - nl80211_check_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, - dot11MeshGateAnnouncementProtocol, 0, 1, + nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRootMode, mask, + NL80211_MESHCONF_HWMP_ROOTMODE, nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRannInterval, mask, + NL80211_MESHCONF_HWMP_RANN_INTERVAL, + nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshGateAnnouncementProtocol, mask, NL80211_MESHCONF_GATE_ANNOUNCEMENTS, - nl80211_check_bool); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshForwarding, 0, 1, - mask, NL80211_MESHCONF_FORWARDING, - nl80211_check_bool); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, -255, 0, - mask, NL80211_MESHCONF_RSSI_THRESHOLD, - nl80211_check_s32); + nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshForwarding, mask, + NL80211_MESHCONF_FORWARDING, nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, mask, + NL80211_MESHCONF_RSSI_THRESHOLD, + nla_get_s32); /* * Check HT operation mode based on * IEEE 802.11-2016 9.4.2.57 HT Operation element. @@ -6346,29 +6308,27 @@ do { \ cfg->ht_opmode = ht_opmode; mask |= (1 << (NL80211_MESHCONF_HT_OPMODE - 1)); } - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout, - 1, 65535, mask, - NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT, - nl80211_check_u32); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMProotInterval, 1, 65535, - mask, NL80211_MESHCONF_HWMP_ROOT_INTERVAL, - nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, - dot11MeshHWMPconfirmationInterval, - 1, 65535, mask, + dot11MeshHWMPactivePathToRootTimeout, mask, + NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT, + nla_get_u32); + if (mask & BIT(NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT) && + (cfg->dot11MeshHWMPactivePathToRootTimeout < 1 || + cfg->dot11MeshHWMPactivePathToRootTimeout > 65535)) + return -EINVAL; + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMProotInterval, mask, + NL80211_MESHCONF_HWMP_ROOT_INTERVAL, + nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPconfirmationInterval, + mask, NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL, - nl80211_check_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, power_mode, - NL80211_MESH_POWER_ACTIVE, - NL80211_MESH_POWER_MAX, - mask, NL80211_MESHCONF_POWER_MODE, - nl80211_check_power_mode); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, - 0, 65535, mask, - NL80211_MESHCONF_AWAKE_WINDOW, nl80211_check_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, 0, 0xffffffff, - mask, NL80211_MESHCONF_PLINK_TIMEOUT, - nl80211_check_u32); + nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, power_mode, mask, + NL80211_MESHCONF_POWER_MODE, nla_get_u32); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, mask, + NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, mask, + NL80211_MESHCONF_PLINK_TIMEOUT, nla_get_u32); if (mask_out) *mask_out = mask; @@ -6411,8 +6371,6 @@ static int nl80211_parse_mesh_setup(struct genl_info *info, if (tb[NL80211_MESH_SETUP_IE]) { struct nlattr *ieattr = tb[NL80211_MESH_SETUP_IE]; - if (!is_valid_ie_attr(ieattr)) - return -EINVAL; setup->ie = nla_data(ieattr); setup->ie_len = nla_len(ieattr); } @@ -7045,9 +7003,6 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) int err, tmp, n_ssids = 0, n_channels, i; size_t ie_len; - if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) - return -EINVAL; - wiphy = &rdev->wiphy; if (wdev->iftype == NL80211_IFTYPE_NAN) @@ -7401,9 +7356,6 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, struct nlattr *tb[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1]; s32 default_match_rssi = NL80211_SCAN_RSSI_THOLD_OFF; - if (!is_valid_ie_attr(attrs[NL80211_ATTR_IE])) - return ERR_PTR(-EINVAL); - if (attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { n_channels = validate_scan_freqs( attrs[NL80211_ATTR_SCAN_FREQUENCIES]); @@ -7763,7 +7715,7 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, */ if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) { while (!sched_scan_req->reqid) - sched_scan_req->reqid = rdev->wiphy.cookie_counter++; + sched_scan_req->reqid = cfg80211_assign_cookie(rdev); } err = rdev_sched_scan_start(rdev, dev, sched_scan_req); @@ -7939,7 +7891,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) if (!need_new_beacon) goto skip_beacons; - err = nl80211_parse_beacon(info->attrs, ¶ms.beacon_after); + err = nl80211_parse_beacon(rdev, info->attrs, ¶ms.beacon_after); if (err) return err; @@ -7949,7 +7901,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) if (err) return err; - err = nl80211_parse_beacon(csa_attrs, ¶ms.beacon_csa); + err = nl80211_parse_beacon(rdev, csa_attrs, ¶ms.beacon_csa); if (err) return err; @@ -8186,7 +8138,7 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb) int err; rtnl_lock(); - err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); + err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); if (err) { rtnl_unlock(); return err; @@ -8307,7 +8259,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) bool radio_stats; rtnl_lock(); - res = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); + res = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); if (res) goto out_err; @@ -8371,9 +8323,6 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) struct key_parse key; bool local_state_change; - if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) - return -EINVAL; - if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; @@ -8612,9 +8561,6 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) return -EPERM; - if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) - return -EINVAL; - if (!info->attrs[NL80211_ATTR_MAC] || !info->attrs[NL80211_ATTR_SSID] || !info->attrs[NL80211_ATTR_WIPHY_FREQ]) @@ -8738,9 +8684,6 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) return -EPERM; - if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) - return -EINVAL; - if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; @@ -8789,9 +8732,6 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) return -EPERM; - if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) - return -EINVAL; - if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; @@ -8866,9 +8806,6 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) memset(&ibss, 0, sizeof(ibss)); - if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) - return -EINVAL; - if (!info->attrs[NL80211_ATTR_SSID] || !nla_len(info->attrs[NL80211_ATTR_SSID])) return -EINVAL; @@ -9306,9 +9243,6 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) memset(&connect, 0, sizeof(connect)); - if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) - return -EINVAL; - if (!info->attrs[NL80211_ATTR_SSID] || !nla_len(info->attrs[NL80211_ATTR_SSID])) return -EINVAL; @@ -9367,11 +9301,6 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) !wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_MFP_OPTIONAL)) return -EOPNOTSUPP; - - if (connect.mfp != NL80211_MFP_REQUIRED && - connect.mfp != NL80211_MFP_NO && - connect.mfp != NL80211_MFP_OPTIONAL) - return -EINVAL; } else { connect.mfp = NL80211_MFP_NO; } @@ -9544,8 +9473,6 @@ static int nl80211_update_connect_params(struct sk_buff *skb, return -EOPNOTSUPP; if (info->attrs[NL80211_ATTR_IE]) { - if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) - return -EINVAL; connect.ie = nla_data(info->attrs[NL80211_ATTR_IE]); connect.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); changed |= UPDATE_ASSOC_IES; @@ -10130,9 +10057,6 @@ static int nl80211_set_power_save(struct sk_buff *skb, struct genl_info *info) ps_state = nla_get_u32(info->attrs[NL80211_ATTR_PS_STATE]); - if (ps_state != NL80211_PS_DISABLED && ps_state != NL80211_PS_ENABLED) - return -EINVAL; - wdev = dev->ieee80211_ptr; if (!rdev->ops->set_power_mgmt) @@ -10230,7 +10154,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev = dev->ieee80211_ptr; s32 last, low, high; u32 hyst; - int i, n; + int i, n, low_index; int err; /* RSSI reporting disabled? */ @@ -10267,10 +10191,19 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, if (last < wdev->cqm_config->rssi_thresholds[i]) break; - low = i > 0 ? - (wdev->cqm_config->rssi_thresholds[i - 1] - hyst) : S32_MIN; - high = i < n ? - (wdev->cqm_config->rssi_thresholds[i] + hyst - 1) : S32_MAX; + low_index = i - 1; + if (low_index >= 0) { + low_index = array_index_nospec(low_index, n); + low = wdev->cqm_config->rssi_thresholds[low_index] - hyst; + } else { + low = S32_MIN; + } + if (i < n) { + i = array_index_nospec(i, n); + high = wdev->cqm_config->rssi_thresholds[i] + hyst - 1; + } else { + high = S32_MAX; + } return rdev_set_cqm_rssi_range_config(rdev, dev, low, high); } @@ -10686,8 +10619,7 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, if (!scan_plan) return -ENOBUFS; - if (!scan_plan || - nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_INTERVAL, + if (nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_INTERVAL, req->scan_plans[i].interval) || (req->scan_plans[i].iterations && nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_ITERATIONS, @@ -11285,9 +11217,6 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, if (tb[NL80211_ATTR_COALESCE_RULE_CONDITION]) new_rule->condition = nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_CONDITION]); - if (new_rule->condition != NL80211_COALESCE_CONDITION_MATCH && - new_rule->condition != NL80211_COALESCE_CONDITION_NO_MATCH) - return -EINVAL; if (!tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN]) return -EINVAL; @@ -11640,8 +11569,6 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) conf.master_pref = nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); - if (!conf.master_pref) - return -EINVAL; if (info->attrs[NL80211_ATTR_BANDS]) { u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); @@ -11759,7 +11686,7 @@ static int nl80211_nan_add_func(struct sk_buff *skb, if (!func) return -ENOMEM; - func->cookie = wdev->wiphy->cookie_counter++; + func->cookie = cfg80211_assign_cookie(rdev); if (!tb[NL80211_NAN_FUNC_TYPE] || nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]) > NL80211_NAN_FUNC_MAX_TYPE) { @@ -12205,8 +12132,7 @@ static int nl80211_update_ft_ies(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; if (!info->attrs[NL80211_ATTR_MDID] || - !info->attrs[NL80211_ATTR_IE] || - !is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) + !info->attrs[NL80211_ATTR_IE]) return -EINVAL; memset(&ft_params, 0, sizeof(ft_params)); @@ -12626,12 +12552,7 @@ static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info) return -EINVAL; tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]); - if (tsid >= IEEE80211_NUM_TIDS) - return -EINVAL; - up = nla_get_u8(info->attrs[NL80211_ATTR_USER_PRIO]); - if (up >= IEEE80211_NUM_UPS) - return -EINVAL; /* WMM uses TIDs 0-7 even for TSPEC */ if (tsid >= IEEE80211_FIRST_TSPEC_TSID) { @@ -12989,6 +12910,76 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) return err; } +static int nl80211_get_ftm_responder_stats(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_ftm_responder_stats ftm_stats = {}; + struct sk_buff *msg; + void *hdr; + struct nlattr *ftm_stats_attr; + int err; + + if (wdev->iftype != NL80211_IFTYPE_AP || !wdev->beacon_interval) + return -EOPNOTSUPP; + + err = rdev_get_ftm_responder_stats(rdev, dev, &ftm_stats); + if (err) + return err; + + if (!ftm_stats.filled) + return -ENODATA; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0, + NL80211_CMD_GET_FTM_RESPONDER_STATS); + if (!hdr) + return -ENOBUFS; + + if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + ftm_stats_attr = nla_nest_start(msg, NL80211_ATTR_FTM_RESPONDER_STATS); + if (!ftm_stats_attr) + goto nla_put_failure; + +#define SET_FTM(field, name, type) \ + do { if ((ftm_stats.filled & BIT(NL80211_FTM_STATS_ ## name)) && \ + nla_put_ ## type(msg, NL80211_FTM_STATS_ ## name, \ + ftm_stats.field)) \ + goto nla_put_failure; } while (0) +#define SET_FTM_U64(field, name) \ + do { if ((ftm_stats.filled & BIT(NL80211_FTM_STATS_ ## name)) && \ + nla_put_u64_64bit(msg, NL80211_FTM_STATS_ ## name, \ + ftm_stats.field, NL80211_FTM_STATS_PAD)) \ + goto nla_put_failure; } while (0) + + SET_FTM(success_num, SUCCESS_NUM, u32); + SET_FTM(partial_num, PARTIAL_NUM, u32); + SET_FTM(failed_num, FAILED_NUM, u32); + SET_FTM(asap_num, ASAP_NUM, u32); + SET_FTM(non_asap_num, NON_ASAP_NUM, u32); + SET_FTM_U64(total_duration_ms, TOTAL_DURATION_MSEC); + SET_FTM(unknown_triggers_num, UNKNOWN_TRIGGERS_NUM, u32); + SET_FTM(reschedule_requests_num, RESCHEDULE_REQUESTS_NUM, u32); + SET_FTM(out_of_window_triggers_num, OUT_OF_WINDOW_TRIGGERS_NUM, u32); +#undef SET_FTM + + nla_nest_end(msg, ftm_stats_attr); + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -13900,6 +13891,13 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_GET_FTM_RESPONDER_STATS, + .doit = nl80211_get_ftm_responder_stats, + .policy = nl80211_policy, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, + }, }; static struct genl_family nl80211_fam __ro_after_init = { diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 364f5d67f05b..51380b5c32f2 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1232,4 +1232,19 @@ rdev_external_auth(struct cfg80211_registered_device *rdev, return ret; } +static inline int +rdev_get_ftm_responder_stats(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ftm_responder_stats *ftm_stats) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats); + if (rdev->ops->get_ftm_responder_stats) + ret = rdev->ops->get_ftm_responder_stats(&rdev->wiphy, dev, + ftm_stats); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 2f702adf2912..ecfb1a06dbb2 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -847,22 +847,36 @@ static bool valid_regdb(const u8 *data, unsigned int size) return true; } -static void set_wmm_rule(struct ieee80211_reg_rule *rrule, - struct fwdb_wmm_rule *wmm) -{ - struct ieee80211_wmm_rule *rule = &rrule->wmm_rule; - unsigned int i; +static void set_wmm_rule(const struct fwdb_header *db, + const struct fwdb_country *country, + const struct fwdb_rule *rule, + struct ieee80211_reg_rule *rrule) +{ + struct ieee80211_wmm_rule *wmm_rule = &rrule->wmm_rule; + struct fwdb_wmm_rule *wmm; + unsigned int i, wmm_ptr; + + wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2; + wmm = (void *)((u8 *)db + wmm_ptr); + + if (!valid_wmm(wmm)) { + pr_err("Invalid regulatory WMM rule %u-%u in domain %c%c\n", + be32_to_cpu(rule->start), be32_to_cpu(rule->end), + country->alpha2[0], country->alpha2[1]); + return; + } for (i = 0; i < IEEE80211_NUM_ACS; i++) { - rule->client[i].cw_min = + wmm_rule->client[i].cw_min = ecw2cw((wmm->client[i].ecw & 0xf0) >> 4); - rule->client[i].cw_max = ecw2cw(wmm->client[i].ecw & 0x0f); - rule->client[i].aifsn = wmm->client[i].aifsn; - rule->client[i].cot = 1000 * be16_to_cpu(wmm->client[i].cot); - rule->ap[i].cw_min = ecw2cw((wmm->ap[i].ecw & 0xf0) >> 4); - rule->ap[i].cw_max = ecw2cw(wmm->ap[i].ecw & 0x0f); - rule->ap[i].aifsn = wmm->ap[i].aifsn; - rule->ap[i].cot = 1000 * be16_to_cpu(wmm->ap[i].cot); + wmm_rule->client[i].cw_max = ecw2cw(wmm->client[i].ecw & 0x0f); + wmm_rule->client[i].aifsn = wmm->client[i].aifsn; + wmm_rule->client[i].cot = + 1000 * be16_to_cpu(wmm->client[i].cot); + wmm_rule->ap[i].cw_min = ecw2cw((wmm->ap[i].ecw & 0xf0) >> 4); + wmm_rule->ap[i].cw_max = ecw2cw(wmm->ap[i].ecw & 0x0f); + wmm_rule->ap[i].aifsn = wmm->ap[i].aifsn; + wmm_rule->ap[i].cot = 1000 * be16_to_cpu(wmm->ap[i].cot); } rrule->has_wmm = true; @@ -870,7 +884,7 @@ static void set_wmm_rule(struct ieee80211_reg_rule *rrule, static int __regdb_query_wmm(const struct fwdb_header *db, const struct fwdb_country *country, int freq, - struct ieee80211_reg_rule *rule) + struct ieee80211_reg_rule *rrule) { unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; struct fwdb_collection *coll = (void *)((u8 *)db + ptr); @@ -879,18 +893,14 @@ static int __regdb_query_wmm(const struct fwdb_header *db, for (i = 0; i < coll->n_rules; i++) { __be16 *rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2)); unsigned int rule_ptr = be16_to_cpu(rules_ptr[i]) << 2; - struct fwdb_rule *rrule = (void *)((u8 *)db + rule_ptr); - struct fwdb_wmm_rule *wmm; - unsigned int wmm_ptr; + struct fwdb_rule *rule = (void *)((u8 *)db + rule_ptr); - if (rrule->len < offsetofend(struct fwdb_rule, wmm_ptr)) + if (rule->len < offsetofend(struct fwdb_rule, wmm_ptr)) continue; - if (freq >= KHZ_TO_MHZ(be32_to_cpu(rrule->start)) && - freq <= KHZ_TO_MHZ(be32_to_cpu(rrule->end))) { - wmm_ptr = be16_to_cpu(rrule->wmm_ptr) << 2; - wmm = (void *)((u8 *)db + wmm_ptr); - set_wmm_rule(rule, wmm); + if (freq >= KHZ_TO_MHZ(be32_to_cpu(rule->start)) && + freq <= KHZ_TO_MHZ(be32_to_cpu(rule->end))) { + set_wmm_rule(db, country, rule, rrule); return 0; } } @@ -972,12 +982,8 @@ static int regdb_query_country(const struct fwdb_header *db, if (rule->len >= offsetofend(struct fwdb_rule, cac_timeout)) rrule->dfs_cac_ms = 1000 * be16_to_cpu(rule->cac_timeout); - if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr)) { - u32 wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2; - struct fwdb_wmm_rule *wmm = (void *)((u8 *)db + wmm_ptr); - - set_wmm_rule(rrule, wmm); - } + if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr)) + set_wmm_rule(db, country, rule, rrule); } return reg_schedule_apply(regdom); @@ -2661,11 +2667,12 @@ static void reg_process_hint(struct regulatory_request *reg_request) { struct wiphy *wiphy = NULL; enum reg_request_treatment treatment; + enum nl80211_reg_initiator initiator = reg_request->initiator; if (reg_request->wiphy_idx != WIPHY_IDX_INVALID) wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx); - switch (reg_request->initiator) { + switch (initiator) { case NL80211_REGDOM_SET_BY_CORE: treatment = reg_process_hint_core(reg_request); break; @@ -2683,7 +2690,7 @@ static void reg_process_hint(struct regulatory_request *reg_request) treatment = reg_process_hint_country_ie(wiphy, reg_request); break; default: - WARN(1, "invalid initiator %d\n", reg_request->initiator); + WARN(1, "invalid initiator %d\n", initiator); goto out_free; } @@ -2698,7 +2705,7 @@ static void reg_process_hint(struct regulatory_request *reg_request) */ if (treatment == REG_REQ_ALREADY_SET && wiphy && wiphy->regulatory_flags & REGULATORY_STRICT_REG) { - wiphy_update_regulatory(wiphy, reg_request->initiator); + wiphy_update_regulatory(wiphy, initiator); wiphy_all_share_dfs_chan_state(wiphy); reg_check_channels(); } @@ -2867,6 +2874,7 @@ static int regulatory_hint_core(const char *alpha2) request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_CORE; + request->wiphy_idx = WIPHY_IDX_INVALID; queue_regulatory_request(request); @@ -3184,13 +3192,59 @@ static void restore_regulatory_settings(bool reset_user) schedule_work(®_work); } +static bool is_wiphy_all_set_reg_flag(enum ieee80211_regulatory_flags flag) +{ + struct cfg80211_registered_device *rdev; + struct wireless_dev *wdev; + + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { + wdev_lock(wdev); + if (!(wdev->wiphy->regulatory_flags & flag)) { + wdev_unlock(wdev); + return false; + } + wdev_unlock(wdev); + } + } + + return true; +} + void regulatory_hint_disconnect(void) { + /* Restore of regulatory settings is not required when wiphy(s) + * ignore IE from connected access point but clearance of beacon hints + * is required when wiphy(s) supports beacon hints. + */ + if (is_wiphy_all_set_reg_flag(REGULATORY_COUNTRY_IE_IGNORE)) { + struct reg_beacon *reg_beacon, *btmp; + + if (is_wiphy_all_set_reg_flag(REGULATORY_DISABLE_BEACON_HINTS)) + return; + + spin_lock_bh(®_pending_beacons_lock); + list_for_each_entry_safe(reg_beacon, btmp, + ®_pending_beacons, list) { + list_del(®_beacon->list); + kfree(reg_beacon); + } + spin_unlock_bh(®_pending_beacons_lock); + + list_for_each_entry_safe(reg_beacon, btmp, + ®_beacon_list, list) { + list_del(®_beacon->list); + kfree(reg_beacon); + } + + return; + } + pr_debug("All devices are disconnected, going to restore regulatory settings\n"); restore_regulatory_settings(false); } -static bool freq_is_chan_12_13_14(u16 freq) +static bool freq_is_chan_12_13_14(u32 freq) { if (freq == ieee80211_channel_to_frequency(12, NL80211_BAND_2GHZ) || freq == ieee80211_channel_to_frequency(13, NL80211_BAND_2GHZ) || @@ -3777,6 +3831,15 @@ static int __init regulatory_init_db(void) { int err; + /* + * It's possible that - due to other bugs/issues - cfg80211 + * never called regulatory_init() below, or that it failed; + * in that case, don't try to do any further work here as + * it's doomed to lead to crashes. + */ + if (IS_ERR_OR_NULL(reg_pdev)) + return -EINVAL; + err = load_builtin_regdb_keys(); if (err) return err; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index d36c3eb7b931..d0e7472dd9fd 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1058,13 +1058,23 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, return NULL; } +/* + * Update RX channel information based on the available frame payload + * information. This is mainly for the 2.4 GHz band where frames can be received + * from neighboring channels and the Beacon frames use the DSSS Parameter Set + * element to indicate the current (transmitting) channel, but this might also + * be needed on other bands if RX frequency does not match with the actual + * operating channel of a BSS. + */ static struct ieee80211_channel * cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, - struct ieee80211_channel *channel) + struct ieee80211_channel *channel, + enum nl80211_bss_scan_width scan_width) { const u8 *tmp; u32 freq; int channel_number = -1; + struct ieee80211_channel *alt_channel; tmp = cfg80211_find_ie(WLAN_EID_DS_PARAMS, ie, ielen); if (tmp && tmp[1] == 1) { @@ -1078,16 +1088,45 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, } } - if (channel_number < 0) + if (channel_number < 0) { + /* No channel information in frame payload */ return channel; + } freq = ieee80211_channel_to_frequency(channel_number, channel->band); - channel = ieee80211_get_channel(wiphy, freq); - if (!channel) - return NULL; - if (channel->flags & IEEE80211_CHAN_DISABLED) + alt_channel = ieee80211_get_channel(wiphy, freq); + if (!alt_channel) { + if (channel->band == NL80211_BAND_2GHZ) { + /* + * Better not allow unexpected channels when that could + * be going beyond the 1-11 range (e.g., discovering + * BSS on channel 12 when radio is configured for + * channel 11. + */ + return NULL; + } + + /* No match for the payload channel number - ignore it */ + return channel; + } + + if (scan_width == NL80211_BSS_CHAN_WIDTH_10 || + scan_width == NL80211_BSS_CHAN_WIDTH_5) { + /* + * Ignore channel number in 5 and 10 MHz channels where there + * may not be an n:1 or 1:n mapping between frequencies and + * channel numbers. + */ + return channel; + } + + /* + * Use the channel determined through the payload channel number + * instead of the RX channel reported by the driver. + */ + if (alt_channel->flags & IEEE80211_CHAN_DISABLED) return NULL; - return channel; + return alt_channel; } /* Returned bss is reference counted and must be cleaned up appropriately. */ @@ -1112,7 +1151,8 @@ cfg80211_inform_bss_data(struct wiphy *wiphy, (data->signal < 0 || data->signal > 100))) return NULL; - channel = cfg80211_get_bss_channel(wiphy, ie, ielen, data->chan); + channel = cfg80211_get_bss_channel(wiphy, ie, ielen, data->chan, + data->scan_width); if (!channel) return NULL; @@ -1210,7 +1250,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, return NULL; channel = cfg80211_get_bss_channel(wiphy, mgmt->u.beacon.variable, - ielen, data->chan); + ielen, data->chan, data->scan_width); if (!channel) return NULL; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 7c73510b161f..c6a9446b4e6b 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -112,7 +112,7 @@ } while (0) #define CHAN_ENTRY __field(enum nl80211_band, band) \ - __field(u16, center_freq) + __field(u32, center_freq) #define CHAN_ASSIGN(chan) \ do { \ if (chan) { \ @@ -2368,6 +2368,140 @@ TRACE_EVENT(rdev_external_auth, __entry->bssid, __entry->ssid, __entry->status) ); +TRACE_EVENT(rdev_start_radar_detection, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_chan_def *chandef, + u32 cac_time_ms), + TP_ARGS(wiphy, netdev, chandef, cac_time_ms), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + CHAN_DEF_ENTRY + __field(u32, cac_time_ms) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + CHAN_DEF_ASSIGN(chandef); + __entry->cac_time_ms = cac_time_ms; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT + ", cac_time_ms=%u", + WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG, + __entry->cac_time_ms) +); + +TRACE_EVENT(rdev_set_mcast_rate, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + int *mcast_rate), + TP_ARGS(wiphy, netdev, mcast_rate), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __array(int, mcast_rate, NUM_NL80211_BANDS) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + memcpy(__entry->mcast_rate, mcast_rate, + sizeof(int) * NUM_NL80211_BANDS); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " + "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 60GHz=0x%x]", + WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->mcast_rate[NL80211_BAND_2GHZ], + __entry->mcast_rate[NL80211_BAND_5GHZ], + __entry->mcast_rate[NL80211_BAND_60GHZ]) +); + +TRACE_EVENT(rdev_set_coalesce, + TP_PROTO(struct wiphy *wiphy, struct cfg80211_coalesce *coalesce), + TP_ARGS(wiphy, coalesce), + TP_STRUCT__entry( + WIPHY_ENTRY + __field(int, n_rules) + ), + TP_fast_assign( + WIPHY_ASSIGN; + __entry->n_rules = coalesce ? coalesce->n_rules : 0; + ), + TP_printk(WIPHY_PR_FMT ", n_rules=%d", + WIPHY_PR_ARG, __entry->n_rules) +); + +DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), + TP_ARGS(wiphy, wdev) +); + +TRACE_EVENT(rdev_set_multicast_to_unicast, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + const bool enabled), + TP_ARGS(wiphy, netdev, enabled), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(bool, enabled) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->enabled = enabled; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", unicast: %s", + WIPHY_PR_ARG, NETDEV_PR_ARG, + BOOL_TO_STR(__entry->enabled)) +); + +DEFINE_EVENT(wiphy_wdev_evt, rdev_get_txq_stats, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), + TP_ARGS(wiphy, wdev) +); + +TRACE_EVENT(rdev_get_ftm_responder_stats, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_ftm_responder_stats *ftm_stats), + + TP_ARGS(wiphy, netdev, ftm_stats), + + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(u64, timestamp) + __field(u32, success_num) + __field(u32, partial_num) + __field(u32, failed_num) + __field(u32, asap_num) + __field(u32, non_asap_num) + __field(u64, duration) + __field(u32, unknown_triggers) + __field(u32, reschedule) + __field(u32, out_of_window) + ), + + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->success_num = ftm_stats->success_num; + __entry->partial_num = ftm_stats->partial_num; + __entry->failed_num = ftm_stats->failed_num; + __entry->asap_num = ftm_stats->asap_num; + __entry->non_asap_num = ftm_stats->non_asap_num; + __entry->duration = ftm_stats->total_duration_ms; + __entry->unknown_triggers = ftm_stats->unknown_triggers_num; + __entry->reschedule = ftm_stats->reschedule_requests_num; + __entry->out_of_window = ftm_stats->out_of_window_triggers_num; + ), + + TP_printk(WIPHY_PR_FMT "Ftm responder stats: success %u, partial %u, " + "failed %u, asap %u, non asap %u, total duration %llu, unknown " + "triggers %u, rescheduled %u, out of window %u", WIPHY_PR_ARG, + __entry->success_num, __entry->partial_num, __entry->failed_num, + __entry->asap_num, __entry->non_asap_num, __entry->duration, + __entry->unknown_triggers, __entry->reschedule, + __entry->out_of_window) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ @@ -3160,105 +3294,6 @@ TRACE_EVENT(cfg80211_stop_iface, TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG) ); - -TRACE_EVENT(rdev_start_radar_detection, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - struct cfg80211_chan_def *chandef, - u32 cac_time_ms), - TP_ARGS(wiphy, netdev, chandef, cac_time_ms), - TP_STRUCT__entry( - WIPHY_ENTRY - NETDEV_ENTRY - CHAN_DEF_ENTRY - __field(u32, cac_time_ms) - ), - TP_fast_assign( - WIPHY_ASSIGN; - NETDEV_ASSIGN; - CHAN_DEF_ASSIGN(chandef); - __entry->cac_time_ms = cac_time_ms; - ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT - ", cac_time_ms=%u", - WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG, - __entry->cac_time_ms) -); - -TRACE_EVENT(rdev_set_mcast_rate, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - int *mcast_rate), - TP_ARGS(wiphy, netdev, mcast_rate), - TP_STRUCT__entry( - WIPHY_ENTRY - NETDEV_ENTRY - __array(int, mcast_rate, NUM_NL80211_BANDS) - ), - TP_fast_assign( - WIPHY_ASSIGN; - NETDEV_ASSIGN; - memcpy(__entry->mcast_rate, mcast_rate, - sizeof(int) * NUM_NL80211_BANDS); - ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " - "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 60GHz=0x%x]", - WIPHY_PR_ARG, NETDEV_PR_ARG, - __entry->mcast_rate[NL80211_BAND_2GHZ], - __entry->mcast_rate[NL80211_BAND_5GHZ], - __entry->mcast_rate[NL80211_BAND_60GHZ]) -); - -TRACE_EVENT(rdev_set_coalesce, - TP_PROTO(struct wiphy *wiphy, struct cfg80211_coalesce *coalesce), - TP_ARGS(wiphy, coalesce), - TP_STRUCT__entry( - WIPHY_ENTRY - __field(int, n_rules) - ), - TP_fast_assign( - WIPHY_ASSIGN; - __entry->n_rules = coalesce ? coalesce->n_rules : 0; - ), - TP_printk(WIPHY_PR_FMT ", n_rules=%d", - WIPHY_PR_ARG, __entry->n_rules) -); - -DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan, - TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), - TP_ARGS(wiphy, wdev) -); - -TRACE_EVENT(rdev_set_multicast_to_unicast, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - const bool enabled), - TP_ARGS(wiphy, netdev, enabled), - TP_STRUCT__entry( - WIPHY_ENTRY - NETDEV_ENTRY - __field(bool, enabled) - ), - TP_fast_assign( - WIPHY_ASSIGN; - NETDEV_ASSIGN; - __entry->enabled = enabled; - ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", unicast: %s", - WIPHY_PR_ARG, NETDEV_PR_ARG, - BOOL_TO_STR(__entry->enabled)) -); - -TRACE_EVENT(rdev_get_txq_stats, - TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), - TP_ARGS(wiphy, wdev), - TP_STRUCT__entry( - WIPHY_ENTRY - WDEV_ENTRY - ), - TP_fast_assign( - WIPHY_ASSIGN; - WDEV_ASSIGN; - ), - TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG) -); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/wireless/util.c b/net/wireless/util.c index 959ed3acd240..ef14d80ca03e 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -5,17 +5,20 @@ * Copyright 2007-2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation */ #include #include #include #include +#include #include #include #include #include #include #include +#include #include "core.h" #include "rdev-ops.h" @@ -88,7 +91,7 @@ int ieee80211_channel_to_frequency(int chan, enum nl80211_band band) return 5000 + chan * 5; break; case NL80211_BAND_60GHZ: - if (chan < 5) + if (chan < 7) return 56160 + chan * 2160; break; default: @@ -109,7 +112,7 @@ int ieee80211_frequency_to_channel(int freq) return (freq - 4000) / 5; else if (freq <= 45000) /* DMG band lower limit */ return (freq - 5000) / 5; - else if (freq >= 58320 && freq <= 64800) + else if (freq >= 58320 && freq <= 70200) return (freq - 56160) / 2160; else return 0; @@ -1568,7 +1571,7 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, } /* 56.16 GHz, channel 1..4 */ - if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 4) { + if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 6) { if (chandef->width >= NL80211_CHAN_WIDTH_40) return false; @@ -1893,3 +1896,154 @@ EXPORT_SYMBOL(rfc1042_header); const unsigned char bridge_tunnel_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; EXPORT_SYMBOL(bridge_tunnel_header); + +/* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */ +struct iapp_layer2_update { + u8 da[ETH_ALEN]; /* broadcast */ + u8 sa[ETH_ALEN]; /* STA addr */ + __be16 len; /* 6 */ + u8 dsap; /* 0 */ + u8 ssap; /* 0 */ + u8 control; + u8 xid_info[3]; +} __packed; + +void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr) +{ + struct iapp_layer2_update *msg; + struct sk_buff *skb; + + /* Send Level 2 Update Frame to update forwarding tables in layer 2 + * bridge devices */ + + skb = dev_alloc_skb(sizeof(*msg)); + if (!skb) + return; + msg = skb_put(skb, sizeof(*msg)); + + /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID) + * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */ + + eth_broadcast_addr(msg->da); + ether_addr_copy(msg->sa, addr); + msg->len = htons(6); + msg->dsap = 0; + msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */ + msg->control = 0xaf; /* XID response lsb.1111F101. + * F=0 (no poll command; unsolicited frame) */ + msg->xid_info[0] = 0x81; /* XID format identifier */ + msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */ + msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */ + + skb->dev = dev; + skb->protocol = eth_type_trans(skb, dev); + memset(skb->cb, 0, sizeof(skb->cb)); + netif_rx_ni(skb); +} +EXPORT_SYMBOL(cfg80211_send_layer2_update); + +int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, + enum ieee80211_vht_chanwidth bw, + int mcs, bool ext_nss_bw_capable) +{ + u16 map = le16_to_cpu(cap->supp_mcs.rx_mcs_map); + int max_vht_nss = 0; + int ext_nss_bw; + int supp_width; + int i, mcs_encoding; + + if (map == 0xffff) + return 0; + + if (WARN_ON(mcs > 9)) + return 0; + if (mcs <= 7) + mcs_encoding = 0; + else if (mcs == 8) + mcs_encoding = 1; + else + mcs_encoding = 2; + + /* find max_vht_nss for the given MCS */ + for (i = 7; i >= 0; i--) { + int supp = (map >> (2 * i)) & 3; + + if (supp == 3) + continue; + + if (supp >= mcs_encoding) { + max_vht_nss = i; + break; + } + } + + if (!(cap->supp_mcs.tx_mcs_map & + cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE))) + return max_vht_nss; + + ext_nss_bw = le32_get_bits(cap->vht_cap_info, + IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); + supp_width = le32_get_bits(cap->vht_cap_info, + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); + + /* if not capable, treat ext_nss_bw as 0 */ + if (!ext_nss_bw_capable) + ext_nss_bw = 0; + + /* This is invalid */ + if (supp_width == 3) + return 0; + + /* This is an invalid combination so pretend nothing is supported */ + if (supp_width == 2 && (ext_nss_bw == 1 || ext_nss_bw == 2)) + return 0; + + /* + * Cover all the special cases according to IEEE 802.11-2016 + * Table 9-250. All other cases are either factor of 1 or not + * valid/supported. + */ + switch (bw) { + case IEEE80211_VHT_CHANWIDTH_USE_HT: + case IEEE80211_VHT_CHANWIDTH_80MHZ: + if ((supp_width == 1 || supp_width == 2) && + ext_nss_bw == 3) + return 2 * max_vht_nss; + break; + case IEEE80211_VHT_CHANWIDTH_160MHZ: + if (supp_width == 0 && + (ext_nss_bw == 1 || ext_nss_bw == 2)) + return DIV_ROUND_UP(max_vht_nss, 2); + if (supp_width == 0 && + ext_nss_bw == 3) + return DIV_ROUND_UP(3 * max_vht_nss, 4); + if (supp_width == 1 && + ext_nss_bw == 3) + return 2 * max_vht_nss; + break; + case IEEE80211_VHT_CHANWIDTH_80P80MHZ: + if (supp_width == 0 && + (ext_nss_bw == 1 || ext_nss_bw == 2)) + return 0; /* not possible */ + if (supp_width == 0 && + ext_nss_bw == 2) + return DIV_ROUND_UP(max_vht_nss, 2); + if (supp_width == 0 && + ext_nss_bw == 3) + return DIV_ROUND_UP(3 * max_vht_nss, 4); + if (supp_width == 1 && + ext_nss_bw == 0) + return 0; /* not possible */ + if (supp_width == 1 && + ext_nss_bw == 1) + return DIV_ROUND_UP(max_vht_nss, 2); + if (supp_width == 1 && + ext_nss_bw == 2) + return DIV_ROUND_UP(3 * max_vht_nss, 4); + break; + } + + /* not covered or invalid combination received */ + return max_vht_nss; +} +EXPORT_SYMBOL(ieee80211_get_vht_max_nss); diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 167f7025ac98..06943d9c9835 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -1278,12 +1278,16 @@ static int cfg80211_wext_giwrate(struct net_device *dev, if (err) return err; - if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) - return -EOPNOTSUPP; + if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) { + err = -EOPNOTSUPP; + goto free; + } rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate); - return 0; +free: + cfg80211_sinfo_release_content(&sinfo); + return err; } /* Get wireless statistics. Called by /proc/net/wireless and by SIOCGIWSTATS */ @@ -1293,7 +1297,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); /* we are under RTNL - globally locked - so can use static structs */ static struct iw_statistics wstats; - static struct station_info sinfo; + static struct station_info sinfo = {}; u8 bssid[ETH_ALEN]; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) @@ -1352,6 +1356,8 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED)) wstats.discard.retries = sinfo.tx_failed; + cfg80211_sinfo_release_content(&sinfo); + return &wstats; } diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index bfe2dbea480b..a264cf2accd0 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -32,37 +32,49 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) { unsigned long flags; - if (xs->dev) { - spin_lock_irqsave(&umem->xsk_list_lock, flags); - list_del_rcu(&xs->list); - spin_unlock_irqrestore(&umem->xsk_list_lock, flags); - - if (umem->zc) - synchronize_net(); - } + spin_lock_irqsave(&umem->xsk_list_lock, flags); + list_del_rcu(&xs->list); + spin_unlock_irqrestore(&umem->xsk_list_lock, flags); } -int xdp_umem_query(struct net_device *dev, u16 queue_id) +/* The umem is stored both in the _rx struct and the _tx struct as we do + * not know if the device has more tx queues than rx, or the opposite. + * This might also change during run time. + */ +static void xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem, + u16 queue_id) { - struct netdev_bpf bpf; + if (queue_id < dev->real_num_rx_queues) + dev->_rx[queue_id].umem = umem; + if (queue_id < dev->real_num_tx_queues) + dev->_tx[queue_id].umem = umem; +} - ASSERT_RTNL(); +struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, + u16 queue_id) +{ + if (queue_id < dev->real_num_rx_queues) + return dev->_rx[queue_id].umem; + if (queue_id < dev->real_num_tx_queues) + return dev->_tx[queue_id].umem; - memset(&bpf, 0, sizeof(bpf)); - bpf.command = XDP_QUERY_XSK_UMEM; - bpf.xsk.queue_id = queue_id; + return NULL; +} - if (!dev->netdev_ops->ndo_bpf) - return 0; - return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem; +static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id) +{ + if (queue_id < dev->real_num_rx_queues) + dev->_rx[queue_id].umem = NULL; + if (queue_id < dev->real_num_tx_queues) + dev->_tx[queue_id].umem = NULL; } int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, - u32 queue_id, u16 flags) + u16 queue_id, u16 flags) { bool force_zc, force_copy; struct netdev_bpf bpf; - int err; + int err = 0; force_zc = flags & XDP_ZEROCOPY; force_copy = flags & XDP_COPY; @@ -70,19 +82,23 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, if (force_zc && force_copy) return -EINVAL; - if (force_copy) - return 0; - - if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit) - return force_zc ? -EOPNOTSUPP : 0; /* fail or fallback */ + rtnl_lock(); + if (xdp_get_umem_from_qid(dev, queue_id)) { + err = -EBUSY; + goto out_rtnl_unlock; + } - bpf.command = XDP_QUERY_XSK_UMEM; + xdp_reg_umem_at_qid(dev, umem, queue_id); + umem->dev = dev; + umem->queue_id = queue_id; + if (force_copy) + /* For copy-mode, we are done. */ + goto out_rtnl_unlock; - rtnl_lock(); - err = xdp_umem_query(dev, queue_id); - if (err) { - err = err < 0 ? -EOPNOTSUPP : -EBUSY; - goto err_rtnl_unlock; + if (!dev->netdev_ops->ndo_bpf || + !dev->netdev_ops->ndo_xsk_async_xmit) { + err = -EOPNOTSUPP; + goto err_unreg_umem; } bpf.command = XDP_SETUP_XSK_UMEM; @@ -91,18 +107,20 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, err = dev->netdev_ops->ndo_bpf(dev, &bpf); if (err) - goto err_rtnl_unlock; + goto err_unreg_umem; rtnl_unlock(); dev_hold(dev); - umem->dev = dev; - umem->queue_id = queue_id; umem->zc = true; return 0; -err_rtnl_unlock: +err_unreg_umem: + xdp_clear_umem_at_qid(dev, queue_id); + if (!force_zc) + err = 0; /* fallback to copy mode */ +out_rtnl_unlock: rtnl_unlock(); - return force_zc ? err : 0; /* fail or fallback */ + return err; } static void xdp_umem_clear_dev(struct xdp_umem *umem) @@ -110,7 +128,7 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem) struct netdev_bpf bpf; int err; - if (umem->dev) { + if (umem->zc) { bpf.command = XDP_SETUP_XSK_UMEM; bpf.xsk.umem = NULL; bpf.xsk.queue_id = umem->queue_id; @@ -121,9 +139,17 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem) if (err) WARN(1, "failed to disable umem!\n"); + } + + if (umem->dev) { + rtnl_lock(); + xdp_clear_umem_at_qid(umem->dev, umem->queue_id); + rtnl_unlock(); + } + if (umem->zc) { dev_put(umem->dev); - umem->dev = NULL; + umem->zc = false; } } @@ -167,6 +193,8 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->cq = NULL; } + xsk_reuseq_destroy(umem); + xdp_umem_unpin_pages(umem); task = get_pid_task(umem->pid, PIDTYPE_PID); @@ -314,8 +342,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->pid = get_task_pid(current, PIDTYPE_PID); umem->address = (unsigned long)addr; - umem->props.chunk_mask = ~((u64)chunk_size - 1); - umem->props.size = size; + umem->chunk_mask = ~((u64)chunk_size - 1); + umem->size = size; umem->headroom = headroom; umem->chunk_size_nohr = chunk_size - headroom; umem->npgs = size / PAGE_SIZE; diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index f11560334f88..27603227601b 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -8,18 +8,8 @@ #include -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1)); -} - int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, - u32 queue_id, u16 flags); + u16 queue_id, u16 flags); bool xdp_umem_validate_queues(struct xdp_umem *umem); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h deleted file mode 100644 index 40eab10dfc49..000000000000 --- a/net/xdp/xdp_umem_props.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* XDP user-space packet buffer - * Copyright(c) 2018 Intel Corporation. - */ - -#ifndef XDP_UMEM_PROPS_H_ -#define XDP_UMEM_PROPS_H_ - -struct xdp_umem_props { - u64 chunk_mask; - u64 size; -}; - -#endif /* XDP_UMEM_PROPS_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4e937cd7c17d..07156f43d295 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -55,20 +55,30 @@ EXPORT_SYMBOL(xsk_umem_discard_addr); static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - void *buffer; + void *to_buf, *from_buf; + u32 metalen; u64 addr; int err; if (!xskq_peek_addr(xs->umem->fq, &addr) || - len > xs->umem->chunk_size_nohr) { + len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { xs->rx_dropped++; return -ENOSPC; } addr += xs->umem->headroom; - buffer = xdp_umem_get_data(xs->umem, addr); - memcpy(buffer, xdp->data, len); + if (unlikely(xdp_data_meta_unsupported(xdp))) { + from_buf = xdp->data; + metalen = 0; + } else { + from_buf = xdp->data_meta; + metalen = xdp->data - xdp->data_meta; + } + + to_buf = xdp_umem_get_data(xs->umem, addr); + memcpy(to_buf, from_buf, len + metalen); + addr += metalen; err = xskq_produce_batch_desc(xs->rx, addr, len); if (!err) { xskq_discard_addr(xs->umem->fq); @@ -111,6 +121,7 @@ void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 metalen = xdp->data - xdp->data_meta; u32 len = xdp->data_end - xdp->data; void *buffer; u64 addr; @@ -120,7 +131,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return -EINVAL; if (!xskq_peek_addr(xs->umem->fq, &addr) || - len > xs->umem->chunk_size_nohr) { + len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { xs->rx_dropped++; return -ENOSPC; } @@ -128,7 +139,8 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) addr += xs->umem->headroom; buffer = xdp_umem_get_data(xs->umem, addr); - memcpy(buffer, xdp->data, len); + memcpy(buffer, xdp->data_meta, len + metalen); + addr += metalen; err = xskq_produce_batch_desc(xs->rx, addr, len); if (!err) { xskq_discard_addr(xs->umem->fq); @@ -343,12 +355,18 @@ static int xsk_release(struct socket *sock) local_bh_enable(); if (xs->dev) { + struct net_device *dev = xs->dev; + /* Wait for driver to stop using the xdp socket. */ - synchronize_net(); - dev_put(xs->dev); + xdp_del_sk_umem(xs->umem, xs); xs->dev = NULL; + synchronize_net(); + dev_put(dev); } + xskq_destroy(xs->rx); + xskq_destroy(xs->tx); + sock_orphan(sk); sock->sk = NULL; @@ -407,13 +425,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } qid = sxdp->sxdp_queue_id; - - if ((xs->rx && qid >= dev->real_num_rx_queues) || - (xs->tx && qid >= dev->real_num_tx_queues)) { - err = -EINVAL; - goto out_unlock; - } - flags = sxdp->sxdp_flags; if (flags & XDP_SHARED_UMEM) { @@ -458,8 +469,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } else { /* This xsk has its own umem. */ - xskq_set_umem(xs->umem->fq, &xs->umem->props); - xskq_set_umem(xs->umem->cq, &xs->umem->props); + xskq_set_umem(xs->umem->fq, xs->umem->size, + xs->umem->chunk_mask); + xskq_set_umem(xs->umem->cq, xs->umem->size, + xs->umem->chunk_mask); err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); if (err) @@ -469,8 +482,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xs->dev = dev; xs->zc = xs->umem->zc; xs->queue_id = qid; - xskq_set_umem(xs->rx, &xs->umem->props); - xskq_set_umem(xs->tx, &xs->umem->props); + xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask); + xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask); xdp_add_sk_umem(xs->umem, xs); out_unlock: @@ -707,9 +720,6 @@ static void xsk_destruct(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) return; - xskq_destroy(xs->rx); - xskq_destroy(xs->tx); - xdp_del_sk_umem(xs->umem, xs); xdp_put_umem(xs->umem); sk_refcnt_debug_dec(sk); @@ -744,6 +754,8 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, sk->sk_destruct = xsk_destruct; sk_refcnt_debug_inc(sk); + sock_set_flag(sk, SOCK_RCU_FREE); + xs = xdp_sk(sk); mutex_init(&xs->mutex); spin_lock_init(&xs->tx_completion_lock); diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 6c32e92e98fc..b66504592d9b 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -3,16 +3,19 @@ * Copyright(c) 2018 Intel Corporation. */ +#include #include +#include #include "xsk_queue.h" -void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props) +void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask) { if (!q) return; - q->umem_props = *umem_props; + q->size = size; + q->chunk_mask = chunk_mask; } static u32 xskq_umem_get_ring_size(struct xsk_queue *q) @@ -61,3 +64,56 @@ void xskq_destroy(struct xsk_queue *q) page_frag_free(q->ring); kfree(q); } + +struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) +{ + struct xdp_umem_fq_reuse *newq; + + /* Check for overflow */ + if (nentries > (u32)roundup_pow_of_two(nentries)) + return NULL; + nentries = roundup_pow_of_two(nentries); + + newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL); + if (!newq) + return NULL; + memset(newq, 0, offsetof(typeof(*newq), handles)); + + newq->nentries = nentries; + return newq; +} +EXPORT_SYMBOL_GPL(xsk_reuseq_prepare); + +struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, + struct xdp_umem_fq_reuse *newq) +{ + struct xdp_umem_fq_reuse *oldq = umem->fq_reuse; + + if (!oldq) { + umem->fq_reuse = newq; + return NULL; + } + + if (newq->nentries < oldq->length) + return newq; + + memcpy(newq->handles, oldq->handles, + array_size(oldq->length, sizeof(u64))); + newq->length = oldq->length; + + umem->fq_reuse = newq; + return oldq; +} +EXPORT_SYMBOL_GPL(xsk_reuseq_swap); + +void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) +{ + kvfree(rq); +} +EXPORT_SYMBOL_GPL(xsk_reuseq_free); + +void xsk_reuseq_destroy(struct xdp_umem *umem) +{ + xsk_reuseq_free(umem->fq_reuse); + umem->fq_reuse = NULL; +} diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 8a64b150be54..bcb5cbb40419 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -31,7 +31,8 @@ struct xdp_umem_ring { }; struct xsk_queue { - struct xdp_umem_props umem_props; + u64 chunk_mask; + u64 size; u32 ring_mask; u32 nentries; u32 prod_head; @@ -78,7 +79,7 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) { - if (addr >= q->umem_props.size) { + if (addr >= q->size) { q->invalid_descs++; return false; } @@ -92,7 +93,7 @@ static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr) struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; unsigned int idx = q->cons_tail & q->ring_mask; - *addr = READ_ONCE(ring->desc[idx]) & q->umem_props.chunk_mask; + *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask; if (xskq_is_valid_addr(q, *addr)) return addr; @@ -173,8 +174,8 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d) if (!xskq_is_valid_addr(q, d->addr)) return false; - if (((d->addr + d->len) & q->umem_props.chunk_mask) != - (d->addr & q->umem_props.chunk_mask)) { + if (((d->addr + d->len) & q->chunk_mask) != + (d->addr & q->chunk_mask)) { q->invalid_descs++; return false; } @@ -253,8 +254,11 @@ static inline bool xskq_empty_desc(struct xsk_queue *q) return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries; } -void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); +void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); +/* Executed by the core when the entire UMEM gets freed */ +void xsk_reuseq_destroy(struct xdp_umem *umem); + #endif /* _LINUX_XSK_QUEUE_H */ diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 4a9ee2d83158..140270a13d54 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -8,7 +8,6 @@ config XFRM config XFRM_OFFLOAD bool - depends on XFRM config XFRM_ALGO tristate diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 5611b7521020..144c137886b1 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -99,7 +99,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur do { struct sk_buff *nskb = skb2->next; - skb2->next = NULL; + skb_mark_not_on_list(skb2); xo = xfrm_offload(skb2); xo->flags |= XFRM_DEV_RESUME; @@ -192,9 +192,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { + xso->num_exthdrs = 0; + xso->flags = 0; xso->dev = NULL; dev_put(dev); - return err; + + if (err != -EOPNOTSUPP) + return err; } return 0; diff --git a/net/xfrm/xfrm_hash.c b/net/xfrm/xfrm_hash.c index 2ad33ce1ea17..eca8d84d99bf 100644 --- a/net/xfrm/xfrm_hash.c +++ b/net/xfrm/xfrm_hash.c @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h index 61be810389d8..ce66323102f9 100644 --- a/net/xfrm/xfrm_hash.h +++ b/net/xfrm/xfrm_hash.h @@ -13,7 +13,7 @@ static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr) static inline unsigned int __xfrm6_addr_hash(const xfrm_address_t *addr) { - return ntohl(addr->a6[2] ^ addr->a6[3]); + return jhash2((__force u32 *)addr->a6, 4, 0); } static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr, @@ -26,8 +26,7 @@ static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr, static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr) { - return ntohl(daddr->a6[2] ^ daddr->a6[3] ^ - saddr->a6[2] ^ saddr->a6[3]); + return __xfrm6_addr_hash(daddr) ^ __xfrm6_addr_hash(saddr); } static inline u32 __bits2mask32(__u8 bits) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index b89c9c7f8c5c..684c0bc01e2c 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -131,7 +131,7 @@ struct sec_path *secpath_dup(struct sec_path *src) sp->len = 0; sp->olen = 0; - memset(sp->ovec, 0, sizeof(sp->ovec[XFRM_MAX_OFFLOAD_DEPTH])); + memset(sp->ovec, 0, sizeof(sp->ovec)); if (src) { int i; @@ -458,6 +458,7 @@ resume: XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } + crypto_done = false; } while (!err); err = xfrm_rcv_cb(skb, family, x->type->proto, 0); diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 31acc6f33d98..d679fa0f44b3 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -116,6 +116,9 @@ static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) static void xfrmi_dev_free(struct net_device *dev) { + struct xfrm_if *xi = netdev_priv(dev); + + gro_cells_destroy(&xi->gro_cells); free_percpu(dev->tstats); } @@ -469,9 +472,9 @@ static int xfrmi4_err(struct sk_buff *skb, u32 info) } if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) - ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0); + ipv4_update_pmtu(skb, net, info, 0, protocol); else - ipv4_redirect(skb, net, 0, 0, protocol, 0); + ipv4_redirect(skb, net, 0, protocol); xfrm_state_put(x); return 0; @@ -561,9 +564,6 @@ static void xfrmi_get_stats64(struct net_device *dev, { int cpu; - if (!dev->tstats) - return; - for_each_possible_cpu(cpu) { struct pcpu_sw_netstats *stats; struct pcpu_sw_netstats tmp; @@ -742,7 +742,7 @@ nla_put_failure: return -EMSGSIZE; } -struct net *xfrmi_get_link_net(const struct net_device *dev) +static struct net *xfrmi_get_link_net(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 45ba07ab3e4f..4ae87c5ce2e3 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -100,6 +100,10 @@ static int xfrm_output_one(struct sk_buff *skb, int err) spin_unlock_bh(&x->lock); skb_dst_force(skb); + if (!skb_dst(skb)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + goto error_nolock; + } if (xfrm_offload(skb)) { x->type_offload->encap(x, skb); @@ -189,7 +193,7 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb struct sk_buff *nskb = segs->next; int err; - segs->next = NULL; + skb_mark_not_on_list(segs); err = xfrm_output2(net, sk, segs); if (unlikely(err)) { diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 3110c3fbee20..119a427d9b2b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -632,9 +632,9 @@ static void xfrm_hash_rebuild(struct work_struct *work) break; } if (newpos) - hlist_add_behind(&policy->bydst, newpos); + hlist_add_behind_rcu(&policy->bydst, newpos); else - hlist_add_head(&policy->bydst, chain); + hlist_add_head_rcu(&policy->bydst, chain); } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); @@ -774,9 +774,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) break; } if (newpos) - hlist_add_behind(&policy->bydst, newpos); + hlist_add_behind_rcu(&policy->bydst, newpos); else - hlist_add_head(&policy->bydst, chain); + hlist_add_head_rcu(&policy->bydst, chain); __xfrm_policy_link(policy, dir); /* After previous checking, family can either be AF_INET or AF_INET6 */ @@ -2491,6 +2491,10 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) } skb_dst_force(skb); + if (!skb_dst(skb)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); + return 0; + } dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index b669262682c9..dc4a9f1fb941 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2077,10 +2077,8 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen struct xfrm_mgr *km; struct xfrm_policy *pol = NULL; -#ifdef CONFIG_COMPAT if (in_compat_syscall()) return -EOPNOTSUPP; -#endif if (!optval && !optlen) { xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 4791aa8b8185..c9a84e22f5d5 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -151,10 +151,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, err = -EINVAL; switch (p->family) { case AF_INET: + if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) + goto out; + break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) + if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) + goto out; + break; #else err = -EAFNOSUPPORT; @@ -1001,7 +1007,7 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) int err; err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX, xfrma_policy, - NULL); + cb->extack); if (err < 0) return err; @@ -1396,10 +1402,16 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) switch (p->sel.family) { case AF_INET: + if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) + return -EINVAL; + break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) + if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) + return -EINVAL; + break; #else return -EAFNOSUPPORT; @@ -1480,6 +1492,9 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) (ut[i].family != prev_family)) return -EINVAL; + if (ut[i].mode >= XFRM_MODE_MAX) + return -EINVAL; + prev_family = ut[i].family; switch (ut[i].family) { @@ -2606,10 +2621,8 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, const struct xfrm_link *link; int type, err; -#ifdef CONFIG_COMPAT if (in_compat_syscall()) return -EOPNOTSUPP; -#endif type = nlh->nlmsg_type; if (type > XFRM_MSG_MAX) diff --git a/samples/Kconfig b/samples/Kconfig index bd133efc1a56..ad1ec7016d4c 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -1,5 +1,6 @@ menuconfig SAMPLES bool "Sample kernel code" + depends on !UML help You can build and test sample kernel code here. diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 36f9f41d094b..be0a961450bc 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -153,6 +153,7 @@ always += tcp_cong_kern.o always += tcp_iw_kern.o always += tcp_clamp_kern.o always += tcp_basertt_kern.o +always += tcp_tos_reflect_kern.o always += xdp_redirect_kern.o always += xdp_redirect_map_kern.o always += xdp_redirect_cpu_kern.o diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 904e775d1a44..e6d7e0fe155b 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c index 60c2b73d1b4d..216c7ecbbbe9 100644 --- a/samples/bpf/sampleip_user.c +++ b/samples/bpf/sampleip_user.c @@ -9,7 +9,6 @@ */ #include #include -#include #include #include #include diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c index f58acfc92556..f2f9dbc021b0 100644 --- a/samples/bpf/sockex2_kern.c +++ b/samples/bpf/sockex2_kern.c @@ -14,7 +14,7 @@ struct vlan_hdr { __be16 h_vlan_encapsulated_proto; }; -struct bpf_flow_keys { +struct flow_key_record { __be32 src; __be32 dst; union { @@ -59,7 +59,7 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) } static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, - struct bpf_flow_keys *flow) + struct flow_key_record *flow) { __u64 verlen; @@ -83,7 +83,7 @@ static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto } static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, - struct bpf_flow_keys *flow) + struct flow_key_record *flow) { *ip_proto = load_byte(skb, nhoff + offsetof(struct ipv6hdr, nexthdr)); @@ -96,7 +96,8 @@ static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_pro return nhoff; } -static inline bool flow_dissector(struct __sk_buff *skb, struct bpf_flow_keys *flow) +static inline bool flow_dissector(struct __sk_buff *skb, + struct flow_key_record *flow) { __u64 nhoff = ETH_HLEN; __u64 ip_proto; @@ -198,7 +199,7 @@ struct bpf_map_def SEC("maps") hash_map = { SEC("socket2") int bpf_prog2(struct __sk_buff *skb) { - struct bpf_flow_keys flow = {}; + struct flow_key_record flow = {}; struct pair *value; u32 key; diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c index 95907f8d2b17..c527b57d3ec8 100644 --- a/samples/bpf/sockex3_kern.c +++ b/samples/bpf/sockex3_kern.c @@ -61,7 +61,7 @@ struct vlan_hdr { __be16 h_vlan_encapsulated_proto; }; -struct bpf_flow_keys { +struct flow_key_record { __be32 src; __be32 dst; union { @@ -88,7 +88,7 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) } struct globals { - struct bpf_flow_keys flow; + struct flow_key_record flow; }; struct bpf_map_def SEC("maps") percpu_map = { @@ -114,14 +114,14 @@ struct pair { struct bpf_map_def SEC("maps") hash_map = { .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct bpf_flow_keys), + .key_size = sizeof(struct flow_key_record), .value_size = sizeof(struct pair), .max_entries = 1024, }; static void update_stats(struct __sk_buff *skb, struct globals *g) { - struct bpf_flow_keys key = g->flow; + struct flow_key_record key = g->flow; struct pair *value; value = bpf_map_lookup_elem(&hash_map, &key); diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c index 5ba3ae9d180b..9d02e0404719 100644 --- a/samples/bpf/sockex3_user.c +++ b/samples/bpf/sockex3_user.c @@ -13,7 +13,7 @@ #define PARSE_IP_PROG_FD (prog_fd[0]) #define PROG_ARRAY_FD (map_fd[0]) -struct bpf_flow_keys { +struct flow_key_record { __be32 src; __be32 dst; union { @@ -64,7 +64,7 @@ int main(int argc, char **argv) (void) f; for (i = 0; i < 5; i++) { - struct bpf_flow_keys key = {}, next_key; + struct flow_key_record key = {}, next_key; struct pair value; sleep(1); diff --git a/samples/bpf/tcp_tos_reflect_kern.c b/samples/bpf/tcp_tos_reflect_kern.c new file mode 100644 index 000000000000..d51dab19eca6 --- /dev/null +++ b/samples/bpf/tcp_tos_reflect_kern.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Facebook + * + * BPF program to automatically reflect TOS option from received syn packet + * + * Use load_sock_ops to load this BPF program. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define DEBUG 1 + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +SEC("sockops") +int bpf_basertt(struct bpf_sock_ops *skops) +{ + char header[sizeof(struct ipv6hdr)]; + struct ipv6hdr *hdr6; + struct iphdr *hdr; + int hdr_size = 0; + int save_syn = 1; + int tos = 0; + int rv = 0; + int op; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + switch (op) { + case BPF_SOCK_OPS_TCP_LISTEN_CB: + rv = bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN, + &save_syn, sizeof(save_syn)); + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + if (skops->family == AF_INET) + hdr_size = sizeof(struct iphdr); + else + hdr_size = sizeof(struct ipv6hdr); + rv = bpf_getsockopt(skops, SOL_TCP, TCP_SAVED_SYN, + header, hdr_size); + if (!rv) { + if (skops->family == AF_INET) { + hdr = (struct iphdr *) header; + tos = hdr->tos; + if (tos != 0) + bpf_setsockopt(skops, SOL_IP, IP_TOS, + &tos, sizeof(tos)); + } else { + hdr6 = (struct ipv6hdr *) header; + tos = ((hdr6->priority) << 4 | + (hdr6->flow_lbl[0]) >> 4); + if (tos) + bpf_setsockopt(skops, SOL_IPV6, + IPV6_TCLASS, + &tos, sizeof(tos)); + } + rv = 0; + } + break; + default: + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c index 180f9d813bca..d7b68ef5ba79 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/samples/bpf/test_cgrp2_attach2.c @@ -209,7 +209,7 @@ static int map_fd = -1; static int prog_load_cnt(int verdict, int val) { - int cgroup_storage_fd; + int cgroup_storage_fd, percpu_cgroup_storage_fd; if (map_fd < 0) map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); @@ -225,6 +225,14 @@ static int prog_load_cnt(int verdict, int val) return -1; } + percpu_cgroup_storage_fd = bpf_create_map( + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + sizeof(struct bpf_cgroup_storage_key), 8, 0, 0); + if (percpu_cgroup_storage_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + return -1; + } + struct bpf_insn prog[] = { BPF_MOV32_IMM(BPF_REG_0, 0), BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ @@ -235,11 +243,20 @@ static int prog_load_cnt(int verdict, int val) BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_LD_MAP_FD(BPF_REG_1, cgroup_storage_fd), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), BPF_MOV64_IMM(BPF_REG_1, val), BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_0, BPF_REG_1, 0, 0), + + BPF_LD_MAP_FD(BPF_REG_1, percpu_cgroup_storage_fd), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x1), + BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_3, 0), + BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ BPF_EXIT_INSN(), }; diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c index 4be4874ca2bc..2259f997a26c 100644 --- a/samples/bpf/test_current_task_under_cgroup_user.c +++ b/samples/bpf/test_current_task_under_cgroup_user.c @@ -11,7 +11,6 @@ #include #include #include "bpf_load.h" -#include #include "cgroup_helpers.h" #define CGROUP_PATH "/my-cgroup" diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c index 6c6b10f4c3ee..56466d010139 100644 --- a/samples/bpf/tracex3_user.c +++ b/samples/bpf/tracex3_user.c @@ -17,8 +17,6 @@ #include "bpf_load.h" #include "bpf_util.h" -#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) - #define SLOTS 100 static void clear_stats(int fd) diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c index d8806c41362e..b8ccd0802b3f 100644 --- a/samples/bpf/xdpsock_kern.c +++ b/samples/bpf/xdpsock_kern.c @@ -16,7 +16,7 @@ struct bpf_map_def SEC("maps") xsks_map = { .type = BPF_MAP_TYPE_XSKMAP, .key_size = sizeof(int), .value_size = sizeof(int), - .max_entries = 4, + .max_entries = MAX_SOCKS, }; struct bpf_map_def SEC("maps") rr_map = { diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 4914788b6727..57ecadc58403 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -118,7 +118,6 @@ struct xdpsock { unsigned long prev_tx_npkts; }; -#define MAX_SOCKS 4 static int num_socks; struct xdpsock *xsks[MAX_SOCKS]; @@ -596,7 +595,7 @@ static void dump_stats(void) prev_time = now; - for (i = 0; i < num_socks; i++) { + for (i = 0; i < num_socks && xsks[i]; i++) { char *fmt = "%-15s %'-11.0f %'-11lu\n"; double rx_pps, tx_pps; @@ -649,6 +648,8 @@ static struct option long_options[] = { {"xdp-skb", no_argument, 0, 'S'}, {"xdp-native", no_argument, 0, 'N'}, {"interval", required_argument, 0, 'n'}, + {"zero-copy", no_argument, 0, 'z'}, + {"copy", no_argument, 0, 'c'}, {0, 0, 0, 0} }; @@ -667,6 +668,8 @@ static void usage(const char *prog) " -S, --xdp-skb=n Use XDP skb-mod\n" " -N, --xdp-native=n Enfore XDP native mode\n" " -n, --interval=n Specify statistics update interval (default 1 sec).\n" + " -z, --zero-copy Force zero-copy mode.\n" + " -c, --copy Force copy mode.\n" "\n"; fprintf(stderr, str, prog); exit(EXIT_FAILURE); @@ -679,7 +682,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "rtli:q:psSNn:", long_options, + c = getopt_long(argc, argv, "rtli:q:psSNn:cz", long_options, &option_index); if (c == -1) break; @@ -716,6 +719,12 @@ static void parse_command_line(int argc, char **argv) case 'n': opt_interval = atoi(optarg); break; + case 'z': + opt_xdp_bind_flags |= XDP_ZEROCOPY; + break; + case 'c': + opt_xdp_bind_flags |= XDP_COPY; + break; default: usage(basename(argv[0])); } diff --git a/samples/mei/mei-amt-version.c b/samples/mei/mei-amt-version.c index 57d0d871dcf7..33e67bd1dc34 100644 --- a/samples/mei/mei-amt-version.c +++ b/samples/mei/mei-amt-version.c @@ -370,7 +370,7 @@ static uint32_t amt_host_if_call(struct amt_host_if *acmd, unsigned int expected_sz) { uint32_t in_buf_sz; - uint32_t out_buf_sz; + ssize_t out_buf_sz; ssize_t written; uint32_t status; struct amt_host_if_resp_header *msg_hdr; diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 2535c3677c7b..ca7960adf5a3 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -71,11 +71,19 @@ #define MBOCHS_NAME "mbochs" #define MBOCHS_CLASS_NAME "mbochs" +#define MBOCHS_EDID_REGION_INDEX VFIO_PCI_NUM_REGIONS +#define MBOCHS_NUM_REGIONS (MBOCHS_EDID_REGION_INDEX+1) + #define MBOCHS_CONFIG_SPACE_SIZE 0xff #define MBOCHS_MMIO_BAR_OFFSET PAGE_SIZE #define MBOCHS_MMIO_BAR_SIZE PAGE_SIZE -#define MBOCHS_MEMORY_BAR_OFFSET (MBOCHS_MMIO_BAR_OFFSET + \ +#define MBOCHS_EDID_OFFSET (MBOCHS_MMIO_BAR_OFFSET + \ MBOCHS_MMIO_BAR_SIZE) +#define MBOCHS_EDID_SIZE PAGE_SIZE +#define MBOCHS_MEMORY_BAR_OFFSET (MBOCHS_EDID_OFFSET + \ + MBOCHS_EDID_SIZE) + +#define MBOCHS_EDID_BLOB_OFFSET (MBOCHS_EDID_SIZE/2) #define STORE_LE16(addr, val) (*(u16 *)addr = val) #define STORE_LE32(addr, val) (*(u32 *)addr = val) @@ -95,16 +103,24 @@ MODULE_PARM_DESC(mem, "megabytes available to " MBOCHS_NAME " devices"); static const struct mbochs_type { const char *name; u32 mbytes; + u32 max_x; + u32 max_y; } mbochs_types[] = { { .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1, .mbytes = 4, + .max_x = 800, + .max_y = 600, }, { .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2, .mbytes = 16, + .max_x = 1920, + .max_y = 1440, }, { .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3, .mbytes = 64, + .max_x = 0, + .max_y = 0, }, }; @@ -115,6 +131,11 @@ static struct cdev mbochs_cdev; static struct device mbochs_dev; static int mbochs_used_mbytes; +struct vfio_region_info_ext { + struct vfio_region_info base; + struct vfio_region_info_cap_type type; +}; + struct mbochs_mode { u32 drm_format; u32 bytepp; @@ -144,13 +165,14 @@ struct mdev_state { u32 memory_bar_mask; struct mutex ops_lock; struct mdev_device *mdev; - struct vfio_device_info dev_info; const struct mbochs_type *type; u16 vbe[VBE_DISPI_INDEX_COUNT]; u64 memsize; struct page **pages; pgoff_t pagecount; + struct vfio_region_gfx_edid edid_regs; + u8 edid_blob[0x400]; struct list_head dmabufs; u32 active_id; @@ -342,10 +364,20 @@ static void handle_mmio_read(struct mdev_state *mdev_state, u16 offset, char *buf, u32 count) { struct device *dev = mdev_dev(mdev_state->mdev); + struct vfio_region_gfx_edid *edid; u16 reg16 = 0; int index; switch (offset) { + case 0x000 ... 0x3ff: /* edid block */ + edid = &mdev_state->edid_regs; + if (edid->link_state != VFIO_DEVICE_GFX_LINK_STATE_UP || + offset >= edid->edid_size) { + memset(buf, 0, count); + break; + } + memcpy(buf, mdev_state->edid_blob + offset, count); + break; case 0x500 ... 0x515: /* bochs dispi interface */ if (count != 2) goto unhandled; @@ -365,6 +397,44 @@ unhandled: } } +static void handle_edid_regs(struct mdev_state *mdev_state, u16 offset, + char *buf, u32 count, bool is_write) +{ + char *regs = (void *)&mdev_state->edid_regs; + + if (offset + count > sizeof(mdev_state->edid_regs)) + return; + if (count != 4) + return; + if (offset % 4) + return; + + if (is_write) { + switch (offset) { + case offsetof(struct vfio_region_gfx_edid, link_state): + case offsetof(struct vfio_region_gfx_edid, edid_size): + memcpy(regs + offset, buf, count); + break; + default: + /* read-only regs */ + break; + } + } else { + memcpy(buf, regs + offset, count); + } +} + +static void handle_edid_blob(struct mdev_state *mdev_state, u16 offset, + char *buf, u32 count, bool is_write) +{ + if (offset + count > mdev_state->edid_regs.edid_max_size) + return; + if (is_write) + memcpy(mdev_state->edid_blob + offset, buf, count); + else + memcpy(buf, mdev_state->edid_blob + offset, count); +} + static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, loff_t pos, bool is_write) { @@ -384,13 +454,25 @@ static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, memcpy(buf, (mdev_state->vconfig + pos), count); } else if (pos >= MBOCHS_MMIO_BAR_OFFSET && - pos + count <= MBOCHS_MEMORY_BAR_OFFSET) { + pos + count <= (MBOCHS_MMIO_BAR_OFFSET + + MBOCHS_MMIO_BAR_SIZE)) { pos -= MBOCHS_MMIO_BAR_OFFSET; if (is_write) handle_mmio_write(mdev_state, pos, buf, count); else handle_mmio_read(mdev_state, pos, buf, count); + } else if (pos >= MBOCHS_EDID_OFFSET && + pos + count <= (MBOCHS_EDID_OFFSET + + MBOCHS_EDID_SIZE)) { + pos -= MBOCHS_EDID_OFFSET; + if (pos < MBOCHS_EDID_BLOB_OFFSET) { + handle_edid_regs(mdev_state, pos, buf, count, is_write); + } else { + pos -= MBOCHS_EDID_BLOB_OFFSET; + handle_edid_blob(mdev_state, pos, buf, count, is_write); + } + } else if (pos >= MBOCHS_MEMORY_BAR_OFFSET && pos + count <= MBOCHS_MEMORY_BAR_OFFSET + mdev_state->memsize) { @@ -471,6 +553,10 @@ static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev) mdev_state->next_id = 1; mdev_state->type = type; + mdev_state->edid_regs.max_xres = type->max_x; + mdev_state->edid_regs.max_yres = type->max_y; + mdev_state->edid_regs.edid_offset = MBOCHS_EDID_BLOB_OFFSET; + mdev_state->edid_regs.edid_max_size = sizeof(mdev_state->edid_blob); mbochs_create_config_space(mdev_state); mbochs_reset(mdev); @@ -932,16 +1018,16 @@ static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf) } static int mbochs_get_region_info(struct mdev_device *mdev, - struct vfio_region_info *region_info, - u16 *cap_type_id, void **cap_type) + struct vfio_region_info_ext *ext) { + struct vfio_region_info *region_info = &ext->base; struct mdev_state *mdev_state; mdev_state = mdev_get_drvdata(mdev); if (!mdev_state) return -EINVAL; - if (region_info->index >= VFIO_PCI_NUM_REGIONS) + if (region_info->index >= MBOCHS_NUM_REGIONS) return -EINVAL; switch (region_info->index) { @@ -964,6 +1050,20 @@ static int mbochs_get_region_info(struct mdev_device *mdev, region_info->flags = (VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE); break; + case MBOCHS_EDID_REGION_INDEX: + ext->base.argsz = sizeof(*ext); + ext->base.offset = MBOCHS_EDID_OFFSET; + ext->base.size = MBOCHS_EDID_SIZE; + ext->base.flags = (VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_CAPS); + ext->base.cap_offset = offsetof(typeof(*ext), type); + ext->type.header.id = VFIO_REGION_INFO_CAP_TYPE; + ext->type.header.version = 1; + ext->type.header.next = 0; + ext->type.type = VFIO_REGION_TYPE_GFX; + ext->type.subtype = VFIO_REGION_SUBTYPE_GFX_EDID; + break; default: region_info->size = 0; region_info->offset = 0; @@ -984,7 +1084,7 @@ static int mbochs_get_device_info(struct mdev_device *mdev, struct vfio_device_info *dev_info) { dev_info->flags = VFIO_DEVICE_FLAGS_PCI; - dev_info->num_regions = VFIO_PCI_NUM_REGIONS; + dev_info->num_regions = MBOCHS_NUM_REGIONS; dev_info->num_irqs = VFIO_PCI_NUM_IRQS; return 0; } @@ -1084,7 +1184,7 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, unsigned long arg) { int ret = 0; - unsigned long minsz; + unsigned long minsz, outsz; struct mdev_state *mdev_state; mdev_state = mdev_get_drvdata(mdev); @@ -1106,8 +1206,6 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, if (ret) return ret; - memcpy(&mdev_state->dev_info, &info, sizeof(info)); - if (copy_to_user((void __user *)arg, &info, minsz)) return -EFAULT; @@ -1115,24 +1213,24 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, } case VFIO_DEVICE_GET_REGION_INFO: { - struct vfio_region_info info; - u16 cap_type_id = 0; - void *cap_type = NULL; + struct vfio_region_info_ext info; - minsz = offsetofend(struct vfio_region_info, offset); + minsz = offsetofend(typeof(info), base.offset); if (copy_from_user(&info, (void __user *)arg, minsz)) return -EFAULT; - if (info.argsz < minsz) + outsz = info.base.argsz; + if (outsz < minsz) + return -EINVAL; + if (outsz > sizeof(info)) return -EINVAL; - ret = mbochs_get_region_info(mdev, &info, &cap_type_id, - &cap_type); + ret = mbochs_get_region_info(mdev, &info); if (ret) return ret; - if (copy_to_user((void __user *)arg, &info, minsz)) + if (copy_to_user((void __user *)arg, &info, outsz)) return -EFAULT; return 0; @@ -1148,7 +1246,7 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, return -EFAULT; if ((info.argsz < minsz) || - (info.index >= mdev_state->dev_info.num_irqs)) + (info.index >= VFIO_PCI_NUM_IRQS)) return -EINVAL; ret = mbochs_get_irq_info(mdev, &info); diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index ce53639a864a..bb015551c2d9 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -115,7 +115,9 @@ __cc-option = $(call try-run,\ # Do not attempt to build with gcc plugins during cc-option tests. # (And this uses delayed resolution so the flags will be up to date.) -CC_OPTION_CFLAGS = $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) +# In addition, do not include the asm macros which are built later. +CC_OPTION_FILTERED = $(GCC_PLUGINS_CFLAGS) $(ASM_MACRO_FLAGS) +CC_OPTION_CFLAGS = $(filter-out $(CC_OPTION_FILTERED),$(KBUILD_CFLAGS)) # cc-option # Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586) @@ -138,17 +140,9 @@ cc-option-yn = $(call try-run,\ cc-disable-warning = $(call try-run,\ $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) -# cc-name -# Expands to either gcc or clang -cc-name = $(shell $(CC) -v 2>&1 | grep -q "clang version" && echo clang || echo gcc) - # cc-version cc-version = $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC)) -# cc-fullversion -cc-fullversion = $(shell $(CONFIG_SHELL) \ - $(srctree)/scripts/gcc-version.sh -p $(CC)) - # cc-ifversion # Usage: EXTRA_CFLAGS += $(call cc-ifversion, -lt, 0402, -O1) cc-ifversion = $(shell [ $(cc-version) $(1) $(2) ] && echo $(3) || echo $(4)) @@ -193,7 +187,7 @@ modbuiltin := -f $(srctree)/scripts/Makefile.modbuiltin obj # Shorthand for $(Q)$(MAKE) -f scripts/Makefile.dtbinst obj= # Usage: # $(Q)$(MAKE) $(dtbinst)=dir -dtbinst := -f $(if $(KBUILD_SRC),$(srctree)/)scripts/Makefile.dtbinst obj +dtbinst := -f $(srctree)/scripts/Makefile.dtbinst obj ### # Shorthand for $(Q)$(MAKE) -f scripts/Makefile.clean obj= diff --git a/scripts/Makefile b/scripts/Makefile index 61affa300d25..ece52ff20171 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -39,8 +39,7 @@ build_unifdef: $(obj)/unifdef subdir-$(CONFIG_MODVERSIONS) += genksyms subdir-y += mod subdir-$(CONFIG_SECURITY_SELINUX) += selinux -subdir-$(CONFIG_DTC) += dtc subdir-$(CONFIG_GDB_SCRIPTS) += gdb # Let clean descend into subdirs -subdir- += basic kconfig package gcc-plugins +subdir- += basic dtc kconfig package gcc-plugins diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 5a2d1c9578a0..a8e7ba9f73e8 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -36,21 +36,11 @@ subdir-ccflags-y := include scripts/Kbuild.include -# For backward compatibility check that these variables do not change -save-cflags := $(CFLAGS) - # The filename Kbuild has precedence over Makefile kbuild-dir := $(if $(filter /%,$(src)),$(src),$(srctree)/$(src)) kbuild-file := $(if $(wildcard $(kbuild-dir)/Kbuild),$(kbuild-dir)/Kbuild,$(kbuild-dir)/Makefile) include $(kbuild-file) -# If the save-* variables changed error out -ifeq ($(KBUILD_NOPEDANTIC),) - ifneq ("$(save-cflags)","$(CFLAGS)") - $(error CFLAGS was changed in "$(kbuild-file)". Fix it to use ccflags-y) - endif -endif - include scripts/Makefile.lib # Do not include host rules unless needed @@ -83,14 +73,12 @@ __build: $(if $(KBUILD_BUILTIN),$(builtin-target) $(lib-target) $(extra-y)) \ @: # Linus' kernel sanity checking tool -ifneq ($(KBUILD_CHECKSRC),0) - ifeq ($(KBUILD_CHECKSRC),2) - quiet_cmd_force_checksrc = CHECK $< - cmd_force_checksrc = $(CHECK) $(CHECKFLAGS) $(c_flags) $< ; - else - quiet_cmd_checksrc = CHECK $< - cmd_checksrc = $(CHECK) $(CHECKFLAGS) $(c_flags) $< ; - endif +ifeq ($(KBUILD_CHECKSRC),1) + quiet_cmd_checksrc = CHECK $< + cmd_checksrc = $(CHECK) $(CHECKFLAGS) $(c_flags) $< ; +else ifeq ($(KBUILD_CHECKSRC),2) + quiet_cmd_force_checksrc = CHECK $< + cmd_force_checksrc = $(CHECK) $(CHECKFLAGS) $(c_flags) $< ; endif ifneq ($(KBUILD_ENABLE_EXTRA_GCC_CHECKS),) @@ -219,7 +207,7 @@ else sub_cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \ "$(if $(CONFIG_CPU_BIG_ENDIAN),big,little)" \ "$(if $(CONFIG_64BIT),64,32)" \ - "$(OBJDUMP)" "$(OBJCOPY)" "$(CC) $(KBUILD_CFLAGS)" \ + "$(OBJDUMP)" "$(OBJCOPY)" "$(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS)" \ "$(LD) $(KBUILD_LDFLAGS)" "$(NM)" "$(RM)" "$(MV)" \ "$(if $(part-of-module),1,0)" "$(@)"; recordmcount_source := $(srctree)/scripts/recordmcount.pl @@ -495,28 +483,12 @@ targets += $(obj)/lib-ksyms.o endif -# -# Rule to link composite objects -# -# Composite objects are specified in kbuild makefile as follows: -# -objs := -# or -# -y := -# or -# -m := -# The -m syntax only works if is a module -link_multi_deps = \ -$(filter $(addprefix $(obj)/, \ -$($(subst $(obj)/,,$(@:.o=-objs))) \ -$($(subst $(obj)/,,$(@:.o=-y))) \ -$($(subst $(obj)/,,$(@:.o=-m)))), $^) - quiet_cmd_link_multi-m = LD [M] $@ -cmd_link_multi-m = $(LD) $(ld_flags) -r -o $@ $(link_multi_deps) $(cmd_secanalysis) +cmd_link_multi-m = $(LD) $(ld_flags) -r -o $@ $(filter %.o,$^) $(cmd_secanalysis) $(multi-used-m): FORCE $(call if_changed,link_multi-m) - @{ echo $(@:.o=.ko); echo $(link_multi_deps); \ + @{ echo $(@:.o=.ko); echo $(filter %.o,$^); \ $(cmd_undef_syms); } > $(MODVERDIR)/$(@F:.o=.mod) $(call multi_depend, $(multi-used-m), .o, -objs -y -m) diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index 8d5357053f86..768306add591 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -29,6 +29,7 @@ warning-1 += $(call cc-option, -Wmissing-include-dirs) warning-1 += $(call cc-option, -Wunused-but-set-variable) warning-1 += $(call cc-option, -Wunused-const-variable) warning-1 += $(call cc-option, -Wpacked-not-aligned) +warning-1 += $(call cc-option, -Wstringop-truncation) warning-1 += $(call cc-disable-warning, missing-field-initializers) warning-1 += $(call cc-disable-warning, sign-compare) @@ -52,7 +53,6 @@ warning-3 += -Wpointer-arith warning-3 += -Wredundant-decls warning-3 += -Wswitch-default warning-3 += $(call cc-option, -Wpacked-bitfield-compat) -warning-3 += $(call cc-option, -Wvla) warning := $(warning-$(findstring 1, $(KBUILD_ENABLE_EXTRA_GCC_CHECKS))) warning += $(warning-$(findstring 2, $(KBUILD_ENABLE_EXTRA_GCC_CHECKS))) @@ -65,7 +65,7 @@ endif KBUILD_CFLAGS += $(warning) else -ifeq ($(cc-name),clang) +ifdef CONFIG_CC_IS_CLANG KBUILD_CFLAGS += $(call cc-disable-warning, initializer-overrides) KBUILD_CFLAGS += $(call cc-disable-warning, unused-value) KBUILD_CFLAGS += $(call cc-disable-warning, format) diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins index 0a482f341576..46c5c6809806 100644 --- a/scripts/Makefile.gcc-plugins +++ b/scripts/Makefile.gcc-plugins @@ -26,6 +26,16 @@ gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_RANDSTRUCT) \ gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_RANDSTRUCT_PERFORMANCE) \ += -fplugin-arg-randomize_layout_plugin-performance-mode +gcc-plugin-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak_plugin.so +gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK) \ + += -DSTACKLEAK_PLUGIN +gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK) \ + += -fplugin-arg-stackleak_plugin-track-min-size=$(CONFIG_STACKLEAK_TRACK_MIN_SIZE) +ifdef CONFIG_GCC_PLUGIN_STACKLEAK + DISABLE_STACKLEAK_PLUGIN += -fplugin-arg-stackleak_plugin-disable +endif +export DISABLE_STACKLEAK_PLUGIN + # All the plugin CFLAGS are collected here in case a build target needs to # filter them out of the KBUILD_CFLAGS. GCC_PLUGINS_CFLAGS := $(strip $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y)) $(gcc-plugin-cflags-y)) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 61e596650ed3..8fe4468f9bda 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -283,7 +283,7 @@ $(obj)/%.dtb.S: $(obj)/%.dtb FORCE quiet_cmd_dtc = DTC $@ cmd_dtc = mkdir -p $(dir ${dtc-tmp}) ; \ - $(CPP) $(dtc_cpp_flags) -x assembler-with-cpp -o $(dtc-tmp) $< ; \ + $(HOSTCC) -E $(dtc_cpp_flags) -x assembler-with-cpp -o $(dtc-tmp) $< ; \ $(DTC) -O dtb -o $@ -b 0 \ $(addprefix -i,$(dir $<) $(DTC_INCLUDE)) $(DTC_FLAGS) \ -d $(depfile).dtc.tmp $(dtc-tmp) ; \ diff --git a/scripts/asn1_compiler.c b/scripts/asn1_compiler.c index c146020fc783..1b28787028d3 100644 --- a/scripts/asn1_compiler.c +++ b/scripts/asn1_compiler.c @@ -413,7 +413,7 @@ static void tokenise(char *buffer, char *end) /* Handle string tokens */ if (isalpha(*p)) { - const char **dir, *start = p; + const char **dir; /* Can be a directive, type name or element * name. Find the end of the name. diff --git a/scripts/check_00index.sh b/scripts/check_00index.sh deleted file mode 100755 index aa47f5926c80..000000000000 --- a/scripts/check_00index.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -cd Documentation/ - -# Check entries that should be removed - -obsolete="" -for i in $(tail -n +12 00-INDEX |grep -E '^[a-zA-Z0-9]+'); do - if [ ! -e $i ]; then - obsolete="$obsolete $i" - fi -done - -# Check directory entries that should be added -search="" -dir="" -for i in $(find . -maxdepth 1 -type d); do - if [ "$i" != "." ]; then - new=$(echo $i|perl -ne 's,./(.*),$1/,; print $_') - search="$search $new" - fi -done - -for i in $search; do - if [ "$(grep -P "^$i" 00-INDEX)" == "" ]; then - dir="$dir $i" - fi -done - -# Check file entries that should be added -search="" -file="" -for i in $(find . -maxdepth 1 -type f); do - if [ "$i" != "./.gitignore" ]; then - new=$(echo $i|perl -ne 's,./(.*),$1,; print $_') - search="$search $new" - fi -done - -for i in $search; do - if [ "$(grep -P "^$i\$" 00-INDEX)" == "" ]; then - file="$file $i" - fi -done - -# Output its findings - -echo -e "Documentation/00-INDEX check results:\n" - -if [ "$obsolete" != "" ]; then - echo -e "- Should remove those entries:\n\t$obsolete\n" -else - echo -e "- No obsolete entries\n" -fi - -if [ "$dir" != "" ]; then - echo -e "- Should document those directories:\n\t$dir\n" -else - echo -e "- No new directories to add\n" -fi - -if [ "$file" != "" ]; then - echo -e "- Should document those files:\n\t$file" -else - echo "- No new files to add" -fi diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 161b0224d6ae..c883ec55654f 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4934,17 +4934,6 @@ sub process { while ($line =~ m{($Constant|$Lval)}g) { my $var = $1; -#gcc binary extension - if ($var =~ /^$Binary$/) { - if (WARN("GCC_BINARY_CONSTANT", - "Avoid gcc v4.3+ binary constant extension: <$var>\n" . $herecurr) && - $fix) { - my $hexval = sprintf("0x%x", oct($var)); - $fixed[$fixlinenr] =~ - s/\b$var\b/$hexval/; - } - } - #CamelCase if ($var !~ /^$Constant$/ && $var =~ /[A-Z][a-z]|[a-z][A-Z]/ && diff --git a/scripts/dtc/Makefile b/scripts/dtc/Makefile index 1c943e03eaf2..056d5da6c477 100644 --- a/scripts/dtc/Makefile +++ b/scripts/dtc/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # scripts/dtc makefile -hostprogs-y := dtc +hostprogs-$(CONFIG_DTC) := dtc always := $(hostprogs-y) dtc-objs := dtc.o flattree.o fstree.o data.o livetree.o treesource.o \ @@ -11,6 +11,13 @@ dtc-objs += dtc-lexer.lex.o dtc-parser.tab.o # Source files need to get at the userspace version of libfdt_env.h to compile HOST_EXTRACFLAGS := -I$(src)/libfdt +ifeq ($(wildcard /usr/include/yaml.h),) +HOST_EXTRACFLAGS += -DNO_YAML +else +dtc-objs += yamltree.o +HOSTLDLIBS_dtc := -lyaml +endif + # Generated files need one more search path to include headers in source tree HOSTCFLAGS_dtc-lexer.lex.o := -I$(src) HOSTCFLAGS_dtc-parser.tab.o := -I$(src) diff --git a/scripts/dtc/Makefile.dtc b/scripts/dtc/Makefile.dtc index bece49b35535..d4375630a7f7 100644 --- a/scripts/dtc/Makefile.dtc +++ b/scripts/dtc/Makefile.dtc @@ -14,5 +14,9 @@ DTC_SRCS = \ treesource.c \ util.c +ifneq ($(NO_YAML),1) +DTC_SRCS += yamltree.c +endif + DTC_GEN_SRCS = dtc-lexer.lex.c dtc-parser.tab.c DTC_OBJS = $(DTC_SRCS:%.c=%.o) $(DTC_GEN_SRCS:%.c=%.o) diff --git a/scripts/dtc/checks.c b/scripts/dtc/checks.c index a2cc1036c915..9c9b0c328af6 100644 --- a/scripts/dtc/checks.c +++ b/scripts/dtc/checks.c @@ -962,6 +962,143 @@ static void check_simple_bus_reg(struct check *c, struct dt_info *dti, struct no } WARNING(simple_bus_reg, check_simple_bus_reg, NULL, ®_format, &simple_bus_bridge); +static const struct bus_type i2c_bus = { + .name = "i2c-bus", +}; + +static void check_i2c_bus_bridge(struct check *c, struct dt_info *dti, struct node *node) +{ + if (strprefixeq(node->name, node->basenamelen, "i2c-bus") || + strprefixeq(node->name, node->basenamelen, "i2c-arb")) { + node->bus = &i2c_bus; + } else if (strprefixeq(node->name, node->basenamelen, "i2c")) { + struct node *child; + for_each_child(node, child) { + if (strprefixeq(child->name, node->basenamelen, "i2c-bus")) + return; + } + node->bus = &i2c_bus; + } else + return; + + if (!node->children) + return; + + if (node_addr_cells(node) != 1) + FAIL(c, dti, node, "incorrect #address-cells for I2C bus"); + if (node_size_cells(node) != 0) + FAIL(c, dti, node, "incorrect #size-cells for I2C bus"); + +} +WARNING(i2c_bus_bridge, check_i2c_bus_bridge, NULL, &addr_size_cells); + +static void check_i2c_bus_reg(struct check *c, struct dt_info *dti, struct node *node) +{ + struct property *prop; + const char *unitname = get_unitname(node); + char unit_addr[17]; + uint32_t reg = 0; + int len; + cell_t *cells = NULL; + + if (!node->parent || (node->parent->bus != &i2c_bus)) + return; + + prop = get_property(node, "reg"); + if (prop) + cells = (cell_t *)prop->val.val; + + if (!cells) { + FAIL(c, dti, node, "missing or empty reg property"); + return; + } + + reg = fdt32_to_cpu(*cells); + snprintf(unit_addr, sizeof(unit_addr), "%x", reg); + if (!streq(unitname, unit_addr)) + FAIL(c, dti, node, "I2C bus unit address format error, expected \"%s\"", + unit_addr); + + for (len = prop->val.len; len > 0; len -= 4) { + reg = fdt32_to_cpu(*(cells++)); + if (reg > 0x3ff) + FAIL_PROP(c, dti, node, prop, "I2C address must be less than 10-bits, got \"0x%x\"", + reg); + + } +} +WARNING(i2c_bus_reg, check_i2c_bus_reg, NULL, ®_format, &i2c_bus_bridge); + +static const struct bus_type spi_bus = { + .name = "spi-bus", +}; + +static void check_spi_bus_bridge(struct check *c, struct dt_info *dti, struct node *node) +{ + + if (strprefixeq(node->name, node->basenamelen, "spi")) { + node->bus = &spi_bus; + } else { + /* Try to detect SPI buses which don't have proper node name */ + struct node *child; + + if (node_addr_cells(node) != 1 || node_size_cells(node) != 0) + return; + + for_each_child(node, child) { + struct property *prop; + for_each_property(child, prop) { + if (strprefixeq(prop->name, 4, "spi-")) { + node->bus = &spi_bus; + break; + } + } + if (node->bus == &spi_bus) + break; + } + + if (node->bus == &spi_bus && get_property(node, "reg")) + FAIL(c, dti, node, "node name for SPI buses should be 'spi'"); + } + if (node->bus != &spi_bus || !node->children) + return; + + if (node_addr_cells(node) != 1) + FAIL(c, dti, node, "incorrect #address-cells for SPI bus"); + if (node_size_cells(node) != 0) + FAIL(c, dti, node, "incorrect #size-cells for SPI bus"); + +} +WARNING(spi_bus_bridge, check_spi_bus_bridge, NULL, &addr_size_cells); + +static void check_spi_bus_reg(struct check *c, struct dt_info *dti, struct node *node) +{ + struct property *prop; + const char *unitname = get_unitname(node); + char unit_addr[9]; + uint32_t reg = 0; + cell_t *cells = NULL; + + if (!node->parent || (node->parent->bus != &spi_bus)) + return; + + prop = get_property(node, "reg"); + if (prop) + cells = (cell_t *)prop->val.val; + + if (!cells) { + FAIL(c, dti, node, "missing or empty reg property"); + return; + } + + reg = fdt32_to_cpu(*cells); + snprintf(unit_addr, sizeof(unit_addr), "%x", reg); + if (!streq(unitname, unit_addr)) + FAIL(c, dti, node, "SPI bus unit address format error, expected \"%s\"", + unit_addr); +} +WARNING(spi_bus_reg, check_spi_bus_reg, NULL, ®_format, &spi_bus_bridge); + static void check_unit_address_format(struct check *c, struct dt_info *dti, struct node *node) { @@ -1582,6 +1719,12 @@ static struct check *check_table[] = { &simple_bus_bridge, &simple_bus_reg, + &i2c_bus_bridge, + &i2c_bus_reg, + + &spi_bus_bridge, + &spi_bus_reg, + &avoid_default_addr_size, &avoid_unnecessary_addr_size, &unique_unit_address, diff --git a/scripts/dtc/data.c b/scripts/dtc/data.c index aa37a16c8891..4a204145cc7b 100644 --- a/scripts/dtc/data.c +++ b/scripts/dtc/data.c @@ -74,7 +74,8 @@ struct data data_copy_escape_string(const char *s, int len) struct data d; char *q; - d = data_grow_for(empty_data, len + 1); + d = data_add_marker(empty_data, TYPE_STRING, NULL); + d = data_grow_for(d, len + 1); q = d.val; while (i < len) { @@ -94,6 +95,7 @@ struct data data_copy_file(FILE *f, size_t maxlen) { struct data d = empty_data; + d = data_add_marker(d, TYPE_NONE, NULL); while (!feof(f) && (d.len < maxlen)) { size_t chunksize, ret; diff --git a/scripts/dtc/dtc-parser.y b/scripts/dtc/dtc-parser.y index 011a5b25539a..dd70ebf386f4 100644 --- a/scripts/dtc/dtc-parser.y +++ b/scripts/dtc/dtc-parser.y @@ -287,6 +287,7 @@ propdata: } | propdataprefix DT_REF { + $1 = data_add_marker($1, TYPE_STRING, $2); $$ = data_add_marker($1, REF_PATH, $2); } | propdataprefix DT_INCBIN '(' DT_STRING ',' integer_prim ',' integer_prim ')' @@ -340,22 +341,27 @@ arrayprefix: DT_BITS DT_LITERAL '<' { unsigned long long bits; + enum markertype type = TYPE_UINT32; bits = $2; - if ((bits != 8) && (bits != 16) && - (bits != 32) && (bits != 64)) { + switch (bits) { + case 8: type = TYPE_UINT8; break; + case 16: type = TYPE_UINT16; break; + case 32: type = TYPE_UINT32; break; + case 64: type = TYPE_UINT64; break; + default: ERROR(&@2, "Array elements must be" " 8, 16, 32 or 64-bits"); bits = 32; } - $$.data = empty_data; + $$.data = data_add_marker(empty_data, type, NULL); $$.bits = bits; } | '<' { - $$.data = empty_data; + $$.data = data_add_marker(empty_data, TYPE_UINT32, NULL); $$.bits = 32; } | arrayprefix integer_prim @@ -499,7 +505,7 @@ integer_unary: bytestring: /* empty */ { - $$ = empty_data; + $$ = data_add_marker(empty_data, TYPE_UINT8, NULL); } | bytestring DT_BYTE { diff --git a/scripts/dtc/dtc.c b/scripts/dtc/dtc.c index c36994e6eac5..64134aadb997 100644 --- a/scripts/dtc/dtc.c +++ b/scripts/dtc/dtc.c @@ -95,6 +95,9 @@ static const char * const usage_opts_help[] = { "\n\tOutput formats are:\n" "\t\tdts - device tree source text\n" "\t\tdtb - device tree blob\n" +#ifndef NO_YAML + "\t\tyaml - device tree encoded as YAML\n" +#endif "\t\tasm - assembler source", "\n\tBlob version to produce, defaults to "stringify(DEFAULT_FDT_VERSION)" (for dtb and asm output)", "\n\tOutput dependency file", @@ -128,6 +131,8 @@ static const char *guess_type_by_name(const char *fname, const char *fallback) return fallback; if (!strcasecmp(s, ".dts")) return "dts"; + if (!strcasecmp(s, ".yaml")) + return "yaml"; if (!strcasecmp(s, ".dtb")) return "dtb"; return fallback; @@ -350,6 +355,12 @@ int main(int argc, char *argv[]) if (streq(outform, "dts")) { dt_to_source(outf, dti); +#ifndef NO_YAML + } else if (streq(outform, "yaml")) { + if (!streq(inform, "dts")) + die("YAML output format requires dts input format\n"); + dt_to_yaml(outf, dti); +#endif } else if (streq(outform, "dtb")) { dt_to_blob(outf, dti, outversion); } else if (streq(outform, "asm")) { diff --git a/scripts/dtc/dtc.h b/scripts/dtc/dtc.h index 6d667701ab6a..cbe541525c2c 100644 --- a/scripts/dtc/dtc.h +++ b/scripts/dtc/dtc.h @@ -74,10 +74,17 @@ typedef uint32_t cell_t; /* Data blobs */ enum markertype { + TYPE_NONE, REF_PHANDLE, REF_PATH, LABEL, + TYPE_UINT8, + TYPE_UINT16, + TYPE_UINT32, + TYPE_UINT64, + TYPE_STRING, }; +extern const char *markername(enum markertype markertype); struct marker { enum markertype type; @@ -101,6 +108,8 @@ struct data { for_each_marker(m) \ if ((m)->type == (t)) +size_t type_marker_length(struct marker *m); + void data_free(struct data d); struct data data_grow_for(struct data d, int xlen); @@ -290,6 +299,10 @@ struct dt_info *dt_from_blob(const char *fname); void dt_to_source(FILE *f, struct dt_info *dti); struct dt_info *dt_from_source(const char *f); +/* YAML source */ + +void dt_to_yaml(FILE *f, struct dt_info *dti); + /* FS trees */ struct dt_info *dt_from_fs(const char *dirname); diff --git a/scripts/dtc/flattree.c b/scripts/dtc/flattree.c index 8d268fb785db..851ea87dbc0f 100644 --- a/scripts/dtc/flattree.c +++ b/scripts/dtc/flattree.c @@ -393,7 +393,7 @@ void dt_to_blob(FILE *f, struct dt_info *dti, int version) padlen = 0; if (quiet < 1) fprintf(stderr, - "Warning: blob size %d >= minimum size %d\n", + "Warning: blob size %"PRIu32" >= minimum size %d\n", fdt32_to_cpu(fdt.totalsize), minsize); } } diff --git a/scripts/dtc/libfdt/fdt.c b/scripts/dtc/libfdt/fdt.c index 7855a1787763..ae03b1112961 100644 --- a/scripts/dtc/libfdt/fdt.c +++ b/scripts/dtc/libfdt/fdt.c @@ -55,7 +55,12 @@ #include "libfdt_internal.h" -int fdt_check_header(const void *fdt) +/* + * Minimal sanity check for a read-only tree. fdt_ro_probe_() checks + * that the given buffer contains what appears to be a flattened + * device tree with sane information in its header. + */ +int fdt_ro_probe_(const void *fdt) { if (fdt_magic(fdt) == FDT_MAGIC) { /* Complete tree */ @@ -74,6 +79,78 @@ int fdt_check_header(const void *fdt) return 0; } +static int check_off_(uint32_t hdrsize, uint32_t totalsize, uint32_t off) +{ + return (off >= hdrsize) && (off <= totalsize); +} + +static int check_block_(uint32_t hdrsize, uint32_t totalsize, + uint32_t base, uint32_t size) +{ + if (!check_off_(hdrsize, totalsize, base)) + return 0; /* block start out of bounds */ + if ((base + size) < base) + return 0; /* overflow */ + if (!check_off_(hdrsize, totalsize, base + size)) + return 0; /* block end out of bounds */ + return 1; +} + +size_t fdt_header_size_(uint32_t version) +{ + if (version <= 1) + return FDT_V1_SIZE; + else if (version <= 2) + return FDT_V2_SIZE; + else if (version <= 3) + return FDT_V3_SIZE; + else if (version <= 16) + return FDT_V16_SIZE; + else + return FDT_V17_SIZE; +} + +int fdt_check_header(const void *fdt) +{ + size_t hdrsize; + + if (fdt_magic(fdt) != FDT_MAGIC) + return -FDT_ERR_BADMAGIC; + hdrsize = fdt_header_size(fdt); + if ((fdt_version(fdt) < FDT_FIRST_SUPPORTED_VERSION) + || (fdt_last_comp_version(fdt) > FDT_LAST_SUPPORTED_VERSION)) + return -FDT_ERR_BADVERSION; + if (fdt_version(fdt) < fdt_last_comp_version(fdt)) + return -FDT_ERR_BADVERSION; + + if ((fdt_totalsize(fdt) < hdrsize) + || (fdt_totalsize(fdt) > INT_MAX)) + return -FDT_ERR_TRUNCATED; + + /* Bounds check memrsv block */ + if (!check_off_(hdrsize, fdt_totalsize(fdt), fdt_off_mem_rsvmap(fdt))) + return -FDT_ERR_TRUNCATED; + + /* Bounds check structure block */ + if (fdt_version(fdt) < 17) { + if (!check_off_(hdrsize, fdt_totalsize(fdt), + fdt_off_dt_struct(fdt))) + return -FDT_ERR_TRUNCATED; + } else { + if (!check_block_(hdrsize, fdt_totalsize(fdt), + fdt_off_dt_struct(fdt), + fdt_size_dt_struct(fdt))) + return -FDT_ERR_TRUNCATED; + } + + /* Bounds check strings block */ + if (!check_block_(hdrsize, fdt_totalsize(fdt), + fdt_off_dt_strings(fdt), fdt_size_dt_strings(fdt))) + return -FDT_ERR_TRUNCATED; + + return 0; +} + const void *fdt_offset_ptr(const void *fdt, int offset, unsigned int len) { unsigned absoffset = offset + fdt_off_dt_struct(fdt); @@ -244,7 +321,7 @@ const char *fdt_find_string_(const char *strtab, int tabsize, const char *s) int fdt_move(const void *fdt, void *buf, int bufsize) { - FDT_CHECK_HEADER(fdt); + FDT_RO_PROBE(fdt); if (fdt_totalsize(fdt) > bufsize) return -FDT_ERR_NOSPACE; diff --git a/scripts/dtc/libfdt/fdt_addresses.c b/scripts/dtc/libfdt/fdt_addresses.c index eff4dbcc729d..49537b578d03 100644 --- a/scripts/dtc/libfdt/fdt_addresses.c +++ b/scripts/dtc/libfdt/fdt_addresses.c @@ -1,6 +1,7 @@ /* * libfdt - Flat Device Tree manipulation * Copyright (C) 2014 David Gibson + * Copyright (C) 2018 embedded brains GmbH * * libfdt is dual licensed: you can use it either under the terms of * the GPL, or the BSD license, at your option. @@ -55,42 +56,32 @@ #include "libfdt_internal.h" -int fdt_address_cells(const void *fdt, int nodeoffset) +static int fdt_cells(const void *fdt, int nodeoffset, const char *name) { - const fdt32_t *ac; + const fdt32_t *c; int val; int len; - ac = fdt_getprop(fdt, nodeoffset, "#address-cells", &len); - if (!ac) + c = fdt_getprop(fdt, nodeoffset, name, &len); + if (!c) return 2; - if (len != sizeof(*ac)) + if (len != sizeof(*c)) return -FDT_ERR_BADNCELLS; - val = fdt32_to_cpu(*ac); + val = fdt32_to_cpu(*c); if ((val <= 0) || (val > FDT_MAX_NCELLS)) return -FDT_ERR_BADNCELLS; return val; } -int fdt_size_cells(const void *fdt, int nodeoffset) +int fdt_address_cells(const void *fdt, int nodeoffset) { - const fdt32_t *sc; - int val; - int len; - - sc = fdt_getprop(fdt, nodeoffset, "#size-cells", &len); - if (!sc) - return 2; - - if (len != sizeof(*sc)) - return -FDT_ERR_BADNCELLS; - - val = fdt32_to_cpu(*sc); - if ((val < 0) || (val > FDT_MAX_NCELLS)) - return -FDT_ERR_BADNCELLS; + return fdt_cells(fdt, nodeoffset, "#address-cells"); +} - return val; +int fdt_size_cells(const void *fdt, int nodeoffset) +{ + return fdt_cells(fdt, nodeoffset, "#size-cells"); } diff --git a/scripts/dtc/libfdt/fdt_overlay.c b/scripts/dtc/libfdt/fdt_overlay.c index bf75388ec9a2..5fdab6c6371d 100644 --- a/scripts/dtc/libfdt/fdt_overlay.c +++ b/scripts/dtc/libfdt/fdt_overlay.c @@ -697,7 +697,7 @@ static int get_path_len(const void *fdt, int nodeoffset) int len = 0, namelen; const char *name; - FDT_CHECK_HEADER(fdt); + FDT_RO_PROBE(fdt); for (;;) { name = fdt_get_name(fdt, nodeoffset, &namelen); @@ -866,8 +866,8 @@ int fdt_overlay_apply(void *fdt, void *fdto) uint32_t delta = fdt_get_max_phandle(fdt); int ret; - FDT_CHECK_HEADER(fdt); - FDT_CHECK_HEADER(fdto); + FDT_RO_PROBE(fdt); + FDT_RO_PROBE(fdto); ret = overlay_adjust_local_phandles(fdto, delta); if (ret) diff --git a/scripts/dtc/libfdt/fdt_ro.c b/scripts/dtc/libfdt/fdt_ro.c index dfb3236da388..eafc14282892 100644 --- a/scripts/dtc/libfdt/fdt_ro.c +++ b/scripts/dtc/libfdt/fdt_ro.c @@ -76,17 +76,72 @@ static int fdt_nodename_eq_(const void *fdt, int offset, return 0; } +const char *fdt_get_string(const void *fdt, int stroffset, int *lenp) +{ + uint32_t absoffset = stroffset + fdt_off_dt_strings(fdt); + size_t len; + int err; + const char *s, *n; + + err = fdt_ro_probe_(fdt); + if (err != 0) + goto fail; + + err = -FDT_ERR_BADOFFSET; + if (absoffset >= fdt_totalsize(fdt)) + goto fail; + len = fdt_totalsize(fdt) - absoffset; + + if (fdt_magic(fdt) == FDT_MAGIC) { + if (stroffset < 0) + goto fail; + if (fdt_version(fdt) >= 17) { + if (stroffset >= fdt_size_dt_strings(fdt)) + goto fail; + if ((fdt_size_dt_strings(fdt) - stroffset) < len) + len = fdt_size_dt_strings(fdt) - stroffset; + } + } else if (fdt_magic(fdt) == FDT_SW_MAGIC) { + if ((stroffset >= 0) + || (stroffset < -fdt_size_dt_strings(fdt))) + goto fail; + if ((-stroffset) < len) + len = -stroffset; + } else { + err = -FDT_ERR_INTERNAL; + goto fail; + } + + s = (const char *)fdt + absoffset; + n = memchr(s, '\0', len); + if (!n) { + /* missing terminating NULL */ + err = -FDT_ERR_TRUNCATED; + goto fail; + } + + if (lenp) + *lenp = n - s; + return s; + +fail: + if (lenp) + *lenp = err; + return NULL; +} + const char *fdt_string(const void *fdt, int stroffset) { - return (const char *)fdt + fdt_off_dt_strings(fdt) + stroffset; + return fdt_get_string(fdt, stroffset, NULL); } static int fdt_string_eq_(const void *fdt, int stroffset, const char *s, int len) { - const char *p = fdt_string(fdt, stroffset); + int slen; + const char *p = fdt_get_string(fdt, stroffset, &slen); - return (strlen(p) == len) && (memcmp(p, s, len) == 0); + return p && (slen == len) && (memcmp(p, s, len) == 0); } uint32_t fdt_get_max_phandle(const void *fdt) @@ -115,21 +170,42 @@ uint32_t fdt_get_max_phandle(const void *fdt) return 0; } +static const struct fdt_reserve_entry *fdt_mem_rsv(const void *fdt, int n) +{ + int offset = n * sizeof(struct fdt_reserve_entry); + int absoffset = fdt_off_mem_rsvmap(fdt) + offset; + + if (absoffset < fdt_off_mem_rsvmap(fdt)) + return NULL; + if (absoffset > fdt_totalsize(fdt) - sizeof(struct fdt_reserve_entry)) + return NULL; + return fdt_mem_rsv_(fdt, n); +} + int fdt_get_mem_rsv(const void *fdt, int n, uint64_t *address, uint64_t *size) { - FDT_CHECK_HEADER(fdt); - *address = fdt64_to_cpu(fdt_mem_rsv_(fdt, n)->address); - *size = fdt64_to_cpu(fdt_mem_rsv_(fdt, n)->size); + const struct fdt_reserve_entry *re; + + FDT_RO_PROBE(fdt); + re = fdt_mem_rsv(fdt, n); + if (!re) + return -FDT_ERR_BADOFFSET; + + *address = fdt64_ld(&re->address); + *size = fdt64_ld(&re->size); return 0; } int fdt_num_mem_rsv(const void *fdt) { - int i = 0; + int i; + const struct fdt_reserve_entry *re; - while (fdt64_to_cpu(fdt_mem_rsv_(fdt, i)->size) != 0) - i++; - return i; + for (i = 0; (re = fdt_mem_rsv(fdt, i)) != NULL; i++) { + if (fdt64_ld(&re->size) == 0) + return i; + } + return -FDT_ERR_TRUNCATED; } static int nextprop_(const void *fdt, int offset) @@ -161,7 +237,7 @@ int fdt_subnode_offset_namelen(const void *fdt, int offset, { int depth; - FDT_CHECK_HEADER(fdt); + FDT_RO_PROBE(fdt); for (depth = 0; (offset >= 0) && (depth >= 0); @@ -187,7 +263,7 @@ int fdt_path_offset_namelen(const void *fdt, const char *path, int namelen) const char *p = path; int offset = 0; - FDT_CHECK_HEADER(fdt); + FDT_RO_PROBE(fdt); /* see if we have an alias */ if (*path != '/') { @@ -237,7 +313,7 @@ const char *fdt_get_name(const void *fdt, int nodeoffset, int *len) const char *nameptr; int err; - if (((err = fdt_check_header(fdt)) != 0) + if (((err = fdt_ro_probe_(fdt)) != 0) || ((err = fdt_check_node_offset_(fdt, nodeoffset)) < 0)) goto fail; @@ -303,7 +379,7 @@ static const struct fdt_property *fdt_get_property_by_offset_(const void *fdt, prop = fdt_offset_ptr_(fdt, offset); if (lenp) - *lenp = fdt32_to_cpu(prop->len); + *lenp = fdt32_ld(&prop->len); return prop; } @@ -340,7 +416,7 @@ static const struct fdt_property *fdt_get_property_namelen_(const void *fdt, offset = -FDT_ERR_INTERNAL; break; } - if (fdt_string_eq_(fdt, fdt32_to_cpu(prop->nameoff), + if (fdt_string_eq_(fdt, fdt32_ld(&prop->nameoff), name, namelen)) { if (poffset) *poffset = offset; @@ -393,7 +469,7 @@ const void *fdt_getprop_namelen(const void *fdt, int nodeoffset, /* Handle realignment */ if (fdt_version(fdt) < 0x10 && (poffset + sizeof(*prop)) % 8 && - fdt32_to_cpu(prop->len) >= 8) + fdt32_ld(&prop->len) >= 8) return prop->data + 4; return prop->data; } @@ -406,12 +482,22 @@ const void *fdt_getprop_by_offset(const void *fdt, int offset, prop = fdt_get_property_by_offset_(fdt, offset, lenp); if (!prop) return NULL; - if (namep) - *namep = fdt_string(fdt, fdt32_to_cpu(prop->nameoff)); + if (namep) { + const char *name; + int namelen; + name = fdt_get_string(fdt, fdt32_ld(&prop->nameoff), + &namelen); + if (!name) { + if (lenp) + *lenp = namelen; + return NULL; + } + *namep = name; + } /* Handle realignment */ if (fdt_version(fdt) < 0x10 && (offset + sizeof(*prop)) % 8 && - fdt32_to_cpu(prop->len) >= 8) + fdt32_ld(&prop->len) >= 8) return prop->data + 4; return prop->data; } @@ -436,7 +522,7 @@ uint32_t fdt_get_phandle(const void *fdt, int nodeoffset) return 0; } - return fdt32_to_cpu(*php); + return fdt32_ld(php); } const char *fdt_get_alias_namelen(const void *fdt, @@ -462,7 +548,7 @@ int fdt_get_path(const void *fdt, int nodeoffset, char *buf, int buflen) int offset, depth, namelen; const char *name; - FDT_CHECK_HEADER(fdt); + FDT_RO_PROBE(fdt); if (buflen < 2) return -FDT_ERR_NOSPACE; @@ -514,7 +600,7 @@ int fdt_supernode_atdepth_offset(const void *fdt, int nodeoffset, int offset, depth; int supernodeoffset = -FDT_ERR_INTERNAL; - FDT_CHECK_HEADER(fdt); + FDT_RO_PROBE(fdt); if (supernodedepth < 0) return -FDT_ERR_NOTFOUND; @@ -573,7 +659,7 @@ int fdt_node_offset_by_prop_value(const void *fdt, int startoffset, const void *val; int len; - FDT_CHECK_HEADER(fdt); + FDT_RO_PROBE(fdt); /* FIXME: The algorithm here is pretty horrible: we scan each * property of a node in fdt_getprop(), then if that didn't @@ -599,7 +685,7 @@ int fdt_node_offset_by_phandle(const void *fdt, uint32_t phandle) if ((phandle == 0) || (phandle == -1)) return -FDT_ERR_BADPHANDLE; - FDT_CHECK_HEADER(fdt); + FDT_RO_PROBE(fdt); /* FIXME: The algorithm here is pretty horrible: we * potentially scan each property of a node in @@ -752,7 +838,7 @@ int fdt_node_offset_by_compatible(const void *fdt, int startoffset, { int offset, err; - FDT_CHECK_HEADER(fdt); + FDT_RO_PROBE(fdt); /* FIXME: The algorithm here is pretty horrible: we scan each * property of a node in fdt_node_check_compatible(), then if @@ -771,3 +857,66 @@ int fdt_node_offset_by_compatible(const void *fdt, int startoffset, return offset; /* error from fdt_next_node() */ } + +int fdt_check_full(const void *fdt, size_t bufsize) +{ + int err; + int num_memrsv; + int offset, nextoffset = 0; + uint32_t tag; + unsigned depth = 0; + const void *prop; + const char *propname; + + if (bufsize < FDT_V1_SIZE) + return -FDT_ERR_TRUNCATED; + err = fdt_check_header(fdt); + if (err != 0) + return err; + if (bufsize < fdt_totalsize(fdt)) + return -FDT_ERR_TRUNCATED; + + num_memrsv = fdt_num_mem_rsv(fdt); + if (num_memrsv < 0) + return num_memrsv; + + while (1) { + offset = nextoffset; + tag = fdt_next_tag(fdt, offset, &nextoffset); + + if (nextoffset < 0) + return nextoffset; + + switch (tag) { + case FDT_NOP: + break; + + case FDT_END: + if (depth != 0) + return -FDT_ERR_BADSTRUCTURE; + return 0; + + case FDT_BEGIN_NODE: + depth++; + if (depth > INT_MAX) + return -FDT_ERR_BADSTRUCTURE; + break; + + case FDT_END_NODE: + if (depth == 0) + return -FDT_ERR_BADSTRUCTURE; + depth--; + break; + + case FDT_PROP: + prop = fdt_getprop_by_offset(fdt, offset, &propname, + &err); + if (!prop) + return err; + break; + + default: + return -FDT_ERR_INTERNAL; + } + } +} diff --git a/scripts/dtc/libfdt/fdt_rw.c b/scripts/dtc/libfdt/fdt_rw.c index 9b829051e444..2e49855d7cf8 100644 --- a/scripts/dtc/libfdt/fdt_rw.c +++ b/scripts/dtc/libfdt/fdt_rw.c @@ -67,9 +67,9 @@ static int fdt_blocks_misordered_(const void *fdt, (fdt_off_dt_strings(fdt) + fdt_size_dt_strings(fdt))); } -static int fdt_rw_check_header_(void *fdt) +static int fdt_rw_probe_(void *fdt) { - FDT_CHECK_HEADER(fdt); + FDT_RO_PROBE(fdt); if (fdt_version(fdt) < 17) return -FDT_ERR_BADVERSION; @@ -82,10 +82,10 @@ static int fdt_rw_check_header_(void *fdt) return 0; } -#define FDT_RW_CHECK_HEADER(fdt) \ +#define FDT_RW_PROBE(fdt) \ { \ int err_; \ - if ((err_ = fdt_rw_check_header_(fdt)) != 0) \ + if ((err_ = fdt_rw_probe_(fdt)) != 0) \ return err_; \ } @@ -176,7 +176,7 @@ int fdt_add_mem_rsv(void *fdt, uint64_t address, uint64_t size) struct fdt_reserve_entry *re; int err; - FDT_RW_CHECK_HEADER(fdt); + FDT_RW_PROBE(fdt); re = fdt_mem_rsv_w_(fdt, fdt_num_mem_rsv(fdt)); err = fdt_splice_mem_rsv_(fdt, re, 0, 1); @@ -192,7 +192,7 @@ int fdt_del_mem_rsv(void *fdt, int n) { struct fdt_reserve_entry *re = fdt_mem_rsv_w_(fdt, n); - FDT_RW_CHECK_HEADER(fdt); + FDT_RW_PROBE(fdt); if (n >= fdt_num_mem_rsv(fdt)) return -FDT_ERR_NOTFOUND; @@ -252,7 +252,7 @@ int fdt_set_name(void *fdt, int nodeoffset, const char *name) int oldlen, newlen; int err; - FDT_RW_CHECK_HEADER(fdt); + FDT_RW_PROBE(fdt); namep = (char *)(uintptr_t)fdt_get_name(fdt, nodeoffset, &oldlen); if (!namep) @@ -275,7 +275,7 @@ int fdt_setprop_placeholder(void *fdt, int nodeoffset, const char *name, struct fdt_property *prop; int err; - FDT_RW_CHECK_HEADER(fdt); + FDT_RW_PROBE(fdt); err = fdt_resize_property_(fdt, nodeoffset, name, len, &prop); if (err == -FDT_ERR_NOTFOUND) @@ -308,7 +308,7 @@ int fdt_appendprop(void *fdt, int nodeoffset, const char *name, struct fdt_property *prop; int err, oldlen, newlen; - FDT_RW_CHECK_HEADER(fdt); + FDT_RW_PROBE(fdt); prop = fdt_get_property_w(fdt, nodeoffset, name, &oldlen); if (prop) { @@ -334,7 +334,7 @@ int fdt_delprop(void *fdt, int nodeoffset, const char *name) struct fdt_property *prop; int len, proplen; - FDT_RW_CHECK_HEADER(fdt); + FDT_RW_PROBE(fdt); prop = fdt_get_property_w(fdt, nodeoffset, name, &len); if (!prop) @@ -354,7 +354,7 @@ int fdt_add_subnode_namelen(void *fdt, int parentoffset, uint32_t tag; fdt32_t *endtag; - FDT_RW_CHECK_HEADER(fdt); + FDT_RW_PROBE(fdt); offset = fdt_subnode_offset_namelen(fdt, parentoffset, name, namelen); if (offset >= 0) @@ -394,7 +394,7 @@ int fdt_del_node(void *fdt, int nodeoffset) { int endoffset; - FDT_RW_CHECK_HEADER(fdt); + FDT_RW_PROBE(fdt); endoffset = fdt_node_end_offset_(fdt, nodeoffset); if (endoffset < 0) @@ -435,7 +435,7 @@ int fdt_open_into(const void *fdt, void *buf, int bufsize) const char *fdtend = fdtstart + fdt_totalsize(fdt); char *tmp; - FDT_CHECK_HEADER(fdt); + FDT_RO_PROBE(fdt); mem_rsv_size = (fdt_num_mem_rsv(fdt)+1) * sizeof(struct fdt_reserve_entry); @@ -494,7 +494,7 @@ int fdt_pack(void *fdt) { int mem_rsv_size; - FDT_RW_CHECK_HEADER(fdt); + FDT_RW_PROBE(fdt); mem_rsv_size = (fdt_num_mem_rsv(fdt)+1) * sizeof(struct fdt_reserve_entry); diff --git a/scripts/dtc/libfdt/fdt_sw.c b/scripts/dtc/libfdt/fdt_sw.c index 6d33cc29d022..9fa4a94d83c3 100644 --- a/scripts/dtc/libfdt/fdt_sw.c +++ b/scripts/dtc/libfdt/fdt_sw.c @@ -55,21 +55,77 @@ #include "libfdt_internal.h" -static int fdt_sw_check_header_(void *fdt) +static int fdt_sw_probe_(void *fdt) { - if (fdt_magic(fdt) != FDT_SW_MAGIC) + if (fdt_magic(fdt) == FDT_MAGIC) + return -FDT_ERR_BADSTATE; + else if (fdt_magic(fdt) != FDT_SW_MAGIC) return -FDT_ERR_BADMAGIC; - /* FIXME: should check more details about the header state */ return 0; } -#define FDT_SW_CHECK_HEADER(fdt) \ +#define FDT_SW_PROBE(fdt) \ + { \ + int err; \ + if ((err = fdt_sw_probe_(fdt)) != 0) \ + return err; \ + } + +/* 'memrsv' state: Initial state after fdt_create() + * + * Allowed functions: + * fdt_add_reservmap_entry() + * fdt_finish_reservemap() [moves to 'struct' state] + */ +static int fdt_sw_probe_memrsv_(void *fdt) +{ + int err = fdt_sw_probe_(fdt); + if (err) + return err; + + if (fdt_off_dt_strings(fdt) != 0) + return -FDT_ERR_BADSTATE; + return 0; +} + +#define FDT_SW_PROBE_MEMRSV(fdt) \ + { \ + int err; \ + if ((err = fdt_sw_probe_memrsv_(fdt)) != 0) \ + return err; \ + } + +/* 'struct' state: Enter this state after fdt_finish_reservemap() + * + * Allowed functions: + * fdt_begin_node() + * fdt_end_node() + * fdt_property*() + * fdt_finish() [moves to 'complete' state] + */ +static int fdt_sw_probe_struct_(void *fdt) +{ + int err = fdt_sw_probe_(fdt); + if (err) + return err; + + if (fdt_off_dt_strings(fdt) != fdt_totalsize(fdt)) + return -FDT_ERR_BADSTATE; + return 0; +} + +#define FDT_SW_PROBE_STRUCT(fdt) \ { \ int err; \ - if ((err = fdt_sw_check_header_(fdt)) != 0) \ + if ((err = fdt_sw_probe_struct_(fdt)) != 0) \ return err; \ } +/* 'complete' state: Enter this state after fdt_finish() + * + * Allowed functions: none + */ + static void *fdt_grab_space_(void *fdt, size_t len) { int offset = fdt_size_dt_struct(fdt); @@ -87,9 +143,11 @@ static void *fdt_grab_space_(void *fdt, size_t len) int fdt_create(void *buf, int bufsize) { + const size_t hdrsize = FDT_ALIGN(sizeof(struct fdt_header), + sizeof(struct fdt_reserve_entry)); void *fdt = buf; - if (bufsize < sizeof(struct fdt_header)) + if (bufsize < hdrsize) return -FDT_ERR_NOSPACE; memset(buf, 0, bufsize); @@ -99,10 +157,9 @@ int fdt_create(void *buf, int bufsize) fdt_set_last_comp_version(fdt, FDT_FIRST_SUPPORTED_VERSION); fdt_set_totalsize(fdt, bufsize); - fdt_set_off_mem_rsvmap(fdt, FDT_ALIGN(sizeof(struct fdt_header), - sizeof(struct fdt_reserve_entry))); + fdt_set_off_mem_rsvmap(fdt, hdrsize); fdt_set_off_dt_struct(fdt, fdt_off_mem_rsvmap(fdt)); - fdt_set_off_dt_strings(fdt, bufsize); + fdt_set_off_dt_strings(fdt, 0); return 0; } @@ -112,11 +169,14 @@ int fdt_resize(void *fdt, void *buf, int bufsize) size_t headsize, tailsize; char *oldtail, *newtail; - FDT_SW_CHECK_HEADER(fdt); + FDT_SW_PROBE(fdt); - headsize = fdt_off_dt_struct(fdt); + headsize = fdt_off_dt_struct(fdt) + fdt_size_dt_struct(fdt); tailsize = fdt_size_dt_strings(fdt); + if ((headsize + tailsize) > fdt_totalsize(fdt)) + return -FDT_ERR_INTERNAL; + if ((headsize + tailsize) > bufsize) return -FDT_ERR_NOSPACE; @@ -133,8 +193,9 @@ int fdt_resize(void *fdt, void *buf, int bufsize) memmove(buf, fdt, headsize); } - fdt_set_off_dt_strings(buf, bufsize); fdt_set_totalsize(buf, bufsize); + if (fdt_off_dt_strings(buf)) + fdt_set_off_dt_strings(buf, bufsize); return 0; } @@ -144,10 +205,7 @@ int fdt_add_reservemap_entry(void *fdt, uint64_t addr, uint64_t size) struct fdt_reserve_entry *re; int offset; - FDT_SW_CHECK_HEADER(fdt); - - if (fdt_size_dt_struct(fdt)) - return -FDT_ERR_BADSTATE; + FDT_SW_PROBE_MEMRSV(fdt); offset = fdt_off_dt_struct(fdt); if ((offset + sizeof(*re)) > fdt_totalsize(fdt)) @@ -164,16 +222,23 @@ int fdt_add_reservemap_entry(void *fdt, uint64_t addr, uint64_t size) int fdt_finish_reservemap(void *fdt) { - return fdt_add_reservemap_entry(fdt, 0, 0); + int err = fdt_add_reservemap_entry(fdt, 0, 0); + + if (err) + return err; + + fdt_set_off_dt_strings(fdt, fdt_totalsize(fdt)); + return 0; } int fdt_begin_node(void *fdt, const char *name) { struct fdt_node_header *nh; - int namelen = strlen(name) + 1; + int namelen; - FDT_SW_CHECK_HEADER(fdt); + FDT_SW_PROBE_STRUCT(fdt); + namelen = strlen(name) + 1; nh = fdt_grab_space_(fdt, sizeof(*nh) + FDT_TAGALIGN(namelen)); if (! nh) return -FDT_ERR_NOSPACE; @@ -187,7 +252,7 @@ int fdt_end_node(void *fdt) { fdt32_t *en; - FDT_SW_CHECK_HEADER(fdt); + FDT_SW_PROBE_STRUCT(fdt); en = fdt_grab_space_(fdt, FDT_TAGSIZE); if (! en) @@ -225,7 +290,7 @@ int fdt_property_placeholder(void *fdt, const char *name, int len, void **valp) struct fdt_property *prop; int nameoff; - FDT_SW_CHECK_HEADER(fdt); + FDT_SW_PROBE_STRUCT(fdt); nameoff = fdt_find_add_string_(fdt, name); if (nameoff == 0) @@ -262,7 +327,7 @@ int fdt_finish(void *fdt) uint32_t tag; int offset, nextoffset; - FDT_SW_CHECK_HEADER(fdt); + FDT_SW_PROBE_STRUCT(fdt); /* Add terminator */ end = fdt_grab_space_(fdt, sizeof(*end)); diff --git a/scripts/dtc/libfdt/libfdt.h b/scripts/dtc/libfdt/libfdt.h index 1e27780e1185..2bd151dd355f 100644 --- a/scripts/dtc/libfdt/libfdt.h +++ b/scripts/dtc/libfdt/libfdt.h @@ -90,8 +90,9 @@ /* Error codes: codes for bad device tree blobs */ #define FDT_ERR_TRUNCATED 8 - /* FDT_ERR_TRUNCATED: Structure block of the given device tree - * ends without an FDT_END tag. */ + /* FDT_ERR_TRUNCATED: FDT or a sub-block is improperly + * terminated (overflows, goes outside allowed bounds, or + * isn't properly terminated). */ #define FDT_ERR_BADMAGIC 9 /* FDT_ERR_BADMAGIC: Given "device tree" appears not to be a * device tree at all - it is missing the flattened device @@ -153,6 +154,29 @@ static inline void *fdt_offset_ptr_w(void *fdt, int offset, int checklen) uint32_t fdt_next_tag(const void *fdt, int offset, int *nextoffset); +/* + * Alignment helpers: + * These helpers access words from a device tree blob. They're + * built to work even with unaligned pointers on platforms (ike + * ARM) that don't like unaligned loads and stores + */ + +static inline uint32_t fdt32_ld(const fdt32_t *p) +{ + fdt32_t v; + + memcpy(&v, p, sizeof(v)); + return fdt32_to_cpu(v); +} + +static inline uint64_t fdt64_ld(const fdt64_t *p) +{ + fdt64_t v; + + memcpy(&v, p, sizeof(v)); + return fdt64_to_cpu(v); +} + /**********************************************************************/ /* Traversal functions */ /**********************************************************************/ @@ -213,7 +237,7 @@ int fdt_next_subnode(const void *fdt, int offset); /* General functions */ /**********************************************************************/ #define fdt_get_header(fdt, field) \ - (fdt32_to_cpu(((const struct fdt_header *)(fdt))->field)) + (fdt32_ld(&((const struct fdt_header *)(fdt))->field)) #define fdt_magic(fdt) (fdt_get_header(fdt, magic)) #define fdt_totalsize(fdt) (fdt_get_header(fdt, totalsize)) #define fdt_off_dt_struct(fdt) (fdt_get_header(fdt, off_dt_struct)) @@ -244,18 +268,31 @@ fdt_set_hdr_(size_dt_struct); #undef fdt_set_hdr_ /** - * fdt_check_header - sanity check a device tree or possible device tree + * fdt_header_size - return the size of the tree's header + * @fdt: pointer to a flattened device tree + */ +size_t fdt_header_size_(uint32_t version); +static inline size_t fdt_header_size(const void *fdt) +{ + return fdt_header_size_(fdt_version(fdt)); +} + +/** + * fdt_check_header - sanity check a device tree header + * @fdt: pointer to data which might be a flattened device tree * * fdt_check_header() checks that the given buffer contains what - * appears to be a flattened device tree with sane information in its - * header. + * appears to be a flattened device tree, and that the header contains + * valid information (to the extent that can be determined from the + * header alone). * * returns: * 0, if the buffer appears to contain a valid device tree * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, - * -FDT_ERR_BADSTATE, standard meanings, as above + * -FDT_ERR_BADSTATE, + * -FDT_ERR_TRUNCATED, standard meanings, as above */ int fdt_check_header(const void *fdt); @@ -284,6 +321,24 @@ int fdt_move(const void *fdt, void *buf, int bufsize); /* Read-only functions */ /**********************************************************************/ +int fdt_check_full(const void *fdt, size_t bufsize); + +/** + * fdt_get_string - retrieve a string from the strings block of a device tree + * @fdt: pointer to the device tree blob + * @stroffset: offset of the string within the strings block (native endian) + * @lenp: optional pointer to return the string's length + * + * fdt_get_string() retrieves a pointer to a single string from the + * strings block of the device tree blob at fdt, and optionally also + * returns the string's length in *lenp. + * + * returns: + * a pointer to the string, on success + * NULL, if stroffset is out of bounds, or doesn't point to a valid string + */ +const char *fdt_get_string(const void *fdt, int stroffset, int *lenp); + /** * fdt_string - retrieve a string from the strings block of a device tree * @fdt: pointer to the device tree blob @@ -294,7 +349,7 @@ int fdt_move(const void *fdt, void *buf, int bufsize); * * returns: * a pointer to the string, on success - * NULL, if stroffset is out of bounds + * NULL, if stroffset is out of bounds, or doesn't point to a valid string */ const char *fdt_string(const void *fdt, int stroffset); @@ -1090,7 +1145,7 @@ int fdt_address_cells(const void *fdt, int nodeoffset); * * returns: * 0 <= n < FDT_MAX_NCELLS, on success - * 2, if the node has no #address-cells property + * 2, if the node has no #size-cells property * -FDT_ERR_BADNCELLS, if the node has a badly formatted or invalid * #size-cells property * -FDT_ERR_BADMAGIC, @@ -1313,10 +1368,13 @@ static inline int fdt_property_u64(void *fdt, const char *name, uint64_t val) fdt64_t tmp = cpu_to_fdt64(val); return fdt_property(fdt, name, &tmp, sizeof(tmp)); } + +#ifndef SWIG /* Not available in Python */ static inline int fdt_property_cell(void *fdt, const char *name, uint32_t val) { return fdt_property_u32(fdt, name, val); } +#endif /** * fdt_property_placeholder - add a new property and return a ptr to its value diff --git a/scripts/dtc/libfdt/libfdt_env.h b/scripts/dtc/libfdt/libfdt_env.h index bd2474628775..eb2053845c9c 100644 --- a/scripts/dtc/libfdt/libfdt_env.h +++ b/scripts/dtc/libfdt/libfdt_env.h @@ -56,6 +56,7 @@ #include #include #include +#include #ifdef __CHECKER__ #define FDT_FORCE __attribute__((force)) diff --git a/scripts/dtc/libfdt/libfdt_internal.h b/scripts/dtc/libfdt/libfdt_internal.h index 7681e192295b..4109f890ae60 100644 --- a/scripts/dtc/libfdt/libfdt_internal.h +++ b/scripts/dtc/libfdt/libfdt_internal.h @@ -55,10 +55,11 @@ #define FDT_ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1)) #define FDT_TAGALIGN(x) (FDT_ALIGN((x), FDT_TAGSIZE)) -#define FDT_CHECK_HEADER(fdt) \ +int fdt_ro_probe_(const void *fdt); +#define FDT_RO_PROBE(fdt) \ { \ int err_; \ - if ((err_ = fdt_check_header(fdt)) != 0) \ + if ((err_ = fdt_ro_probe_(fdt)) != 0) \ return err_; \ } diff --git a/scripts/dtc/livetree.c b/scripts/dtc/livetree.c index 6e4c367f54b3..4ff0679e0062 100644 --- a/scripts/dtc/livetree.c +++ b/scripts/dtc/livetree.c @@ -594,6 +594,7 @@ struct node *get_node_by_ref(struct node *tree, const char *ref) cell_t get_node_phandle(struct node *root, struct node *node) { static cell_t phandle = 1; /* FIXME: ick, static local */ + struct data d = empty_data; if ((node->phandle != 0) && (node->phandle != -1)) return node->phandle; @@ -603,17 +604,16 @@ cell_t get_node_phandle(struct node *root, struct node *node) node->phandle = phandle; + d = data_add_marker(d, TYPE_UINT32, NULL); + d = data_append_cell(d, phandle); + if (!get_property(node, "linux,phandle") && (phandle_format & PHANDLE_LEGACY)) - add_property(node, - build_property("linux,phandle", - data_append_cell(empty_data, phandle))); + add_property(node, build_property("linux,phandle", d)); if (!get_property(node, "phandle") && (phandle_format & PHANDLE_EPAPR)) - add_property(node, - build_property("phandle", - data_append_cell(empty_data, phandle))); + add_property(node, build_property("phandle", d)); /* If the node *does* have a phandle property, we must * be dealing with a self-referencing phandle, which will be diff --git a/scripts/dtc/treesource.c b/scripts/dtc/treesource.c index 2461a3d068a0..f2874f1d1465 100644 --- a/scripts/dtc/treesource.c +++ b/scripts/dtc/treesource.c @@ -61,24 +61,14 @@ static bool isstring(char c) || strchr("\a\b\t\n\v\f\r", c)); } -static void write_propval_string(FILE *f, struct data val) +static void write_propval_string(FILE *f, const char *s, size_t len) { - const char *str = val.val; - int i; - struct marker *m = val.markers; - - assert(str[val.len-1] == '\0'); + const char *end = s + len - 1; + assert(*end == '\0'); - while (m && (m->offset == 0)) { - if (m->type == LABEL) - fprintf(f, "%s: ", m->ref); - m = m->next; - } fprintf(f, "\""); - - for (i = 0; i < (val.len-1); i++) { - char c = str[i]; - + while (s < end) { + char c = *s++; switch (c) { case '\a': fprintf(f, "\\a"); @@ -108,91 +98,78 @@ static void write_propval_string(FILE *f, struct data val) fprintf(f, "\\\""); break; case '\0': - fprintf(f, "\", "); - while (m && (m->offset <= (i + 1))) { - if (m->type == LABEL) { - assert(m->offset == (i+1)); - fprintf(f, "%s: ", m->ref); - } - m = m->next; - } - fprintf(f, "\""); + fprintf(f, "\\0"); break; default: if (isprint((unsigned char)c)) fprintf(f, "%c", c); else - fprintf(f, "\\x%02hhx", c); + fprintf(f, "\\x%02"PRIx8, c); } } fprintf(f, "\""); - - /* Wrap up any labels at the end of the value */ - for_each_marker_of_type(m, LABEL) { - assert (m->offset == val.len); - fprintf(f, " %s:", m->ref); - } } -static void write_propval_cells(FILE *f, struct data val) +static void write_propval_int(FILE *f, const char *p, size_t len, size_t width) { - void *propend = val.val + val.len; - fdt32_t *cp = (fdt32_t *)val.val; - struct marker *m = val.markers; - - fprintf(f, "<"); - for (;;) { - while (m && (m->offset <= ((char *)cp - val.val))) { - if (m->type == LABEL) { - assert(m->offset == ((char *)cp - val.val)); - fprintf(f, "%s: ", m->ref); - } - m = m->next; - } + const char *end = p + len; + assert(len % width == 0); - fprintf(f, "0x%x", fdt32_to_cpu(*cp++)); - if ((void *)cp >= propend) + for (; p < end; p += width) { + switch (width) { + case 1: + fprintf(f, " %02"PRIx8, *(const uint8_t*)p); + break; + case 2: + fprintf(f, " 0x%02"PRIx16, fdt16_to_cpu(*(const fdt16_t*)p)); + break; + case 4: + fprintf(f, " 0x%02"PRIx32, fdt32_to_cpu(*(const fdt32_t*)p)); + break; + case 8: + fprintf(f, " 0x%02"PRIx64, fdt64_to_cpu(*(const fdt64_t*)p)); break; - fprintf(f, " "); + } } +} - /* Wrap up any labels at the end of the value */ - for_each_marker_of_type(m, LABEL) { - assert (m->offset == val.len); - fprintf(f, " %s:", m->ref); - } - fprintf(f, ">"); +static bool has_data_type_information(struct marker *m) +{ + return m->type >= TYPE_UINT8; } -static void write_propval_bytes(FILE *f, struct data val) +static struct marker *next_type_marker(struct marker *m) { - void *propend = val.val + val.len; - const char *bp = val.val; - struct marker *m = val.markers; - - fprintf(f, "["); - for (;;) { - while (m && (m->offset == (bp-val.val))) { - if (m->type == LABEL) - fprintf(f, "%s: ", m->ref); - m = m->next; - } + while (m && !has_data_type_information(m)) + m = m->next; + return m; +} - fprintf(f, "%02hhx", (unsigned char)(*bp++)); - if ((const void *)bp >= propend) - break; - fprintf(f, " "); - } +size_t type_marker_length(struct marker *m) +{ + struct marker *next = next_type_marker(m->next); - /* Wrap up any labels at the end of the value */ - for_each_marker_of_type(m, LABEL) { - assert (m->offset == val.len); - fprintf(f, " %s:", m->ref); - } - fprintf(f, "]"); + if (next) + return next->offset - m->offset; + return 0; } -static void write_propval(FILE *f, struct property *prop) +static const char *delim_start[] = { + [TYPE_UINT8] = "[", + [TYPE_UINT16] = "/bits/ 16 <", + [TYPE_UINT32] = "<", + [TYPE_UINT64] = "/bits/ 64 <", + [TYPE_STRING] = "", +}; +static const char *delim_end[] = { + [TYPE_UINT8] = " ]", + [TYPE_UINT16] = " >", + [TYPE_UINT32] = " >", + [TYPE_UINT64] = " >", + [TYPE_STRING] = "", +}; + +static enum markertype guess_value_type(struct property *prop) { int len = prop->val.len; const char *p = prop->val.val; @@ -201,11 +178,6 @@ static void write_propval(FILE *f, struct property *prop) int nnotstringlbl = 0, nnotcelllbl = 0; int i; - if (len == 0) { - fprintf(f, ";\n"); - return; - } - for (i = 0; i < len; i++) { if (! isstring(p[i])) nnotstring++; @@ -220,17 +192,91 @@ static void write_propval(FILE *f, struct property *prop) nnotcelllbl++; } - fprintf(f, " = "); if ((p[len-1] == '\0') && (nnotstring == 0) && (nnul < (len-nnul)) && (nnotstringlbl == 0)) { - write_propval_string(f, prop->val); + return TYPE_STRING; } else if (((len % sizeof(cell_t)) == 0) && (nnotcelllbl == 0)) { - write_propval_cells(f, prop->val); - } else { - write_propval_bytes(f, prop->val); + return TYPE_UINT32; } - fprintf(f, ";\n"); + return TYPE_UINT8; +} + +static void write_propval(FILE *f, struct property *prop) +{ + size_t len = prop->val.len; + struct marker *m = prop->val.markers; + struct marker dummy_marker; + enum markertype emit_type = TYPE_NONE; + + if (len == 0) { + fprintf(f, ";\n"); + return; + } + + fprintf(f, " = "); + + if (!next_type_marker(m)) { + /* data type information missing, need to guess */ + dummy_marker.type = guess_value_type(prop); + dummy_marker.next = prop->val.markers; + dummy_marker.offset = 0; + dummy_marker.ref = NULL; + m = &dummy_marker; + } + + struct marker *m_label = prop->val.markers; + for_each_marker(m) { + size_t chunk_len; + const char *p = &prop->val.val[m->offset]; + + if (!has_data_type_information(m)) + continue; + + chunk_len = type_marker_length(m); + if (!chunk_len) + chunk_len = len - m->offset; + + if (emit_type != TYPE_NONE) + fprintf(f, "%s, ", delim_end[emit_type]); + emit_type = m->type; + + for_each_marker_of_type(m_label, LABEL) { + if (m_label->offset > m->offset) + break; + fprintf(f, "%s: ", m_label->ref); + } + + fprintf(f, "%s", delim_start[emit_type]); + + if (chunk_len <= 0) + continue; + + switch(emit_type) { + case TYPE_UINT16: + write_propval_int(f, p, chunk_len, 2); + break; + case TYPE_UINT32: + write_propval_int(f, p, chunk_len, 4); + break; + case TYPE_UINT64: + write_propval_int(f, p, chunk_len, 8); + break; + case TYPE_STRING: + write_propval_string(f, p, chunk_len); + break; + default: + write_propval_int(f, p, chunk_len, 1); + } + } + + /* Wrap up any labels at the end of the value */ + for_each_marker_of_type(m_label, LABEL) { + assert (m_label->offset == len); + fprintf(f, " %s:", m_label->ref); + } + + fprintf(f, "%s;\n", delim_end[emit_type] ? : ""); } static void write_tree_source_node(FILE *f, struct node *tree, int level) @@ -281,4 +327,3 @@ void dt_to_source(FILE *f, struct dt_info *dti) write_tree_source_node(f, dti->dt, 0); } - diff --git a/scripts/dtc/update-dtc-source.sh b/scripts/dtc/update-dtc-source.sh index 1a009fd195d0..7dd29a0362b8 100755 --- a/scripts/dtc/update-dtc-source.sh +++ b/scripts/dtc/update-dtc-source.sh @@ -32,7 +32,7 @@ DTC_UPSTREAM_PATH=`pwd`/../dtc DTC_LINUX_PATH=`pwd`/scripts/dtc DTC_SOURCE="checks.c data.c dtc.c dtc.h flattree.c fstree.c livetree.c srcpos.c \ - srcpos.h treesource.c util.c util.h version_gen.h Makefile.dtc \ + srcpos.h treesource.c util.c util.h version_gen.h yamltree.c Makefile.dtc \ dtc-lexer.l dtc-parser.y" LIBFDT_SOURCE="Makefile.libfdt fdt.c fdt.h fdt_addresses.c fdt_empty_tree.c \ fdt_overlay.c fdt_ro.c fdt_rw.c fdt_strerror.c fdt_sw.c \ diff --git a/scripts/dtc/util.c b/scripts/dtc/util.c index 9953c32a0244..a69b7a13463d 100644 --- a/scripts/dtc/util.c +++ b/scripts/dtc/util.c @@ -227,11 +227,11 @@ char get_escape_char(const char *s, int *i) return val; } -int utilfdt_read_err_len(const char *filename, char **buffp, off_t *len) +int utilfdt_read_err(const char *filename, char **buffp, size_t *len) { int fd = 0; /* assume stdin */ char *buf = NULL; - off_t bufsize = 1024, offset = 0; + size_t bufsize = 1024, offset = 0; int ret = 0; *buffp = NULL; @@ -264,20 +264,15 @@ int utilfdt_read_err_len(const char *filename, char **buffp, off_t *len) free(buf); else *buffp = buf; - *len = bufsize; + if (len) + *len = bufsize; return ret; } -int utilfdt_read_err(const char *filename, char **buffp) -{ - off_t len; - return utilfdt_read_err_len(filename, buffp, &len); -} - -char *utilfdt_read_len(const char *filename, off_t *len) +char *utilfdt_read(const char *filename, size_t *len) { char *buff; - int ret = utilfdt_read_err_len(filename, &buff, len); + int ret = utilfdt_read_err(filename, &buff, len); if (ret) { fprintf(stderr, "Couldn't open blob from '%s': %s\n", filename, @@ -288,12 +283,6 @@ char *utilfdt_read_len(const char *filename, off_t *len) return buff; } -char *utilfdt_read(const char *filename) -{ - off_t len; - return utilfdt_read_len(filename, &len); -} - int utilfdt_write_err(const char *filename, const void *blob) { int fd = 1; /* assume stdout */ diff --git a/scripts/dtc/util.h b/scripts/dtc/util.h index 66fba8ea709b..f6cea8274174 100644 --- a/scripts/dtc/util.h +++ b/scripts/dtc/util.h @@ -98,16 +98,10 @@ char get_escape_char(const char *s, int *i); * stderr. * * @param filename The filename to read, or - for stdin - * @return Pointer to allocated buffer containing fdt, or NULL on error - */ -char *utilfdt_read(const char *filename); - -/** - * Like utilfdt_read(), but also passes back the size of the file read. - * * @param len If non-NULL, the amount of data we managed to read + * @return Pointer to allocated buffer containing fdt, or NULL on error */ -char *utilfdt_read_len(const char *filename, off_t *len); +char *utilfdt_read(const char *filename, size_t *len); /** * Read a device tree file into a buffer. Does not report errors, but only @@ -116,16 +110,10 @@ char *utilfdt_read_len(const char *filename, off_t *len); * * @param filename The filename to read, or - for stdin * @param buffp Returns pointer to buffer containing fdt - * @return 0 if ok, else an errno value representing the error - */ -int utilfdt_read_err(const char *filename, char **buffp); - -/** - * Like utilfdt_read_err(), but also passes back the size of the file read. - * * @param len If non-NULL, the amount of data we managed to read + * @return 0 if ok, else an errno value representing the error */ -int utilfdt_read_err_len(const char *filename, char **buffp, off_t *len); +int utilfdt_read_err(const char *filename, char **buffp, size_t *len); /** * Write a device tree buffer to a file. This will report any errors on diff --git a/scripts/dtc/version_gen.h b/scripts/dtc/version_gen.h index b00f14ff7a17..6d23fd095f16 100644 --- a/scripts/dtc/version_gen.h +++ b/scripts/dtc/version_gen.h @@ -1 +1 @@ -#define DTC_VERSION "DTC 1.4.6-g84e414b0" +#define DTC_VERSION "DTC 1.4.7-gc86da84d" diff --git a/scripts/dtc/yamltree.c b/scripts/dtc/yamltree.c new file mode 100644 index 000000000000..a00285a5a9ec --- /dev/null +++ b/scripts/dtc/yamltree.c @@ -0,0 +1,247 @@ +/* + * (C) Copyright Linaro, Ltd. 2018 + * (C) Copyright Arm Holdings. 2017 + * (C) Copyright David Gibson , IBM Corporation. 2005. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + */ + +#include +#include +#include "dtc.h" +#include "srcpos.h" + +char *yaml_error_name[] = { + [YAML_NO_ERROR] = "no error", + [YAML_MEMORY_ERROR] = "memory error", + [YAML_READER_ERROR] = "reader error", + [YAML_SCANNER_ERROR] = "scanner error", + [YAML_PARSER_ERROR] = "parser error", + [YAML_COMPOSER_ERROR] = "composer error", + [YAML_WRITER_ERROR] = "writer error", + [YAML_EMITTER_ERROR] = "emitter error", +}; + +#define yaml_emitter_emit_or_die(emitter, event) ( \ +{ \ + if (!yaml_emitter_emit(emitter, event)) \ + die("yaml '%s': %s in %s, line %i\n", \ + yaml_error_name[(emitter)->error], \ + (emitter)->problem, __func__, __LINE__); \ +}) + +static void yaml_propval_int(yaml_emitter_t *emitter, struct marker *markers, char *data, int len, int width) +{ + yaml_event_t event; + void *tag; + int off, start_offset = markers->offset; + + switch(width) { + case 1: tag = "!u8"; break; + case 2: tag = "!u16"; break; + case 4: tag = "!u32"; break; + case 8: tag = "!u64"; break; + default: + die("Invalid width %i", width); + } + assert(len % width == 0); + + yaml_sequence_start_event_initialize(&event, NULL, + (yaml_char_t *)tag, width == 4, YAML_FLOW_SEQUENCE_STYLE); + yaml_emitter_emit_or_die(emitter, &event); + + for (off = 0; off < len; off += width) { + char buf[32]; + struct marker *m; + bool is_phandle = false; + + switch(width) { + case 1: + sprintf(buf, "0x%"PRIx8, *(uint8_t*)(data + off)); + break; + case 2: + sprintf(buf, "0x%"PRIx16, fdt16_to_cpu(*(fdt16_t*)(data + off))); + break; + case 4: + sprintf(buf, "0x%"PRIx32, fdt32_to_cpu(*(fdt32_t*)(data + off))); + m = markers; + is_phandle = false; + for_each_marker_of_type(m, REF_PHANDLE) { + if (m->offset == (start_offset + off)) { + is_phandle = true; + break; + } + } + break; + case 8: + sprintf(buf, "0x%"PRIx64, fdt64_to_cpu(*(fdt64_t*)(data + off))); + break; + } + + if (is_phandle) + yaml_scalar_event_initialize(&event, NULL, + (yaml_char_t*)"!phandle", (yaml_char_t *)buf, + strlen(buf), 0, 0, YAML_PLAIN_SCALAR_STYLE); + else + yaml_scalar_event_initialize(&event, NULL, + (yaml_char_t*)YAML_INT_TAG, (yaml_char_t *)buf, + strlen(buf), 1, 1, YAML_PLAIN_SCALAR_STYLE); + yaml_emitter_emit_or_die(emitter, &event); + } + + yaml_sequence_end_event_initialize(&event); + yaml_emitter_emit_or_die(emitter, &event); +} + +static void yaml_propval_string(yaml_emitter_t *emitter, char *str, int len) +{ + yaml_event_t event; + int i; + + assert(str[len-1] == '\0'); + + /* Make sure the entire string is in the lower 7-bit ascii range */ + for (i = 0; i < len; i++) + assert(isascii(str[i])); + + yaml_scalar_event_initialize(&event, NULL, + (yaml_char_t *)YAML_STR_TAG, (yaml_char_t*)str, + len-1, 0, 1, YAML_DOUBLE_QUOTED_SCALAR_STYLE); + yaml_emitter_emit_or_die(emitter, &event); +} + +static void yaml_propval(yaml_emitter_t *emitter, struct property *prop) +{ + yaml_event_t event; + int len = prop->val.len; + struct marker *m = prop->val.markers; + + /* Emit the property name */ + yaml_scalar_event_initialize(&event, NULL, + (yaml_char_t *)YAML_STR_TAG, (yaml_char_t*)prop->name, + strlen(prop->name), 1, 1, YAML_PLAIN_SCALAR_STYLE); + yaml_emitter_emit_or_die(emitter, &event); + + /* Boolean properties are easiest to deal with. Length is zero, so just emit 'true' */ + if (len == 0) { + yaml_scalar_event_initialize(&event, NULL, + (yaml_char_t *)YAML_BOOL_TAG, + (yaml_char_t*)"true", + strlen("true"), 1, 0, YAML_PLAIN_SCALAR_STYLE); + yaml_emitter_emit_or_die(emitter, &event); + return; + } + + if (!m) + die("No markers present in property '%s' value\n", prop->name); + + yaml_sequence_start_event_initialize(&event, NULL, + (yaml_char_t *)YAML_SEQ_TAG, 1, YAML_FLOW_SEQUENCE_STYLE); + yaml_emitter_emit_or_die(emitter, &event); + + for_each_marker(m) { + int chunk_len; + char *data = &prop->val.val[m->offset]; + + if (m->type < TYPE_UINT8) + continue; + + chunk_len = type_marker_length(m) ? : len; + assert(chunk_len > 0); + len -= chunk_len; + + switch(m->type) { + case TYPE_UINT16: + yaml_propval_int(emitter, m, data, chunk_len, 2); + break; + case TYPE_UINT32: + yaml_propval_int(emitter, m, data, chunk_len, 4); + break; + case TYPE_UINT64: + yaml_propval_int(emitter, m, data, chunk_len, 8); + break; + case TYPE_STRING: + yaml_propval_string(emitter, data, chunk_len); + break; + default: + yaml_propval_int(emitter, m, data, chunk_len, 1); + break; + } + } + + yaml_sequence_end_event_initialize(&event); + yaml_emitter_emit_or_die(emitter, &event); +} + + +static void yaml_tree(struct node *tree, yaml_emitter_t *emitter) +{ + struct property *prop; + struct node *child; + yaml_event_t event; + + if (tree->deleted) + return; + + yaml_mapping_start_event_initialize(&event, NULL, + (yaml_char_t *)YAML_MAP_TAG, 1, YAML_ANY_MAPPING_STYLE); + yaml_emitter_emit_or_die(emitter, &event); + + for_each_property(tree, prop) + yaml_propval(emitter, prop); + + /* Loop over all the children, emitting them into the map */ + for_each_child(tree, child) { + yaml_scalar_event_initialize(&event, NULL, + (yaml_char_t *)YAML_STR_TAG, (yaml_char_t*)child->name, + strlen(child->name), 1, 0, YAML_PLAIN_SCALAR_STYLE); + yaml_emitter_emit_or_die(emitter, &event); + yaml_tree(child, emitter); + } + + yaml_mapping_end_event_initialize(&event); + yaml_emitter_emit_or_die(emitter, &event); +} + +void dt_to_yaml(FILE *f, struct dt_info *dti) +{ + yaml_emitter_t emitter; + yaml_event_t event; + + yaml_emitter_initialize(&emitter); + yaml_emitter_set_output_file(&emitter, f); + yaml_stream_start_event_initialize(&event, YAML_UTF8_ENCODING); + yaml_emitter_emit_or_die(&emitter, &event); + + yaml_document_start_event_initialize(&event, NULL, NULL, NULL, 0); + yaml_emitter_emit_or_die(&emitter, &event); + + yaml_sequence_start_event_initialize(&event, NULL, (yaml_char_t *)YAML_SEQ_TAG, 1, YAML_ANY_SEQUENCE_STYLE); + yaml_emitter_emit_or_die(&emitter, &event); + + yaml_tree(dti->dt, &emitter); + + yaml_sequence_end_event_initialize(&event); + yaml_emitter_emit_or_die(&emitter, &event); + + yaml_document_end_event_initialize(&event, 0); + yaml_emitter_emit_or_die(&emitter, &event); + + yaml_stream_end_event_initialize(&event); + yaml_emitter_emit_or_die(&emitter, &event); + + yaml_emitter_delete(&emitter); +} diff --git a/scripts/extract-vmlinux b/scripts/extract-vmlinux index e6239f39abad..85e1f32fb4a0 100755 --- a/scripts/extract-vmlinux +++ b/scripts/extract-vmlinux @@ -48,9 +48,6 @@ fi tmp=$(mktemp /tmp/vmlinux-XXX) trap "rm -f $tmp" 0 -# Initial attempt for uncompressed images or objects: -check_vmlinux $img - # That didn't work, so retry after decompression. try_decompress '\037\213\010' xy gunzip try_decompress '\3757zXZ\000' abcde unxz @@ -60,5 +57,8 @@ try_decompress '\211\114\132' xy 'lzop -d' try_decompress '\002!L\030' xxx 'lz4 -d' try_decompress '(\265/\375' xxx unzstd +# Finally check for uncompressed images or objects: +check_vmlinux $img + # Bail out: echo "$me: Cannot find vmlinux." >&2 diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig index cb0c889e13aa..0d5c799688f0 100644 --- a/scripts/gcc-plugins/Kconfig +++ b/scripts/gcc-plugins/Kconfig @@ -139,4 +139,55 @@ config GCC_PLUGIN_RANDSTRUCT_PERFORMANCE in structures. This reduces the performance hit of RANDSTRUCT at the cost of weakened randomization. +config GCC_PLUGIN_STACKLEAK + bool "Erase the kernel stack before returning from syscalls" + depends on GCC_PLUGINS + depends on HAVE_ARCH_STACKLEAK + help + This option makes the kernel erase the kernel stack before + returning from system calls. That reduces the information which + kernel stack leak bugs can reveal and blocks some uninitialized + stack variable attacks. + + The tradeoff is the performance impact: on a single CPU system kernel + compilation sees a 1% slowdown, other systems and workloads may vary + and you are advised to test this feature on your expected workload + before deploying it. + + This plugin was ported from grsecurity/PaX. More information at: + * https://grsecurity.net/ + * https://pax.grsecurity.net/ + +config STACKLEAK_TRACK_MIN_SIZE + int "Minimum stack frame size of functions tracked by STACKLEAK" + default 100 + range 0 4096 + depends on GCC_PLUGIN_STACKLEAK + help + The STACKLEAK gcc plugin instruments the kernel code for tracking + the lowest border of the kernel stack (and for some other purposes). + It inserts the stackleak_track_stack() call for the functions with + a stack frame size greater than or equal to this parameter. + If unsure, leave the default value 100. + +config STACKLEAK_METRICS + bool "Show STACKLEAK metrics in the /proc file system" + depends on GCC_PLUGIN_STACKLEAK + depends on PROC_FS + help + If this is set, STACKLEAK metrics for every task are available in + the /proc file system. In particular, /proc//stack_depth + shows the maximum kernel stack consumption for the current and + previous syscalls. Although this information is not precise, it + can be useful for estimating the STACKLEAK performance impact for + your workloads. + +config STACKLEAK_RUNTIME_DISABLE + bool "Allow runtime disabling of kernel stack erasing" + depends on GCC_PLUGIN_STACKLEAK + help + This option provides 'stack_erasing' sysctl, which can be used in + runtime to control kernel stack erasing for kernels built with + CONFIG_GCC_PLUGIN_STACKLEAK. + endif diff --git a/scripts/gcc-plugins/stackleak_plugin.c b/scripts/gcc-plugins/stackleak_plugin.c new file mode 100644 index 000000000000..2f48da98b5d4 --- /dev/null +++ b/scripts/gcc-plugins/stackleak_plugin.c @@ -0,0 +1,427 @@ +/* + * Copyright 2011-2017 by the PaX Team + * Modified by Alexander Popov + * Licensed under the GPL v2 + * + * Note: the choice of the license means that the compilation process is + * NOT 'eligible' as defined by gcc's library exception to the GPL v3, + * but for the kernel it doesn't matter since it doesn't link against + * any of the gcc libraries + * + * This gcc plugin is needed for tracking the lowest border of the kernel stack. + * It instruments the kernel code inserting stackleak_track_stack() calls: + * - after alloca(); + * - for the functions with a stack frame size greater than or equal + * to the "track-min-size" plugin parameter. + * + * This plugin is ported from grsecurity/PaX. For more information see: + * https://grsecurity.net/ + * https://pax.grsecurity.net/ + * + * Debugging: + * - use fprintf() to stderr, debug_generic_expr(), debug_gimple_stmt(), + * print_rtl() and print_simple_rtl(); + * - add "-fdump-tree-all -fdump-rtl-all" to the plugin CFLAGS in + * Makefile.gcc-plugins to see the verbose dumps of the gcc passes; + * - use gcc -E to understand the preprocessing shenanigans; + * - use gcc with enabled CFG/GIMPLE/SSA verification (--enable-checking). + */ + +#include "gcc-common.h" + +__visible int plugin_is_GPL_compatible; + +static int track_frame_size = -1; +static const char track_function[] = "stackleak_track_stack"; + +/* + * Mark these global variables (roots) for gcc garbage collector since + * they point to the garbage-collected memory. + */ +static GTY(()) tree track_function_decl; + +static struct plugin_info stackleak_plugin_info = { + .version = "201707101337", + .help = "track-min-size=nn\ttrack stack for functions with a stack frame size >= nn bytes\n" + "disable\t\tdo not activate the plugin\n" +}; + +static void stackleak_add_track_stack(gimple_stmt_iterator *gsi, bool after) +{ + gimple stmt; + gcall *stackleak_track_stack; + cgraph_node_ptr node; + int frequency; + basic_block bb; + + /* Insert call to void stackleak_track_stack(void) */ + stmt = gimple_build_call(track_function_decl, 0); + stackleak_track_stack = as_a_gcall(stmt); + if (after) { + gsi_insert_after(gsi, stackleak_track_stack, + GSI_CONTINUE_LINKING); + } else { + gsi_insert_before(gsi, stackleak_track_stack, GSI_SAME_STMT); + } + + /* Update the cgraph */ + bb = gimple_bb(stackleak_track_stack); + node = cgraph_get_create_node(track_function_decl); + gcc_assert(node); + frequency = compute_call_stmt_bb_frequency(current_function_decl, bb); + cgraph_create_edge(cgraph_get_node(current_function_decl), node, + stackleak_track_stack, bb->count, frequency); +} + +static bool is_alloca(gimple stmt) +{ + if (gimple_call_builtin_p(stmt, BUILT_IN_ALLOCA)) + return true; + +#if BUILDING_GCC_VERSION >= 4007 + if (gimple_call_builtin_p(stmt, BUILT_IN_ALLOCA_WITH_ALIGN)) + return true; +#endif + + return false; +} + +/* + * Work with the GIMPLE representation of the code. Insert the + * stackleak_track_stack() call after alloca() and into the beginning + * of the function if it is not instrumented. + */ +static unsigned int stackleak_instrument_execute(void) +{ + basic_block bb, entry_bb; + bool prologue_instrumented = false, is_leaf = true; + gimple_stmt_iterator gsi; + + /* + * ENTRY_BLOCK_PTR is a basic block which represents possible entry + * point of a function. This block does not contain any code and + * has a CFG edge to its successor. + */ + gcc_assert(single_succ_p(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + entry_bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun)); + + /* + * Loop through the GIMPLE statements in each of cfun basic blocks. + * cfun is a global variable which represents the function that is + * currently processed. + */ + FOR_EACH_BB_FN(bb, cfun) { + for (gsi = gsi_start_bb(bb); !gsi_end_p(gsi); gsi_next(&gsi)) { + gimple stmt; + + stmt = gsi_stmt(gsi); + + /* Leaf function is a function which makes no calls */ + if (is_gimple_call(stmt)) + is_leaf = false; + + if (!is_alloca(stmt)) + continue; + + /* Insert stackleak_track_stack() call after alloca() */ + stackleak_add_track_stack(&gsi, true); + if (bb == entry_bb) + prologue_instrumented = true; + } + } + + if (prologue_instrumented) + return 0; + + /* + * Special cases to skip the instrumentation. + * + * Taking the address of static inline functions materializes them, + * but we mustn't instrument some of them as the resulting stack + * alignment required by the function call ABI will break other + * assumptions regarding the expected (but not otherwise enforced) + * register clobbering ABI. + * + * Case in point: native_save_fl on amd64 when optimized for size + * clobbers rdx if it were instrumented here. + * + * TODO: any more special cases? + */ + if (is_leaf && + !TREE_PUBLIC(current_function_decl) && + DECL_DECLARED_INLINE_P(current_function_decl)) { + return 0; + } + + if (is_leaf && + !strncmp(IDENTIFIER_POINTER(DECL_NAME(current_function_decl)), + "_paravirt_", 10)) { + return 0; + } + + /* Insert stackleak_track_stack() call at the function beginning */ + bb = entry_bb; + if (!single_pred_p(bb)) { + /* gcc_assert(bb_loop_depth(bb) || + (bb->flags & BB_IRREDUCIBLE_LOOP)); */ + split_edge(single_succ_edge(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + gcc_assert(single_succ_p(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun)); + } + gsi = gsi_after_labels(bb); + stackleak_add_track_stack(&gsi, false); + + return 0; +} + +static bool large_stack_frame(void) +{ +#if BUILDING_GCC_VERSION >= 8000 + return maybe_ge(get_frame_size(), track_frame_size); +#else + return (get_frame_size() >= track_frame_size); +#endif +} + +/* + * Work with the RTL representation of the code. + * Remove the unneeded stackleak_track_stack() calls from the functions + * which don't call alloca() and don't have a large enough stack frame size. + */ +static unsigned int stackleak_cleanup_execute(void) +{ + rtx_insn *insn, *next; + + if (cfun->calls_alloca) + return 0; + + if (large_stack_frame()) + return 0; + + /* + * Find stackleak_track_stack() calls. Loop through the chain of insns, + * which is an RTL representation of the code for a function. + * + * The example of a matching insn: + * (call_insn 8 4 10 2 (call (mem (symbol_ref ("stackleak_track_stack") + * [flags 0x41] ) + * [0 stackleak_track_stack S1 A8]) (0)) 675 {*call} (expr_list + * (symbol_ref ("stackleak_track_stack") [flags 0x41] ) (expr_list (0) (nil))) (nil)) + */ + for (insn = get_insns(); insn; insn = next) { + rtx body; + + next = NEXT_INSN(insn); + + /* Check the expression code of the insn */ + if (!CALL_P(insn)) + continue; + + /* + * Check the expression code of the insn body, which is an RTL + * Expression (RTX) describing the side effect performed by + * that insn. + */ + body = PATTERN(insn); + + if (GET_CODE(body) == PARALLEL) + body = XVECEXP(body, 0, 0); + + if (GET_CODE(body) != CALL) + continue; + + /* + * Check the first operand of the call expression. It should + * be a mem RTX describing the needed subroutine with a + * symbol_ref RTX. + */ + body = XEXP(body, 0); + if (GET_CODE(body) != MEM) + continue; + + body = XEXP(body, 0); + if (GET_CODE(body) != SYMBOL_REF) + continue; + + if (SYMBOL_REF_DECL(body) != track_function_decl) + continue; + + /* Delete the stackleak_track_stack() call */ + delete_insn_and_edges(insn); +#if BUILDING_GCC_VERSION >= 4007 && BUILDING_GCC_VERSION < 8000 + if (GET_CODE(next) == NOTE && + NOTE_KIND(next) == NOTE_INSN_CALL_ARG_LOCATION) { + insn = next; + next = NEXT_INSN(insn); + delete_insn_and_edges(insn); + } +#endif + } + + return 0; +} + +static bool stackleak_gate(void) +{ + tree section; + + section = lookup_attribute("section", + DECL_ATTRIBUTES(current_function_decl)); + if (section && TREE_VALUE(section)) { + section = TREE_VALUE(TREE_VALUE(section)); + + if (!strncmp(TREE_STRING_POINTER(section), ".init.text", 10)) + return false; + if (!strncmp(TREE_STRING_POINTER(section), ".devinit.text", 13)) + return false; + if (!strncmp(TREE_STRING_POINTER(section), ".cpuinit.text", 13)) + return false; + if (!strncmp(TREE_STRING_POINTER(section), ".meminit.text", 13)) + return false; + } + + return track_frame_size >= 0; +} + +/* Build the function declaration for stackleak_track_stack() */ +static void stackleak_start_unit(void *gcc_data __unused, + void *user_data __unused) +{ + tree fntype; + + /* void stackleak_track_stack(void) */ + fntype = build_function_type_list(void_type_node, NULL_TREE); + track_function_decl = build_fn_decl(track_function, fntype); + DECL_ASSEMBLER_NAME(track_function_decl); /* for LTO */ + TREE_PUBLIC(track_function_decl) = 1; + TREE_USED(track_function_decl) = 1; + DECL_EXTERNAL(track_function_decl) = 1; + DECL_ARTIFICIAL(track_function_decl) = 1; + DECL_PRESERVE_P(track_function_decl) = 1; +} + +/* + * Pass gate function is a predicate function that gets executed before the + * corresponding pass. If the return value is 'true' the pass gets executed, + * otherwise, it is skipped. + */ +static bool stackleak_instrument_gate(void) +{ + return stackleak_gate(); +} + +#define PASS_NAME stackleak_instrument +#define PROPERTIES_REQUIRED PROP_gimple_leh | PROP_cfg +#define TODO_FLAGS_START TODO_verify_ssa | TODO_verify_flow | TODO_verify_stmts +#define TODO_FLAGS_FINISH TODO_verify_ssa | TODO_verify_stmts | TODO_dump_func \ + | TODO_update_ssa | TODO_rebuild_cgraph_edges +#include "gcc-generate-gimple-pass.h" + +static bool stackleak_cleanup_gate(void) +{ + return stackleak_gate(); +} + +#define PASS_NAME stackleak_cleanup +#define TODO_FLAGS_FINISH TODO_dump_func +#include "gcc-generate-rtl-pass.h" + +/* + * Every gcc plugin exports a plugin_init() function that is called right + * after the plugin is loaded. This function is responsible for registering + * the plugin callbacks and doing other required initialization. + */ +__visible int plugin_init(struct plugin_name_args *plugin_info, + struct plugin_gcc_version *version) +{ + const char * const plugin_name = plugin_info->base_name; + const int argc = plugin_info->argc; + const struct plugin_argument * const argv = plugin_info->argv; + int i = 0; + + /* Extra GGC root tables describing our GTY-ed data */ + static const struct ggc_root_tab gt_ggc_r_gt_stackleak[] = { + { + .base = &track_function_decl, + .nelt = 1, + .stride = sizeof(track_function_decl), + .cb = >_ggc_mx_tree_node, + .pchw = >_pch_nx_tree_node + }, + LAST_GGC_ROOT_TAB + }; + + /* + * The stackleak_instrument pass should be executed before the + * "optimized" pass, which is the control flow graph cleanup that is + * performed just before expanding gcc trees to the RTL. In former + * versions of the plugin this new pass was inserted before the + * "tree_profile" pass, which is currently called "profile". + */ + PASS_INFO(stackleak_instrument, "optimized", 1, + PASS_POS_INSERT_BEFORE); + + /* + * The stackleak_cleanup pass should be executed after the + * "reload" pass, when the stack frame size is final. + */ + PASS_INFO(stackleak_cleanup, "reload", 1, PASS_POS_INSERT_AFTER); + + if (!plugin_default_version_check(version, &gcc_version)) { + error(G_("incompatible gcc/plugin versions")); + return 1; + } + + /* Parse the plugin arguments */ + for (i = 0; i < argc; i++) { + if (!strcmp(argv[i].key, "disable")) + return 0; + + if (!strcmp(argv[i].key, "track-min-size")) { + if (!argv[i].value) { + error(G_("no value supplied for option '-fplugin-arg-%s-%s'"), + plugin_name, argv[i].key); + return 1; + } + + track_frame_size = atoi(argv[i].value); + if (track_frame_size < 0) { + error(G_("invalid option argument '-fplugin-arg-%s-%s=%s'"), + plugin_name, argv[i].key, argv[i].value); + return 1; + } + } else { + error(G_("unknown option '-fplugin-arg-%s-%s'"), + plugin_name, argv[i].key); + return 1; + } + } + + /* Give the information about the plugin */ + register_callback(plugin_name, PLUGIN_INFO, NULL, + &stackleak_plugin_info); + + /* Register to be called before processing a translation unit */ + register_callback(plugin_name, PLUGIN_START_UNIT, + &stackleak_start_unit, NULL); + + /* Register an extra GCC garbage collector (GGC) root table */ + register_callback(plugin_name, PLUGIN_REGISTER_GGC_ROOTS, NULL, + (void *)>_ggc_r_gt_stackleak); + + /* + * Hook into the Pass Manager to register new gcc passes. + * + * The stack frame size info is available only at the last RTL pass, + * when it's too late to insert complex code like a function call. + * So we register two gcc passes to instrument every function at first + * and remove the unneeded instrumentation later. + */ + register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL, + &stackleak_instrument_pass_info); + register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL, + &stackleak_cleanup_pass_info); + + return 0; +} diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index a9186a98a37d..109a1af7e444 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -48,8 +48,6 @@ static unsigned long long relative_base; static struct addr_range text_ranges[] = { { "_stext", "_etext" }, { "_sinittext", "_einittext" }, - { "_stext_l1", "_etext_l1" }, /* Blackfin on-chip L1 inst SRAM */ - { "_stext_l2", "_etext_l2" }, /* Blackfin on-chip L2 SRAM */ }; #define text_range_text (&text_ranges[0]) #define text_range_inittext (&text_ranges[1]) @@ -405,7 +403,7 @@ static void write_src(void) } output_label("kallsyms_num_syms"); - printf("\tPTR\t%u\n", table_cnt); + printf("\t.long\t%u\n", table_cnt); printf("\n"); /* table of offset markers, that give the offset in the compressed stream @@ -434,7 +432,7 @@ static void write_src(void) output_label("kallsyms_markers"); for (i = 0; i < ((table_cnt + 255) >> 8); i++) - printf("\tPTR\t%d\n", markers[i]); + printf("\t.long\t%u\n", markers[i]); printf("\n"); free(markers); diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index 67ed9f6ccdf8..63b609243d03 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile @@ -68,21 +68,7 @@ PHONY += $(simple-targets) $(simple-targets): $(obj)/conf $< $(silent) --$@ $(Kconfig) -PHONY += oldnoconfig silentoldconfig savedefconfig defconfig - -# oldnoconfig is an alias of olddefconfig, because people already are dependent -# on its behavior (sets new symbols to their default value but not 'n') with the -# counter-intuitive name. -oldnoconfig: olddefconfig - @echo " WARNING: \"oldnoconfig\" target will be removed after Linux 4.19" - @echo " Please use \"olddefconfig\" instead, which is an alias." - -# We do not expect manual invokcation of "silentoldcofig" (or "syncconfig"). -silentoldconfig: syncconfig - @echo " WARNING: \"silentoldconfig\" has been renamed to \"syncconfig\"" - @echo " and is now an internal implementation detail." - @echo " What you want is probably \"oldconfig\"." - @echo " \"silentoldconfig\" will be removed after Linux 4.19" +PHONY += savedefconfig defconfig savedefconfig: $(obj)/conf $< $(silent) --$@=defconfig $(Kconfig) diff --git a/scripts/kconfig/conf.c b/scripts/kconfig/conf.c index 7b2b37260669..98e0c7a34699 100644 --- a/scripts/kconfig/conf.c +++ b/scripts/kconfig/conf.c @@ -460,12 +460,6 @@ static struct option long_opts[] = { {"randconfig", no_argument, NULL, randconfig}, {"listnewconfig", no_argument, NULL, listnewconfig}, {"olddefconfig", no_argument, NULL, olddefconfig}, - /* - * oldnoconfig is an alias of olddefconfig, because people already - * are dependent on its behavior(sets new symbols to their default - * value but not 'n') with the counter-intuitive name. - */ - {"oldnoconfig", no_argument, NULL, olddefconfig}, {NULL, 0, NULL, 0} }; @@ -480,7 +474,6 @@ static void conf_usage(const char *progname) printf(" --syncconfig Similar to oldconfig but generates configuration in\n" " include/{generated/,config/}\n"); printf(" --olddefconfig Same as oldconfig but sets new symbols to their default value\n"); - printf(" --oldnoconfig An alias of olddefconfig\n"); printf(" --defconfig New config with default defined in \n"); printf(" --savedefconfig Save the minimal current configuration to \n"); printf(" --allnoconfig New config where all options are answered with no\n"); diff --git a/scripts/kconfig/merge_config.sh b/scripts/kconfig/merge_config.sh index 67d131447631..da66e7742282 100755 --- a/scripts/kconfig/merge_config.sh +++ b/scripts/kconfig/merge_config.sh @@ -33,12 +33,15 @@ usage() { echo " -n use allnoconfig instead of alldefconfig" echo " -r list redundant entries when merging fragments" echo " -O dir to put generated output files. Consider setting \$KCONFIG_CONFIG instead." + echo + echo "Used prefix: '$CONFIG_PREFIX'. You can redefine it with \$CONFIG_ environment variable." } RUNMAKE=true ALLTARGET=alldefconfig WARNREDUN=false OUTPUT=. +CONFIG_PREFIX=${CONFIG_-CONFIG_} while true; do case $1 in @@ -99,7 +102,8 @@ if [ ! -r "$INITFILE" ]; then fi MERGE_LIST=$* -SED_CONFIG_EXP="s/^\(# \)\{0,1\}\(CONFIG_[a-zA-Z0-9_]*\)[= ].*/\2/p" +SED_CONFIG_EXP="s/^\(# \)\{0,1\}\(${CONFIG_PREFIX}[a-zA-Z0-9_]*\)[= ].*/\2/p" + TMP_FILE=$(mktemp ./.tmp.config.XXXXXXXXXX) echo "Using $INITFILE as base" diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 8f0f508a78e9..ffbe901a37b5 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -1904,13 +1904,13 @@ sub process_name($$) { ++$warnings; } - if ($identifier =~ m/^struct/) { + if ($identifier =~ m/^struct\b/) { $decl_type = 'struct'; - } elsif ($identifier =~ m/^union/) { + } elsif ($identifier =~ m/^union\b/) { $decl_type = 'union'; - } elsif ($identifier =~ m/^enum/) { + } elsif ($identifier =~ m/^enum\b/) { $decl_type = 'enum'; - } elsif ($identifier =~ m/^typedef/) { + } elsif ($identifier =~ m/^typedef\b/) { $decl_type = 'typedef'; } else { $decl_type = 'function'; diff --git a/scripts/mkmakefile b/scripts/mkmakefile index e19d6565f245..412f13fdff52 100755 --- a/scripts/mkmakefile +++ b/scripts/mkmakefile @@ -6,31 +6,20 @@ # Usage # $1 - Kernel src directory -# $2 - Output directory -# $3 - version -# $4 - patchlevel - -test ! -r $2/Makefile -o -O $2/Makefile || exit 0 # Only overwrite automatically generated Makefiles # (so we do not overwrite kernel Makefile) -if test -e $2/Makefile && ! grep -q Automatically $2/Makefile +if test -e Makefile && ! grep -q Automatically Makefile then exit 0 fi if [ "${quiet}" != "silent_" ]; then - echo " GEN $2/Makefile" + echo " GEN Makefile" fi -cat << EOF > $2/Makefile +cat << EOF > Makefile # Automatically generated by $0: don't edit -VERSION = $3 -PATCHLEVEL = $4 - -lastword = \$(word \$(words \$(1)),\$(1)) -makedir := \$(dir \$(call lastword,\$(MAKEFILE_LIST))) - ifeq ("\$(origin V)", "command line") VERBOSE := \$(V) endif @@ -38,15 +27,12 @@ ifneq (\$(VERBOSE),1) Q := @ endif -MAKEARGS := -C $1 -MAKEARGS += O=\$(if \$(patsubst /%,,\$(makedir)),\$(CURDIR)/)\$(patsubst %/,%,\$(makedir)) - MAKEFLAGS += --no-print-directory .PHONY: __sub-make \$(MAKECMDGOALS) __sub-make: - \$(Q)\$(MAKE) \$(MAKEARGS) \$(MAKECMDGOALS) + \$(Q)\$(MAKE) -C $1 O=\$(CURDIR) \$(MAKECMDGOALS) \$(filter-out __sub-make, \$(MAKECMDGOALS)): __sub-make @: diff --git a/scripts/mod/Makefile b/scripts/mod/Makefile index 42c5d50f2bcc..a5b4af47987a 100644 --- a/scripts/mod/Makefile +++ b/scripts/mod/Makefile @@ -4,6 +4,8 @@ OBJECT_FILES_NON_STANDARD := y hostprogs-y := modpost mk_elfconfig always := $(hostprogs-y) empty.o +CFLAGS_REMOVE_empty.o := $(ASM_MACRO_FLAGS) + modpost-objs := modpost.o file2alias.o sumversion.o devicetable-offsets-file := devicetable-offsets.h diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 7be43697ff84..28a61665bb9c 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -95,12 +95,20 @@ extern struct devtable *__start___devtable[], *__stop___devtable[]; */ #define DEF_FIELD(m, devid, f) \ typeof(((struct devid *)0)->f) f = TO_NATIVE(*(typeof(f) *)((m) + OFF_##devid##_##f)) + +/* Define a variable v that holds the address of field f of struct devid + * based at address m. Due to the way typeof works, for a field of type + * T[N] the variable has type T(*)[N], _not_ T*. + */ +#define DEF_FIELD_ADDR_VAR(m, devid, f, v) \ + typeof(((struct devid *)0)->f) *v = ((m) + OFF_##devid##_##f) + /* Define a variable f that holds the address of field f of struct devid * based at address m. Due to the way typeof works, for a field of type * T[N] the variable has type T(*)[N], _not_ T*. */ #define DEF_FIELD_ADDR(m, devid, f) \ - typeof(((struct devid *)0)->f) *f = ((m) + OFF_##devid##_##f) + DEF_FIELD_ADDR_VAR(m, devid, f, f) /* Add a table entry. We test function type matches while we're here. */ #define ADD_TO_DEVTABLE(device_id, type, function) \ @@ -644,7 +652,7 @@ static void do_pnp_card_entries(void *symval, unsigned long size, for (i = 0; i < count; i++) { unsigned int j; - DEF_FIELD_ADDR(symval + i*id_size, pnp_card_device_id, devs); + DEF_FIELD_ADDR(symval + i * id_size, pnp_card_device_id, devs); for (j = 0; j < PNP_MAX_DEVICES; j++) { const char *id = (char *)(*devs)[j].id; @@ -656,10 +664,13 @@ static void do_pnp_card_entries(void *symval, unsigned long size, /* find duplicate, already added value */ for (i2 = 0; i2 < i && !dup; i2++) { - DEF_FIELD_ADDR(symval + i2*id_size, pnp_card_device_id, devs); + DEF_FIELD_ADDR_VAR(symval + i2 * id_size, + pnp_card_device_id, + devs, devs_dup); for (j2 = 0; j2 < PNP_MAX_DEVICES; j2++) { - const char *id2 = (char *)(*devs)[j2].id; + const char *id2 = + (char *)(*devs_dup)[j2].id; if (!id2[0]) break; @@ -1415,11 +1426,10 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, if (ELF_ST_TYPE(sym->st_info) != STT_OBJECT) return; - /* All our symbols are of form __mod____device_table. */ - name = strstr(symname, "__mod_"); - if (!name) + /* All our symbols are of form __mod____device_table. */ + if (strncmp(symname, "__mod_", strlen("__mod_"))) return; - name += strlen("__mod_"); + name = symname + strlen("__mod_"); namelen = strlen(name); if (namelen < strlen("_device_table")) return; diff --git a/scripts/tags.sh b/scripts/tags.sh index 26de7d5aa5c8..4fa070f9231a 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -203,7 +203,7 @@ regex_c=( '/\i_private); + parent = aa_get_ns(dir->i_private); /* rmdir calls the generic securityfs functions to remove files * from the apparmor dir. It is up to the apparmor ns locking * to avoid races. diff --git a/security/apparmor/file.c b/security/apparmor/file.c index 4285943f7260..d0afed9ebd0e 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -496,7 +496,7 @@ static void update_file_ctx(struct aa_file_ctx *fctx, struct aa_label *label, /* update caching of label on file_ctx */ spin_lock(&fctx->lock); old = rcu_dereference_protected(fctx->label, - spin_is_locked(&fctx->lock)); + lockdep_is_held(&fctx->lock)); l = aa_label_merge(old, label, GFP_ATOMIC); if (l) { if (l != old) { diff --git a/security/apparmor/include/cred.h b/security/apparmor/include/cred.h index e287b7d0d4be..265ae6641a06 100644 --- a/security/apparmor/include/cred.h +++ b/security/apparmor/include/cred.h @@ -151,6 +151,8 @@ static inline struct aa_label *begin_current_label_crit_section(void) { struct aa_label *label = aa_current_raw_label(); + might_sleep(); + if (label_is_stale(label)) { label = aa_get_newest_label(label); if (aa_replace_current_label(label) == 0) diff --git a/security/apparmor/include/net.h b/security/apparmor/include/net.h index ec7228e857a9..7334ac966d01 100644 --- a/security/apparmor/include/net.h +++ b/security/apparmor/include/net.h @@ -83,6 +83,13 @@ struct aa_sk_ctx { __e; \ }) +struct aa_secmark { + u8 audit; + u8 deny; + u32 secid; + char *label; +}; + extern struct aa_sfs_entry aa_sfs_entry_network[]; void audit_net_cb(struct audit_buffer *ab, void *va); @@ -103,4 +110,7 @@ int aa_sk_perm(const char *op, u32 request, struct sock *sk); int aa_sock_file_perm(struct aa_label *label, const char *op, u32 request, struct socket *sock); +int apparmor_secmark_check(struct aa_label *label, char *op, u32 request, + u32 secid, struct sock *sk); + #endif /* __AA_NET_H */ diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index ab64c6b5db5a..8e6707c837be 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -155,6 +155,9 @@ struct aa_profile { struct aa_rlimit rlimits; + int secmark_count; + struct aa_secmark *secmark; + struct aa_loaddata *rawdata; unsigned char *hash; char *dirname; diff --git a/security/apparmor/include/secid.h b/security/apparmor/include/secid.h index dee6fa3b6081..fa2062711b63 100644 --- a/security/apparmor/include/secid.h +++ b/security/apparmor/include/secid.h @@ -22,6 +22,9 @@ struct aa_label; /* secid value that will not be allocated */ #define AA_SECID_INVALID 0 +/* secid value that matches any other secid */ +#define AA_SECID_WILDCARD 1 + struct aa_label *aa_secid_to_label(u32 secid); int apparmor_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); int apparmor_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index 974affe50531..76491e7f4177 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -90,10 +90,12 @@ const char *aa_splitn_fqname(const char *fqname, size_t n, const char **ns_name, const char *end = fqname + n; const char *name = skipn_spaces(fqname, n); - if (!name) - return NULL; *ns_name = NULL; *ns_len = 0; + + if (!name) + return NULL; + if (name[0] == ':') { char *split = strnchr(&name[1], end - &name[1], ':'); *ns_name = skipn_spaces(&name[1], end - &name[1]); diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8b8b70620bbe..42446a216f3b 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include "include/apparmor.h" @@ -114,13 +116,13 @@ static int apparmor_ptrace_access_check(struct task_struct *child, struct aa_label *tracer, *tracee; int error; - tracer = begin_current_label_crit_section(); + tracer = __begin_current_label_crit_section(); tracee = aa_get_task_label(child); error = aa_may_ptrace(tracer, tracee, (mode & PTRACE_MODE_READ) ? AA_PTRACE_READ : AA_PTRACE_TRACE); aa_put_label(tracee); - end_current_label_crit_section(tracer); + __end_current_label_crit_section(tracer); return error; } @@ -130,11 +132,11 @@ static int apparmor_ptrace_traceme(struct task_struct *parent) struct aa_label *tracer, *tracee; int error; - tracee = begin_current_label_crit_section(); + tracee = __begin_current_label_crit_section(); tracer = aa_get_task_label(parent); error = aa_may_ptrace(tracer, tracee, AA_PTRACE_TRACE); aa_put_label(tracer); - end_current_label_crit_section(tracee); + __end_current_label_crit_section(tracee); return error; } @@ -732,7 +734,7 @@ static int apparmor_task_setrlimit(struct task_struct *task, return error; } -static int apparmor_task_kill(struct task_struct *target, struct siginfo *info, +static int apparmor_task_kill(struct task_struct *target, struct kernel_siginfo *info, int sig, const struct cred *cred) { struct aa_label *cl, *tl; @@ -1020,6 +1022,7 @@ static int apparmor_socket_shutdown(struct socket *sock, int how) return aa_sock_perm(OP_SHUTDOWN, AA_MAY_SHUTDOWN, sock); } +#ifdef CONFIG_NETWORK_SECMARK /** * apparmor_socket_sock_recv_skb - check perms before associating skb to sk * @@ -1030,8 +1033,15 @@ static int apparmor_socket_shutdown(struct socket *sock, int how) */ static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { - return 0; + struct aa_sk_ctx *ctx = SK_CTX(sk); + + if (!skb->secmark) + return 0; + + return apparmor_secmark_check(ctx->label, OP_RECVMSG, AA_MAY_RECEIVE, + skb->secmark, sk); } +#endif static struct aa_label *sk_peer_label(struct sock *sk) @@ -1126,6 +1136,20 @@ static void apparmor_sock_graft(struct sock *sk, struct socket *parent) ctx->label = aa_get_current_label(); } +#ifdef CONFIG_NETWORK_SECMARK +static int apparmor_inet_conn_request(struct sock *sk, struct sk_buff *skb, + struct request_sock *req) +{ + struct aa_sk_ctx *ctx = SK_CTX(sk); + + if (!skb->secmark) + return 0; + + return apparmor_secmark_check(ctx->label, OP_CONNECT, AA_MAY_CONNECT, + skb->secmark, sk); +} +#endif + static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme), @@ -1177,12 +1201,17 @@ static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(socket_getsockopt, apparmor_socket_getsockopt), LSM_HOOK_INIT(socket_setsockopt, apparmor_socket_setsockopt), LSM_HOOK_INIT(socket_shutdown, apparmor_socket_shutdown), +#ifdef CONFIG_NETWORK_SECMARK LSM_HOOK_INIT(socket_sock_rcv_skb, apparmor_socket_sock_rcv_skb), +#endif LSM_HOOK_INIT(socket_getpeersec_stream, apparmor_socket_getpeersec_stream), LSM_HOOK_INIT(socket_getpeersec_dgram, apparmor_socket_getpeersec_dgram), LSM_HOOK_INIT(sock_graft, apparmor_sock_graft), +#ifdef CONFIG_NETWORK_SECMARK + LSM_HOOK_INIT(inet_conn_request, apparmor_inet_conn_request), +#endif LSM_HOOK_INIT(cred_alloc_blank, apparmor_cred_alloc_blank), LSM_HOOK_INIT(cred_free, apparmor_cred_free), @@ -1538,6 +1567,97 @@ static inline int apparmor_init_sysctl(void) } #endif /* CONFIG_SYSCTL */ +#if defined(CONFIG_NETFILTER) && defined(CONFIG_NETWORK_SECMARK) +static unsigned int apparmor_ip_postroute(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct aa_sk_ctx *ctx; + struct sock *sk; + + if (!skb->secmark) + return NF_ACCEPT; + + sk = skb_to_full_sk(skb); + if (sk == NULL) + return NF_ACCEPT; + + ctx = SK_CTX(sk); + if (!apparmor_secmark_check(ctx->label, OP_SENDMSG, AA_MAY_SEND, + skb->secmark, sk)) + return NF_ACCEPT; + + return NF_DROP_ERR(-ECONNREFUSED); + +} + +static unsigned int apparmor_ipv4_postroute(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return apparmor_ip_postroute(priv, skb, state); +} + +static unsigned int apparmor_ipv6_postroute(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return apparmor_ip_postroute(priv, skb, state); +} + +static const struct nf_hook_ops apparmor_nf_ops[] = { + { + .hook = apparmor_ipv4_postroute, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_SELINUX_FIRST, + }, +#if IS_ENABLED(CONFIG_IPV6) + { + .hook = apparmor_ipv6_postroute, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_SELINUX_FIRST, + }, +#endif +}; + +static int __net_init apparmor_nf_register(struct net *net) +{ + int ret; + + ret = nf_register_net_hooks(net, apparmor_nf_ops, + ARRAY_SIZE(apparmor_nf_ops)); + return ret; +} + +static void __net_exit apparmor_nf_unregister(struct net *net) +{ + nf_unregister_net_hooks(net, apparmor_nf_ops, + ARRAY_SIZE(apparmor_nf_ops)); +} + +static struct pernet_operations apparmor_net_ops = { + .init = apparmor_nf_register, + .exit = apparmor_nf_unregister, +}; + +static int __init apparmor_nf_ip_init(void) +{ + int err; + + if (!apparmor_enabled) + return 0; + + err = register_pernet_subsys(&apparmor_net_ops); + if (err) + panic("Apparmor: register_pernet_subsys: error %d\n", err); + + return 0; +} +__initcall(apparmor_nf_ip_init); +#endif + static int __init apparmor_init(void) { int error; @@ -1606,4 +1726,7 @@ alloc_out: return error; } -security_initcall(apparmor_init); +DEFINE_LSM(apparmor) = { + .name = "apparmor", + .init = apparmor_init, +}; diff --git a/security/apparmor/net.c b/security/apparmor/net.c index bb24cfa0a164..c07fde444792 100644 --- a/security/apparmor/net.c +++ b/security/apparmor/net.c @@ -18,6 +18,7 @@ #include "include/label.h" #include "include/net.h" #include "include/policy.h" +#include "include/secid.h" #include "net_names.h" @@ -146,17 +147,20 @@ int aa_af_perm(struct aa_label *label, const char *op, u32 request, u16 family, static int aa_label_sk_perm(struct aa_label *label, const char *op, u32 request, struct sock *sk) { - struct aa_profile *profile; - DEFINE_AUDIT_SK(sa, op, sk); + int error = 0; AA_BUG(!label); AA_BUG(!sk); - if (unconfined(label)) - return 0; + if (!unconfined(label)) { + struct aa_profile *profile; + DEFINE_AUDIT_SK(sa, op, sk); - return fn_for_each_confined(label, profile, - aa_profile_af_sk_perm(profile, &sa, request, sk)); + error = fn_for_each_confined(label, profile, + aa_profile_af_sk_perm(profile, &sa, request, sk)); + } + + return error; } int aa_sk_perm(const char *op, u32 request, struct sock *sk) @@ -185,3 +189,70 @@ int aa_sock_file_perm(struct aa_label *label, const char *op, u32 request, return aa_label_sk_perm(label, op, request, sock->sk); } + +#ifdef CONFIG_NETWORK_SECMARK +static int apparmor_secmark_init(struct aa_secmark *secmark) +{ + struct aa_label *label; + + if (secmark->label[0] == '*') { + secmark->secid = AA_SECID_WILDCARD; + return 0; + } + + label = aa_label_strn_parse(&root_ns->unconfined->label, + secmark->label, strlen(secmark->label), + GFP_ATOMIC, false, false); + + if (IS_ERR(label)) + return PTR_ERR(label); + + secmark->secid = label->secid; + + return 0; +} + +static int aa_secmark_perm(struct aa_profile *profile, u32 request, u32 secid, + struct common_audit_data *sa, struct sock *sk) +{ + int i, ret; + struct aa_perms perms = { }; + + if (profile->secmark_count == 0) + return 0; + + for (i = 0; i < profile->secmark_count; i++) { + if (!profile->secmark[i].secid) { + ret = apparmor_secmark_init(&profile->secmark[i]); + if (ret) + return ret; + } + + if (profile->secmark[i].secid == secid || + profile->secmark[i].secid == AA_SECID_WILDCARD) { + if (profile->secmark[i].deny) + perms.deny = ALL_PERMS_MASK; + else + perms.allow = ALL_PERMS_MASK; + + if (profile->secmark[i].audit) + perms.audit = ALL_PERMS_MASK; + } + } + + aa_apply_modes_to_perms(profile, &perms); + + return aa_check_perms(profile, &perms, request, sa, audit_net_cb); +} + +int apparmor_secmark_check(struct aa_label *label, char *op, u32 request, + u32 secid, struct sock *sk) +{ + struct aa_profile *profile; + DEFINE_AUDIT_SK(sa, op, sk); + + return fn_for_each_confined(label, profile, + aa_secmark_perm(profile, request, secid, + &sa, sk)); +} +#endif diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 1590e2de4e84..df9c5890a878 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -231,6 +231,9 @@ void aa_free_profile(struct aa_profile *profile) for (i = 0; i < profile->xattr_count; i++) kzfree(profile->xattrs[i]); kzfree(profile->xattrs); + for (i = 0; i < profile->secmark_count; i++) + kzfree(profile->secmark[i].label); + kzfree(profile->secmark); kzfree(profile->dirname); aa_put_dfa(profile->xmatch); aa_put_dfa(profile->policy.dfa); diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 21cb384d712a..379682e2a8d5 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -292,6 +292,19 @@ fail: return 0; } +static bool unpack_u8(struct aa_ext *e, u8 *data, const char *name) +{ + if (unpack_nameX(e, AA_U8, name)) { + if (!inbounds(e, sizeof(u8))) + return 0; + if (data) + *data = get_unaligned((u8 *)e->pos); + e->pos += sizeof(u8); + return 1; + } + return 0; +} + static bool unpack_u32(struct aa_ext *e, u32 *data, const char *name) { if (unpack_nameX(e, AA_U32, name)) { @@ -529,6 +542,49 @@ fail: return 0; } +static bool unpack_secmark(struct aa_ext *e, struct aa_profile *profile) +{ + void *pos = e->pos; + int i, size; + + if (unpack_nameX(e, AA_STRUCT, "secmark")) { + size = unpack_array(e, NULL); + + profile->secmark = kcalloc(size, sizeof(struct aa_secmark), + GFP_KERNEL); + if (!profile->secmark) + goto fail; + + profile->secmark_count = size; + + for (i = 0; i < size; i++) { + if (!unpack_u8(e, &profile->secmark[i].audit, NULL)) + goto fail; + if (!unpack_u8(e, &profile->secmark[i].deny, NULL)) + goto fail; + if (!unpack_strdup(e, &profile->secmark[i].label, NULL)) + goto fail; + } + if (!unpack_nameX(e, AA_ARRAYEND, NULL)) + goto fail; + if (!unpack_nameX(e, AA_STRUCTEND, NULL)) + goto fail; + } + + return 1; + +fail: + if (profile->secmark) { + for (i = 0; i < size; i++) + kfree(profile->secmark[i].label); + kfree(profile->secmark); + profile->secmark_count = 0; + } + + e->pos = pos; + return 0; +} + static bool unpack_rlimits(struct aa_ext *e, struct aa_profile *profile) { void *pos = e->pos; @@ -727,6 +783,11 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) goto fail; } + if (!unpack_secmark(e, profile)) { + info = "failed to unpack profile secmark rules"; + goto fail; + } + if (unpack_nameX(e, AA_STRUCT, "policydb")) { /* generic policy dfa - optional and may be NULL */ info = "failed to unpack policydb"; diff --git a/security/apparmor/secid.c b/security/apparmor/secid.c index 4ccec1bcf6f5..05373d9a3d6a 100644 --- a/security/apparmor/secid.c +++ b/security/apparmor/secid.c @@ -32,8 +32,7 @@ * secids - do not pin labels with a refcount. They rely on the label * properly updating/freeing them */ - -#define AA_FIRST_SECID 1 +#define AA_FIRST_SECID 2 static DEFINE_IDR(aa_secids); static DEFINE_SPINLOCK(secid_lock); diff --git a/security/commoncap.c b/security/commoncap.c index 2e489d6a3ac8..18a4fdf6f6eb 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -684,9 +684,6 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_f } rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap); - if (rc == -EINVAL) - printk(KERN_NOTICE "%s: cap_from_disk returned %d for %s\n", - __func__, rc, bprm->filename); out: if (rc) diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c index 9bb0a7f2863e..5eacba858e4b 100644 --- a/security/integrity/digsig.c +++ b/security/integrity/digsig.c @@ -26,7 +26,7 @@ static struct key *keyring[INTEGRITY_KEYRING_MAX]; -static const char *keyring_name[INTEGRITY_KEYRING_MAX] = { +static const char * const keyring_name[INTEGRITY_KEYRING_MAX] = { #ifndef CONFIG_INTEGRITY_TRUSTED_KEYRING "_evm", "_ima", @@ -37,12 +37,6 @@ static const char *keyring_name[INTEGRITY_KEYRING_MAX] = { "_module", }; -#ifdef CONFIG_INTEGRITY_TRUSTED_KEYRING -static bool init_keyring __initdata = true; -#else -static bool init_keyring __initdata; -#endif - #ifdef CONFIG_IMA_KEYRINGS_PERMIT_SIGNED_BY_BUILTIN_OR_SECONDARY #define restrict_link_to_ima restrict_link_by_builtin_and_secondary_trusted #else @@ -85,7 +79,7 @@ int __init integrity_init_keyring(const unsigned int id) struct key_restriction *restriction; int err = 0; - if (!init_keyring) + if (!IS_ENABLED(CONFIG_INTEGRITY_TRUSTED_KEYRING)) return 0; restriction = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index 8a3905bb02c7..8c25f949ebdb 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -27,7 +27,7 @@ #define EVMKEY "evm-key" #define MAX_KEY_SIZE 128 static unsigned char evmkey[MAX_KEY_SIZE]; -static int evmkey_len = MAX_KEY_SIZE; +static const int evmkey_len = MAX_KEY_SIZE; struct crypto_shash *hmac_tfm; static struct crypto_shash *evm_tfm[HASH_ALGO__LAST]; @@ -38,7 +38,7 @@ static DEFINE_MUTEX(mutex); static unsigned long evm_set_key_flags; -static char * const evm_hmac = "hmac(sha1)"; +static const char evm_hmac[] = "hmac(sha1)"; /** * evm_set_key() - set EVM HMAC key from the kernel diff --git a/security/integrity/iint.c b/security/integrity/iint.c index 5a6810041e5c..1ea05da2323d 100644 --- a/security/integrity/iint.c +++ b/security/integrity/iint.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "integrity.h" static struct rb_root integrity_iint_tree = RB_ROOT; @@ -174,7 +175,10 @@ static int __init integrity_iintcache_init(void) 0, SLAB_PANIC, init_once); return 0; } -security_initcall(integrity_iintcache_init); +DEFINE_LSM(integrity) = { + .name = "integrity", + .init = integrity_iintcache_init, +}; /* diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 67db9d9454ca..cc12f3449a72 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -88,7 +88,7 @@ struct ima_template_desc { char *name; char *fmt; int num_fields; - struct ima_template_field **fields; + const struct ima_template_field **fields; }; struct ima_template_entry { diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index a02c5acfd403..99dd1d53fc35 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -51,7 +51,8 @@ int ima_alloc_init_template(struct ima_event_data *event_data, (*entry)->template_desc = template_desc; for (i = 0; i < template_desc->num_fields; i++) { - struct ima_template_field *field = template_desc->fields[i]; + const struct ima_template_field *field = + template_desc->fields[i]; u32 len; result = field->field_init(event_data, diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 7e7e7e7c250a..d9e7728027c6 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -210,7 +210,7 @@ static int ima_calc_file_hash_atfm(struct file *file, { loff_t i_size, offset; char *rbuf[2] = { NULL, }; - int rc, read = 0, rbuf_len, active = 0, ahash_rc = 0; + int rc, rbuf_len, active = 0, ahash_rc = 0; struct ahash_request *req; struct scatterlist sg[1]; struct crypto_wait wait; @@ -257,11 +257,6 @@ static int ima_calc_file_hash_atfm(struct file *file, &rbuf_size[1], 0); } - if (!(file->f_mode & FMODE_READ)) { - file->f_mode |= FMODE_READ; - read = 1; - } - for (offset = 0; offset < i_size; offset += rbuf_len) { if (!rbuf[1] && offset) { /* Not using two buffers, and it is not the first @@ -300,8 +295,6 @@ static int ima_calc_file_hash_atfm(struct file *file, /* wait for the last update request to complete */ rc = ahash_wait(ahash_rc, &wait); out3: - if (read) - file->f_mode &= ~FMODE_READ; ima_free_pages(rbuf[0], rbuf_size[0]); ima_free_pages(rbuf[1], rbuf_size[1]); out2: @@ -336,7 +329,7 @@ static int ima_calc_file_hash_tfm(struct file *file, { loff_t i_size, offset = 0; char *rbuf; - int rc, read = 0; + int rc; SHASH_DESC_ON_STACK(shash, tfm); shash->tfm = tfm; @@ -357,11 +350,6 @@ static int ima_calc_file_hash_tfm(struct file *file, if (!rbuf) return -ENOMEM; - if (!(file->f_mode & FMODE_READ)) { - file->f_mode |= FMODE_READ; - read = 1; - } - while (offset < i_size) { int rbuf_len; @@ -378,8 +366,6 @@ static int ima_calc_file_hash_tfm(struct file *file, if (rc) break; } - if (read) - file->f_mode &= ~FMODE_READ; kfree(rbuf); out: if (!rc) @@ -420,6 +406,8 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) { loff_t i_size; int rc; + struct file *f = file; + bool new_file_instance = false, modified_flags = false; /* * For consistency, fail file's opened with the O_DIRECT flag on @@ -431,15 +419,41 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) return -EINVAL; } - i_size = i_size_read(file_inode(file)); + /* Open a new file instance in O_RDONLY if we cannot read */ + if (!(file->f_mode & FMODE_READ)) { + int flags = file->f_flags & ~(O_WRONLY | O_APPEND | + O_TRUNC | O_CREAT | O_NOCTTY | O_EXCL); + flags |= O_RDONLY; + f = dentry_open(&file->f_path, flags, file->f_cred); + if (IS_ERR(f)) { + /* + * Cannot open the file again, lets modify f_flags + * of original and continue + */ + pr_info_ratelimited("Unable to reopen file for reading.\n"); + f = file; + f->f_flags |= FMODE_READ; + modified_flags = true; + } else { + new_file_instance = true; + } + } + + i_size = i_size_read(file_inode(f)); if (ima_ahash_minsize && i_size >= ima_ahash_minsize) { - rc = ima_calc_file_ahash(file, hash); + rc = ima_calc_file_ahash(f, hash); if (!rc) - return 0; + goto out; } - return ima_calc_file_shash(file, hash); + rc = ima_calc_file_shash(f, hash); +out: + if (new_file_instance) + fput(f); + else if (modified_flags) + f->f_flags &= ~FMODE_READ; + return rc; } /* diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index ae9d5c766a3c..3183cc23d0f8 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -42,14 +42,14 @@ static int __init default_canonical_fmt_setup(char *str) __setup("ima_canonical_fmt", default_canonical_fmt_setup); static int valid_policy = 1; -#define TMPBUFLEN 12 + static ssize_t ima_show_htable_value(char __user *buf, size_t count, loff_t *ppos, atomic_long_t *val) { - char tmpbuf[TMPBUFLEN]; + char tmpbuf[32]; /* greater than largest 'long' string value */ ssize_t len; - len = scnprintf(tmpbuf, TMPBUFLEN, "%li\n", atomic_long_read(val)); + len = scnprintf(tmpbuf, sizeof(tmpbuf), "%li\n", atomic_long_read(val)); return simple_read_from_buffer(buf, count, ppos, tmpbuf, len); } @@ -179,7 +179,8 @@ int ima_measurements_show(struct seq_file *m, void *v) /* 6th: template specific data */ for (i = 0; i < e->template_desc->num_fields; i++) { enum ima_show_type show = IMA_SHOW_BINARY; - struct ima_template_field *field = e->template_desc->fields[i]; + const struct ima_template_field *field = + e->template_desc->fields[i]; if (is_ima_template && strcmp(field->field_id, "d") == 0) show = IMA_SHOW_BINARY_NO_FIELD_LEN; diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c index faac9ecaa0ae..59d834219cd6 100644 --- a/security/integrity/ima/ima_init.c +++ b/security/integrity/ima/ima_init.c @@ -25,7 +25,7 @@ #include "ima.h" /* name for boot aggregate entry */ -static const char *boot_aggregate_name = "boot_aggregate"; +static const char boot_aggregate_name[] = "boot_aggregate"; struct tpm_chip *ima_tpm_chip; /* Add the boot aggregate to the IMA measurement list and extend diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 2d31921fbda4..1b88d58e1325 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -440,7 +440,7 @@ int ima_read_file(struct file *file, enum kernel_read_file_id read_id) return 0; } -static int read_idmap[READING_MAX_ID] = { +static const int read_idmap[READING_MAX_ID] = { [READING_FIRMWARE] = FIRMWARE_CHECK, [READING_FIRMWARE_PREALLOC_BUFFER] = FIRMWARE_CHECK, [READING_MODULE] = MODULE_CHECK, diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c index 30db39b23804..b631b8bc7624 100644 --- a/security/integrity/ima/ima_template.c +++ b/security/integrity/ima/ima_template.c @@ -32,7 +32,7 @@ static struct ima_template_desc builtin_templates[] = { static LIST_HEAD(defined_templates); static DEFINE_SPINLOCK(template_list); -static struct ima_template_field supported_fields[] = { +static const struct ima_template_field supported_fields[] = { {.field_id = "d", .field_init = ima_eventdigest_init, .field_show = ima_show_template_digest}, {.field_id = "n", .field_init = ima_eventname_init, @@ -49,7 +49,7 @@ static struct ima_template_field supported_fields[] = { static struct ima_template_desc *ima_template; static struct ima_template_desc *lookup_template_desc(const char *name); static int template_desc_init_fields(const char *template_fmt, - struct ima_template_field ***fields, + const struct ima_template_field ***fields, int *num_fields); static int __init ima_template_setup(char *str) @@ -125,7 +125,8 @@ static struct ima_template_desc *lookup_template_desc(const char *name) return found ? template_desc : NULL; } -static struct ima_template_field *lookup_template_field(const char *field_id) +static const struct ima_template_field * +lookup_template_field(const char *field_id) { int i; @@ -153,11 +154,11 @@ static int template_fmt_size(const char *template_fmt) } static int template_desc_init_fields(const char *template_fmt, - struct ima_template_field ***fields, + const struct ima_template_field ***fields, int *num_fields) { const char *template_fmt_ptr; - struct ima_template_field *found_fields[IMA_TEMPLATE_NUM_FIELDS_MAX]; + const struct ima_template_field *found_fields[IMA_TEMPLATE_NUM_FIELDS_MAX]; int template_num_fields; int i, len; diff --git a/security/keys/Makefile b/security/keys/Makefile index ef1581b337a3..9cef54064f60 100644 --- a/security/keys/Makefile +++ b/security/keys/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_SYSCTL) += sysctl.o obj-$(CONFIG_PERSISTENT_KEYRINGS) += persistent.o obj-$(CONFIG_KEY_DH_OPERATIONS) += dh.o +obj-$(CONFIG_ASYMMETRIC_KEY_TYPE) += keyctl_pkey.o # # Key types diff --git a/security/keys/compat.c b/security/keys/compat.c index e87c89c0177c..9482df601dc3 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -141,6 +141,24 @@ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option, return keyctl_restrict_keyring(arg2, compat_ptr(arg3), compat_ptr(arg4)); + case KEYCTL_PKEY_QUERY: + if (arg3 != 0) + return -EINVAL; + return keyctl_pkey_query(arg2, + compat_ptr(arg4), + compat_ptr(arg5)); + + case KEYCTL_PKEY_ENCRYPT: + case KEYCTL_PKEY_DECRYPT: + case KEYCTL_PKEY_SIGN: + return keyctl_pkey_e_d_s(option, + compat_ptr(arg2), compat_ptr(arg3), + compat_ptr(arg4), compat_ptr(arg5)); + + case KEYCTL_PKEY_VERIFY: + return keyctl_pkey_verify(compat_ptr(arg2), compat_ptr(arg3), + compat_ptr(arg4), compat_ptr(arg5)); + default: return -EOPNOTSUPP; } diff --git a/security/keys/internal.h b/security/keys/internal.h index 9f8208dc0e55..74cb0ff42fed 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -298,6 +298,45 @@ static inline long compat_keyctl_dh_compute( #endif #endif +#ifdef CONFIG_ASYMMETRIC_KEY_TYPE +extern long keyctl_pkey_query(key_serial_t, + const char __user *, + struct keyctl_pkey_query __user *); + +extern long keyctl_pkey_verify(const struct keyctl_pkey_params __user *, + const char __user *, + const void __user *, const void __user *); + +extern long keyctl_pkey_e_d_s(int, + const struct keyctl_pkey_params __user *, + const char __user *, + const void __user *, void __user *); +#else +static inline long keyctl_pkey_query(key_serial_t id, + const char __user *_info, + struct keyctl_pkey_query __user *_res) +{ + return -EOPNOTSUPP; +} + +static inline long keyctl_pkey_verify(const struct keyctl_pkey_params __user *params, + const char __user *_info, + const void __user *_in, + const void __user *_in2) +{ + return -EOPNOTSUPP; +} + +static inline long keyctl_pkey_e_d_s(int op, + const struct keyctl_pkey_params __user *params, + const char __user *_info, + const void __user *_in, + void __user *_out) +{ + return -EOPNOTSUPP; +} +#endif + /* * Debugging key validation */ diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 1ffe60bb2845..18619690ce77 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1747,6 +1747,30 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, (const char __user *) arg3, (const char __user *) arg4); + case KEYCTL_PKEY_QUERY: + if (arg3 != 0) + return -EINVAL; + return keyctl_pkey_query((key_serial_t)arg2, + (const char __user *)arg4, + (struct keyctl_pkey_query *)arg5); + + case KEYCTL_PKEY_ENCRYPT: + case KEYCTL_PKEY_DECRYPT: + case KEYCTL_PKEY_SIGN: + return keyctl_pkey_e_d_s( + option, + (const struct keyctl_pkey_params __user *)arg2, + (const char __user *)arg3, + (const void __user *)arg4, + (void __user *)arg5); + + case KEYCTL_PKEY_VERIFY: + return keyctl_pkey_verify( + (const struct keyctl_pkey_params __user *)arg2, + (const char __user *)arg3, + (const void __user *)arg4, + (const void __user *)arg5); + default: return -EOPNOTSUPP; } diff --git a/security/keys/keyctl_pkey.c b/security/keys/keyctl_pkey.c new file mode 100644 index 000000000000..783978842f13 --- /dev/null +++ b/security/keys/keyctl_pkey.c @@ -0,0 +1,323 @@ +/* Public-key operation keyctls + * + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static void keyctl_pkey_params_free(struct kernel_pkey_params *params) +{ + kfree(params->info); + key_put(params->key); +} + +enum { + Opt_err = -1, + Opt_enc, /* "enc=" eg. "enc=oaep" */ + Opt_hash, /* "hash=" eg. "hash=sha1" */ +}; + +static const match_table_t param_keys = { + { Opt_enc, "enc=%s" }, + { Opt_hash, "hash=%s" }, + { Opt_err, NULL } +}; + +/* + * Parse the information string which consists of key=val pairs. + */ +static int keyctl_pkey_params_parse(struct kernel_pkey_params *params) +{ + unsigned long token_mask = 0; + substring_t args[MAX_OPT_ARGS]; + char *c = params->info, *p, *q; + int token; + + while ((p = strsep(&c, " \t"))) { + if (*p == '\0' || *p == ' ' || *p == '\t') + continue; + token = match_token(p, param_keys, args); + if (__test_and_set_bit(token, &token_mask)) + return -EINVAL; + q = args[0].from; + if (!q[0]) + return -EINVAL; + + switch (token) { + case Opt_enc: + params->encoding = q; + break; + + case Opt_hash: + params->hash_algo = q; + break; + + default: + return -EINVAL; + } + } + + return 0; +} + +/* + * Interpret parameters. Callers must always call the free function + * on params, even if an error is returned. + */ +static int keyctl_pkey_params_get(key_serial_t id, + const char __user *_info, + struct kernel_pkey_params *params) +{ + key_ref_t key_ref; + void *p; + int ret; + + memset(params, 0, sizeof(*params)); + params->encoding = "raw"; + + p = strndup_user(_info, PAGE_SIZE); + if (IS_ERR(p)) + return PTR_ERR(p); + params->info = p; + + ret = keyctl_pkey_params_parse(params); + if (ret < 0) + return ret; + + key_ref = lookup_user_key(id, 0, KEY_NEED_SEARCH); + if (IS_ERR(key_ref)) + return PTR_ERR(key_ref); + params->key = key_ref_to_ptr(key_ref); + + if (!params->key->type->asym_query) + return -EOPNOTSUPP; + + return 0; +} + +/* + * Get parameters from userspace. Callers must always call the free function + * on params, even if an error is returned. + */ +static int keyctl_pkey_params_get_2(const struct keyctl_pkey_params __user *_params, + const char __user *_info, + int op, + struct kernel_pkey_params *params) +{ + struct keyctl_pkey_params uparams; + struct kernel_pkey_query info; + int ret; + + memset(params, 0, sizeof(*params)); + params->encoding = "raw"; + + if (copy_from_user(&uparams, _params, sizeof(uparams)) != 0) + return -EFAULT; + + ret = keyctl_pkey_params_get(uparams.key_id, _info, params); + if (ret < 0) + return ret; + + ret = params->key->type->asym_query(params, &info); + if (ret < 0) + return ret; + + switch (op) { + case KEYCTL_PKEY_ENCRYPT: + case KEYCTL_PKEY_DECRYPT: + if (uparams.in_len > info.max_enc_size || + uparams.out_len > info.max_dec_size) + return -EINVAL; + break; + case KEYCTL_PKEY_SIGN: + case KEYCTL_PKEY_VERIFY: + if (uparams.in_len > info.max_sig_size || + uparams.out_len > info.max_data_size) + return -EINVAL; + break; + default: + BUG(); + } + + params->in_len = uparams.in_len; + params->out_len = uparams.out_len; + return 0; +} + +/* + * Query information about an asymmetric key. + */ +long keyctl_pkey_query(key_serial_t id, + const char __user *_info, + struct keyctl_pkey_query __user *_res) +{ + struct kernel_pkey_params params; + struct kernel_pkey_query res; + long ret; + + memset(¶ms, 0, sizeof(params)); + + ret = keyctl_pkey_params_get(id, _info, ¶ms); + if (ret < 0) + goto error; + + ret = params.key->type->asym_query(¶ms, &res); + if (ret < 0) + goto error; + + ret = -EFAULT; + if (copy_to_user(_res, &res, sizeof(res)) == 0 && + clear_user(_res->__spare, sizeof(_res->__spare)) == 0) + ret = 0; + +error: + keyctl_pkey_params_free(¶ms); + return ret; +} + +/* + * Encrypt/decrypt/sign + * + * Encrypt data, decrypt data or sign data using a public key. + * + * _info is a string of supplementary information in key=val format. For + * instance, it might contain: + * + * "enc=pkcs1 hash=sha256" + * + * where enc= specifies the encoding and hash= selects the OID to go in that + * particular encoding if required. If enc= isn't supplied, it's assumed that + * the caller is supplying raw values. + * + * If successful, the amount of data written into the output buffer is + * returned. + */ +long keyctl_pkey_e_d_s(int op, + const struct keyctl_pkey_params __user *_params, + const char __user *_info, + const void __user *_in, + void __user *_out) +{ + struct kernel_pkey_params params; + void *in, *out; + long ret; + + ret = keyctl_pkey_params_get_2(_params, _info, op, ¶ms); + if (ret < 0) + goto error_params; + + ret = -EOPNOTSUPP; + if (!params.key->type->asym_eds_op) + goto error_params; + + switch (op) { + case KEYCTL_PKEY_ENCRYPT: + params.op = kernel_pkey_encrypt; + break; + case KEYCTL_PKEY_DECRYPT: + params.op = kernel_pkey_decrypt; + break; + case KEYCTL_PKEY_SIGN: + params.op = kernel_pkey_sign; + break; + default: + BUG(); + } + + in = memdup_user(_in, params.in_len); + if (IS_ERR(in)) { + ret = PTR_ERR(in); + goto error_params; + } + + ret = -ENOMEM; + out = kmalloc(params.out_len, GFP_KERNEL); + if (!out) + goto error_in; + + ret = params.key->type->asym_eds_op(¶ms, in, out); + if (ret < 0) + goto error_out; + + if (copy_to_user(_out, out, ret) != 0) + ret = -EFAULT; + +error_out: + kfree(out); +error_in: + kfree(in); +error_params: + keyctl_pkey_params_free(¶ms); + return ret; +} + +/* + * Verify a signature. + * + * Verify a public key signature using the given key, or if not given, search + * for a matching key. + * + * _info is a string of supplementary information in key=val format. For + * instance, it might contain: + * + * "enc=pkcs1 hash=sha256" + * + * where enc= specifies the signature blob encoding and hash= selects the OID + * to go in that particular encoding. If enc= isn't supplied, it's assumed + * that the caller is supplying raw values. + * + * If successful, 0 is returned. + */ +long keyctl_pkey_verify(const struct keyctl_pkey_params __user *_params, + const char __user *_info, + const void __user *_in, + const void __user *_in2) +{ + struct kernel_pkey_params params; + void *in, *in2; + long ret; + + ret = keyctl_pkey_params_get_2(_params, _info, KEYCTL_PKEY_VERIFY, + ¶ms); + if (ret < 0) + goto error_params; + + ret = -EOPNOTSUPP; + if (!params.key->type->asym_verify_signature) + goto error_params; + + in = memdup_user(_in, params.in_len); + if (IS_ERR(in)) { + ret = PTR_ERR(in); + goto error_params; + } + + in2 = memdup_user(_in2, params.in2_len); + if (IS_ERR(in2)) { + ret = PTR_ERR(in2); + goto error_in; + } + + params.op = kernel_pkey_verify; + ret = params.key->type->asym_verify_signature(¶ms, in, in2); + + kfree(in2); +error_in: + kfree(in); +error_params: + keyctl_pkey_params_free(¶ms); + return ret; +} diff --git a/security/keys/trusted.c b/security/keys/trusted.c index b69d3b1777c2..ff6789365a12 100644 --- a/security/keys/trusted.c +++ b/security/keys/trusted.c @@ -30,7 +30,7 @@ #include #include -#include "trusted.h" +#include static const char hmac_alg[] = "hmac(sha1)"; static const char hash_alg[] = "sha1"; @@ -121,7 +121,7 @@ out: /* * calculate authorization info fields to send to TPM */ -static int TSS_authhmac(unsigned char *digest, const unsigned char *key, +int TSS_authhmac(unsigned char *digest, const unsigned char *key, unsigned int keylen, unsigned char *h1, unsigned char *h2, unsigned char h3, ...) { @@ -168,11 +168,12 @@ out: kzfree(sdesc); return ret; } +EXPORT_SYMBOL_GPL(TSS_authhmac); /* * verify the AUTH1_COMMAND (Seal) result from TPM */ -static int TSS_checkhmac1(unsigned char *buffer, +int TSS_checkhmac1(unsigned char *buffer, const uint32_t command, const unsigned char *ononce, const unsigned char *key, @@ -249,6 +250,7 @@ out: kzfree(sdesc); return ret; } +EXPORT_SYMBOL_GPL(TSS_checkhmac1); /* * verify the AUTH2_COMMAND (unseal) result from TPM @@ -355,7 +357,7 @@ out: * For key specific tpm requests, we will generate and send our * own TPM command packets using the drivers send function. */ -static int trusted_tpm_send(unsigned char *cmd, size_t buflen) +int trusted_tpm_send(unsigned char *cmd, size_t buflen) { int rc; @@ -367,6 +369,7 @@ static int trusted_tpm_send(unsigned char *cmd, size_t buflen) rc = -EPERM; return rc; } +EXPORT_SYMBOL_GPL(trusted_tpm_send); /* * Lock a trusted key, by extending a selected PCR. @@ -425,7 +428,7 @@ static int osap(struct tpm_buf *tb, struct osapsess *s, /* * Create an object independent authorisation protocol (oiap) session */ -static int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce) +int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce) { int ret; @@ -442,6 +445,7 @@ static int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce) TPM_NONCE_SIZE); return 0; } +EXPORT_SYMBOL_GPL(oiap); struct tpm_digests { unsigned char encauth[SHA1_DIGEST_SIZE]; diff --git a/security/loadpin/Kconfig b/security/loadpin/Kconfig index dd01aa91e521..a0d70d82b98e 100644 --- a/security/loadpin/Kconfig +++ b/security/loadpin/Kconfig @@ -10,10 +10,10 @@ config SECURITY_LOADPIN have a root filesystem backed by a read-only device such as dm-verity or a CDROM. -config SECURITY_LOADPIN_ENABLED +config SECURITY_LOADPIN_ENFORCE bool "Enforce LoadPin at boot" depends on SECURITY_LOADPIN help If selected, LoadPin will enforce pinning at boot. If not selected, it can be enabled at boot with the kernel parameter - "loadpin.enabled=1". + "loadpin.enforce=1". diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c index 0716af28808a..48f39631b370 100644 --- a/security/loadpin/loadpin.c +++ b/security/loadpin/loadpin.c @@ -44,7 +44,7 @@ static void report_load(const char *origin, struct file *file, char *operation) kfree(pathname); } -static int enabled = IS_ENABLED(CONFIG_SECURITY_LOADPIN_ENABLED); +static int enforce = IS_ENABLED(CONFIG_SECURITY_LOADPIN_ENFORCE); static struct super_block *pinned_root; static DEFINE_SPINLOCK(pinned_root_spinlock); @@ -60,8 +60,8 @@ static struct ctl_path loadpin_sysctl_path[] = { static struct ctl_table loadpin_sysctl_table[] = { { - .procname = "enabled", - .data = &enabled, + .procname = "enforce", + .data = &enforce, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -84,8 +84,11 @@ static void check_pinning_enforcement(struct super_block *mnt_sb) * device, allow sysctl to change modes for testing. */ if (mnt_sb->s_bdev) { + char bdev[BDEVNAME_SIZE]; + ro = bdev_read_only(mnt_sb->s_bdev); - pr_info("dev(%u,%u): %s\n", + bdevname(mnt_sb->s_bdev, bdev); + pr_info("%s (%u:%u): %s\n", bdev, MAJOR(mnt_sb->s_bdev->bd_dev), MINOR(mnt_sb->s_bdev->bd_dev), ro ? "read-only" : "writable"); @@ -97,7 +100,7 @@ static void check_pinning_enforcement(struct super_block *mnt_sb) loadpin_sysctl_table)) pr_notice("sysctl registration failed!\n"); else - pr_info("load pinning can be disabled.\n"); + pr_info("enforcement can be disabled.\n"); } else pr_info("load pinning engaged.\n"); } @@ -128,7 +131,7 @@ static int loadpin_read_file(struct file *file, enum kernel_read_file_id id) /* This handles the older init_module API that has a NULL file. */ if (!file) { - if (!enabled) { + if (!enforce) { report_load(origin, NULL, "old-api-pinning-ignored"); return 0; } @@ -151,7 +154,7 @@ static int loadpin_read_file(struct file *file, enum kernel_read_file_id id) * Unlock now since it's only pinned_root we care about. * In the worst case, we will (correctly) report pinning * failures before we have announced that pinning is - * enabled. This would be purely cosmetic. + * enforcing. This would be purely cosmetic. */ spin_unlock(&pinned_root_spinlock); check_pinning_enforcement(pinned_root); @@ -161,7 +164,7 @@ static int loadpin_read_file(struct file *file, enum kernel_read_file_id id) } if (IS_ERR_OR_NULL(pinned_root) || load_root != pinned_root) { - if (unlikely(!enabled)) { + if (unlikely(!enforce)) { report_load(origin, file, "pinning-ignored"); return 0; } @@ -186,10 +189,11 @@ static struct security_hook_list loadpin_hooks[] __lsm_ro_after_init = { void __init loadpin_add_hooks(void) { - pr_info("ready to pin (currently %sabled)", enabled ? "en" : "dis"); + pr_info("ready to pin (currently %senforcing)\n", + enforce ? "" : "not "); security_add_hooks(loadpin_hooks, ARRAY_SIZE(loadpin_hooks), "loadpin"); } /* Should not be mutable after boot, so not listed in sysfs (perm == 0). */ -module_param(enabled, int, 0); -MODULE_PARM_DESC(enabled, "Pin module/firmware loading (default: true)"); +module_param(enforce, int, 0); +MODULE_PARM_DESC(enforce, "Enforce module/firmware pinning"); diff --git a/security/security.c b/security/security.c index 736e78da1ab9..04d173eb93f6 100644 --- a/security/security.c +++ b/security/security.c @@ -12,6 +12,8 @@ * (at your option) any later version. */ +#define pr_fmt(fmt) "LSM: " fmt + #include #include #include @@ -30,8 +32,6 @@ #include #include -#include - #define MAX_LSM_EVM_XATTR 2 /* Maximum number of letters for an LSM name string */ @@ -45,20 +45,22 @@ char *lsm_names; static __initdata char chosen_lsm[SECURITY_NAME_MAX + 1] = CONFIG_DEFAULT_SECURITY; -static void __init do_security_initcalls(void) +static __initdata bool debug; +#define init_debug(...) \ + do { \ + if (debug) \ + pr_info(__VA_ARGS__); \ + } while (0) + +static void __init major_lsm_init(void) { + struct lsm_info *lsm; int ret; - initcall_t call; - initcall_entry_t *ce; - - ce = __security_initcall_start; - trace_initcall_level("security"); - while (ce < __security_initcall_end) { - call = initcall_from_entry(ce); - trace_initcall_start(call); - ret = call(); - trace_initcall_finish(call, ret); - ce++; + + for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { + init_debug("initializing %s\n", lsm->name); + ret = lsm->init(); + WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret); } } @@ -72,10 +74,11 @@ int __init security_init(void) int i; struct hlist_head *list = (struct hlist_head *) &security_hook_heads; + pr_info("Security Framework initializing\n"); + for (i = 0; i < sizeof(security_hook_heads) / sizeof(struct hlist_head); i++) INIT_HLIST_HEAD(&list[i]); - pr_info("Security Framework initialized\n"); /* * Load minor LSMs, with the capability module always first. @@ -87,7 +90,7 @@ int __init security_init(void) /* * Load all the remaining security modules. */ - do_security_initcalls(); + major_lsm_init(); return 0; } @@ -100,6 +103,14 @@ static int __init choose_lsm(char *str) } __setup("security=", choose_lsm); +/* Enable LSM order debugging. */ +static int __init enable_debug(char *str) +{ + debug = true; + return 1; +} +__setup("lsm.debug", enable_debug); + static bool match_last_lsm(const char *list, const char *lsm) { const char *last; @@ -1147,7 +1158,7 @@ int security_task_movememory(struct task_struct *p) return call_int_hook(task_movememory, 0, p); } -int security_task_kill(struct task_struct *p, struct siginfo *info, +int security_task_kill(struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) { return call_int_hook(task_kill, 0, p, info, sig, cred); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index ad9a9b8e9979..7ce683259357 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1508,6 +1508,11 @@ static int selinux_genfs_get_sid(struct dentry *dentry, } rc = security_genfs_sid(&selinux_state, sb->s_type->name, path, tclass, sid); + if (rc == -ENOENT) { + /* No match in policy, mark as unlabeled. */ + *sid = SECINITSID_UNLABELED; + rc = 0; + } } free_page((unsigned long)buffer); return rc; @@ -4186,7 +4191,7 @@ static int selinux_task_movememory(struct task_struct *p) PROCESS__SETSCHED, NULL); } -static int selinux_task_kill(struct task_struct *p, struct siginfo *info, +static int selinux_task_kill(struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) { u32 secid; @@ -7202,7 +7207,10 @@ void selinux_complete_init(void) /* SELinux requires early initialization in order to label all processes and objects when they are created. */ -security_initcall(selinux_init); +DEFINE_LSM(selinux) = { + .name = "selinux", + .init = selinux_init, +}; #if defined(CONFIG_NETFILTER) diff --git a/security/selinux/ss/mls.c b/security/selinux/ss/mls.c index 39475fb455bc..2fe459df3c85 100644 --- a/security/selinux/ss/mls.c +++ b/security/selinux/ss/mls.c @@ -218,9 +218,7 @@ int mls_context_isvalid(struct policydb *p, struct context *c) /* * Set the MLS fields in the security context structure * `context' based on the string representation in - * the string `*scontext'. Update `*scontext' to - * point to the end of the string representation of - * the MLS fields. + * the string `scontext'. * * This function modifies the string in place, inserting * NULL characters to terminate the MLS fields. @@ -235,22 +233,21 @@ int mls_context_isvalid(struct policydb *p, struct context *c) */ int mls_context_to_sid(struct policydb *pol, char oldc, - char **scontext, + char *scontext, struct context *context, struct sidtab *s, u32 def_sid) { - - char delim; - char *scontextp, *p, *rngptr; + char *sensitivity, *cur_cat, *next_cat, *rngptr; struct level_datum *levdatum; struct cat_datum *catdatum, *rngdatum; - int l, rc = -EINVAL; + int l, rc, i; + char *rangep[2]; if (!pol->mls_enabled) { - if (def_sid != SECSID_NULL && oldc) - *scontext += strlen(*scontext) + 1; - return 0; + if ((def_sid != SECSID_NULL && oldc) || (*scontext) == '\0') + return 0; + return -EINVAL; } /* @@ -261,113 +258,94 @@ int mls_context_to_sid(struct policydb *pol, struct context *defcon; if (def_sid == SECSID_NULL) - goto out; + return -EINVAL; defcon = sidtab_search(s, def_sid); if (!defcon) - goto out; + return -EINVAL; - rc = mls_context_cpy(context, defcon); - goto out; + return mls_context_cpy(context, defcon); } - /* Extract low sensitivity. */ - scontextp = p = *scontext; - while (*p && *p != ':' && *p != '-') - p++; - - delim = *p; - if (delim != '\0') - *p++ = '\0'; + /* + * If we're dealing with a range, figure out where the two parts + * of the range begin. + */ + rangep[0] = scontext; + rangep[1] = strchr(scontext, '-'); + if (rangep[1]) { + rangep[1][0] = '\0'; + rangep[1]++; + } + /* For each part of the range: */ for (l = 0; l < 2; l++) { - levdatum = hashtab_search(pol->p_levels.table, scontextp); - if (!levdatum) { - rc = -EINVAL; - goto out; - } + /* Split sensitivity and category set. */ + sensitivity = rangep[l]; + if (sensitivity == NULL) + break; + next_cat = strchr(sensitivity, ':'); + if (next_cat) + *(next_cat++) = '\0'; + /* Parse sensitivity. */ + levdatum = hashtab_search(pol->p_levels.table, sensitivity); + if (!levdatum) + return -EINVAL; context->range.level[l].sens = levdatum->level->sens; - if (delim == ':') { - /* Extract category set. */ - while (1) { - scontextp = p; - while (*p && *p != ',' && *p != '-') - p++; - delim = *p; - if (delim != '\0') - *p++ = '\0'; - - /* Separate into range if exists */ - rngptr = strchr(scontextp, '.'); - if (rngptr != NULL) { - /* Remove '.' */ - *rngptr++ = '\0'; - } + /* Extract category set. */ + while (next_cat != NULL) { + cur_cat = next_cat; + next_cat = strchr(next_cat, ','); + if (next_cat != NULL) + *(next_cat++) = '\0'; + + /* Separate into range if exists */ + rngptr = strchr(cur_cat, '.'); + if (rngptr != NULL) { + /* Remove '.' */ + *rngptr++ = '\0'; + } - catdatum = hashtab_search(pol->p_cats.table, - scontextp); - if (!catdatum) { - rc = -EINVAL; - goto out; - } + catdatum = hashtab_search(pol->p_cats.table, cur_cat); + if (!catdatum) + return -EINVAL; - rc = ebitmap_set_bit(&context->range.level[l].cat, - catdatum->value - 1, 1); - if (rc) - goto out; - - /* If range, set all categories in range */ - if (rngptr) { - int i; - - rngdatum = hashtab_search(pol->p_cats.table, rngptr); - if (!rngdatum) { - rc = -EINVAL; - goto out; - } - - if (catdatum->value >= rngdatum->value) { - rc = -EINVAL; - goto out; - } - - for (i = catdatum->value; i < rngdatum->value; i++) { - rc = ebitmap_set_bit(&context->range.level[l].cat, i, 1); - if (rc) - goto out; - } - } + rc = ebitmap_set_bit(&context->range.level[l].cat, + catdatum->value - 1, 1); + if (rc) + return rc; + + /* If range, set all categories in range */ + if (rngptr == NULL) + continue; + + rngdatum = hashtab_search(pol->p_cats.table, rngptr); + if (!rngdatum) + return -EINVAL; + + if (catdatum->value >= rngdatum->value) + return -EINVAL; - if (delim != ',') - break; + for (i = catdatum->value; i < rngdatum->value; i++) { + rc = ebitmap_set_bit(&context->range.level[l].cat, i, 1); + if (rc) + return rc; } } - if (delim == '-') { - /* Extract high sensitivity. */ - scontextp = p; - while (*p && *p != ':') - p++; - - delim = *p; - if (delim != '\0') - *p++ = '\0'; - } else - break; } - if (l == 0) { + /* If we didn't see a '-', the range start is also the range end. */ + if (rangep[1] == NULL) { context->range.level[1].sens = context->range.level[0].sens; rc = ebitmap_cpy(&context->range.level[1].cat, &context->range.level[0].cat); if (rc) - goto out; + return rc; } - *scontext = ++p; - rc = 0; -out: - return rc; + + return 0; } /* @@ -379,21 +357,19 @@ out: int mls_from_string(struct policydb *p, char *str, struct context *context, gfp_t gfp_mask) { - char *tmpstr, *freestr; + char *tmpstr; int rc; if (!p->mls_enabled) return -EINVAL; - /* we need freestr because mls_context_to_sid will change - the value of tmpstr */ - tmpstr = freestr = kstrdup(str, gfp_mask); + tmpstr = kstrdup(str, gfp_mask); if (!tmpstr) { rc = -ENOMEM; } else { - rc = mls_context_to_sid(p, ':', &tmpstr, context, + rc = mls_context_to_sid(p, ':', tmpstr, context, NULL, SECSID_NULL); - kfree(freestr); + kfree(tmpstr); } return rc; diff --git a/security/selinux/ss/mls.h b/security/selinux/ss/mls.h index 9a3ff7af70ad..67093647576d 100644 --- a/security/selinux/ss/mls.h +++ b/security/selinux/ss/mls.h @@ -34,7 +34,7 @@ int mls_level_isvalid(struct policydb *p, struct mls_level *l); int mls_context_to_sid(struct policydb *p, char oldc, - char **scontext, + char *scontext, struct context *context, struct sidtab *s, u32 def_sid); diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index e9394e7adc84..f4eadd3f7350 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -1101,7 +1101,7 @@ static int str_read(char **strp, gfp_t flags, void *fp, u32 len) if ((len == 0) || (len == (u32)-1)) return -EINVAL; - str = kmalloc(len + 1, flags); + str = kmalloc(len + 1, flags | __GFP_NOWARN); if (!str) return -ENOMEM; diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index f3def298a90e..12e414394530 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -1365,7 +1365,6 @@ int security_sid_to_context_force(struct selinux_state *state, u32 sid, static int string_to_context_struct(struct policydb *pol, struct sidtab *sidtabp, char *scontext, - u32 scontext_len, struct context *ctx, u32 def_sid) { @@ -1426,15 +1425,12 @@ static int string_to_context_struct(struct policydb *pol, ctx->type = typdatum->value; - rc = mls_context_to_sid(pol, oldc, &p, ctx, sidtabp, def_sid); + rc = mls_context_to_sid(pol, oldc, p, ctx, sidtabp, def_sid); if (rc) goto out; - rc = -EINVAL; - if ((p - scontext) < scontext_len) - goto out; - /* Check the validity of the new context. */ + rc = -EINVAL; if (!policydb_context_isvalid(pol, ctx)) goto out; rc = 0; @@ -1489,7 +1485,7 @@ static int security_context_to_sid_core(struct selinux_state *state, policydb = &state->ss->policydb; sidtab = &state->ss->sidtab; rc = string_to_context_struct(policydb, sidtab, scontext2, - scontext_len, &context, def_sid); + &context, def_sid); if (rc == -EINVAL && force) { context.str = str; context.len = strlen(str) + 1; @@ -1958,7 +1954,7 @@ static int convert_context(u32 key, goto out; rc = string_to_context_struct(args->newp, NULL, s, - c->len, &ctx, SECSID_NULL); + &ctx, SECSID_NULL); kfree(s); if (!rc) { pr_info("SELinux: Context %s became valid (mapped).\n", diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 340fc30ad85d..81fb4c1631e9 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -421,6 +421,7 @@ static int smk_ptrace_rule_check(struct task_struct *tracer, struct smk_audit_info ad, *saip = NULL; struct task_smack *tsp; struct smack_known *tracer_known; + const struct cred *tracercred; if ((mode & PTRACE_MODE_NOAUDIT) == 0) { smk_ad_init(&ad, func, LSM_AUDIT_DATA_TASK); @@ -429,7 +430,8 @@ static int smk_ptrace_rule_check(struct task_struct *tracer, } rcu_read_lock(); - tsp = __task_cred(tracer)->security; + tracercred = __task_cred(tracer); + tsp = tracercred->security; tracer_known = smk_of_task(tsp); if ((mode & PTRACE_MODE_ATTACH) && @@ -439,7 +441,7 @@ static int smk_ptrace_rule_check(struct task_struct *tracer, rc = 0; else if (smack_ptrace_rule == SMACK_PTRACE_DRACONIAN) rc = -EACCES; - else if (capable(CAP_SYS_PTRACE)) + else if (smack_privileged_cred(CAP_SYS_PTRACE, tracercred)) rc = 0; else rc = -EACCES; @@ -1841,6 +1843,7 @@ static int smack_file_send_sigiotask(struct task_struct *tsk, { struct smack_known *skp; struct smack_known *tkp = smk_of_task(tsk->cred->security); + const struct cred *tcred; struct file *file; int rc; struct smk_audit_info ad; @@ -1854,8 +1857,12 @@ static int smack_file_send_sigiotask(struct task_struct *tsk, skp = file->f_security; rc = smk_access(skp, tkp, MAY_DELIVER, NULL); rc = smk_bu_note("sigiotask", skp, tkp, MAY_DELIVER, rc); - if (rc != 0 && has_capability(tsk, CAP_MAC_OVERRIDE)) + + rcu_read_lock(); + tcred = __task_cred(tsk); + if (rc != 0 && smack_privileged_cred(CAP_MAC_OVERRIDE, tcred)) rc = 0; + rcu_read_unlock(); smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK); smk_ad_setfield_u_tsk(&ad, tsk); @@ -2251,7 +2258,7 @@ static int smack_task_movememory(struct task_struct *p) * Return 0 if write access is permitted * */ -static int smack_task_kill(struct task_struct *p, struct siginfo *info, +static int smack_task_kill(struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) { struct smk_audit_info ad; @@ -3467,7 +3474,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) */ final = &smack_known_star; /* - * No break. + * Fall through. * * If a smack value has been set we want to use it, * but since tmpfs isn't giving us the opportunity @@ -4882,4 +4889,7 @@ static __init int smack_init(void) * Smack requires early initialization in order to label * all processes and objects when they are created. */ -security_initcall(smack_init); +DEFINE_LSM(smack) = { + .name = "smack", + .init = smack_init, +}; diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index f6482e53d55a..06b517075ec0 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -2853,7 +2853,6 @@ static const struct file_operations smk_ptrace_ops = { static int smk_fill_super(struct super_block *sb, void *data, int silent) { int rc; - struct inode *root_inode; static const struct tree_descr smack_files[] = { [SMK_LOAD] = { @@ -2917,8 +2916,6 @@ static int smk_fill_super(struct super_block *sb, void *data, int silent) return rc; } - root_inode = d_inode(sb->s_root); - return 0; } diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index 03923a138ef5..9b38f94b5dd0 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -1660,7 +1660,8 @@ static void tomoyo_read_pid(struct tomoyo_io_buffer *head) head->r.eof = true; if (tomoyo_str_starts(&buf, "global-pid ")) global_pid = true; - pid = (unsigned int) simple_strtoul(buf, NULL, 10); + if (kstrtouint(buf, 10, &pid)) + return; rcu_read_lock(); if (global_pid) p = find_task_by_pid_ns(pid, &init_pid_ns); diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index 9f932e2d6852..1b5b5097efd7 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -550,4 +550,7 @@ static int __init tomoyo_init(void) return 0; } -security_initcall(tomoyo_init); +DEFINE_LSM(tomoyo) = { + .name = "tomoyo", + .init = tomoyo_init, +}; diff --git a/sound/aoa/soundbus/i2sbus/core.c b/sound/aoa/soundbus/i2sbus/core.c index 000b58522106..bd7c5029fc59 100644 --- a/sound/aoa/soundbus/i2sbus/core.c +++ b/sound/aoa/soundbus/i2sbus/core.c @@ -157,18 +157,19 @@ static int i2sbus_add_dev(struct macio_dev *macio, struct device_node *child = NULL, *sound = NULL; struct resource *r; int i, layout = 0, rlen, ok = force; - static const char *rnames[] = { "i2sbus: %s (control)", - "i2sbus: %s (tx)", - "i2sbus: %s (rx)" }; + char node_name[6]; + static const char *rnames[] = { "i2sbus: %pOFn (control)", + "i2sbus: %pOFn (tx)", + "i2sbus: %pOFn (rx)" }; static irq_handler_t ints[] = { i2sbus_bus_intr, i2sbus_tx_intr, i2sbus_rx_intr }; - if (strlen(np->name) != 5) + if (snprintf(node_name, sizeof(node_name), "%pOFn", np) != 5) return 0; - if (strncmp(np->name, "i2s-", 4)) + if (strncmp(node_name, "i2s-", 4)) return 0; dev = kzalloc(sizeof(struct i2sbus_dev), GFP_KERNEL); @@ -228,13 +229,13 @@ static int i2sbus_add_dev(struct macio_dev *macio, dev->sound.pcmid = -1; dev->macio = macio; dev->control = control; - dev->bus_number = np->name[4] - 'a'; + dev->bus_number = node_name[4] - 'a'; INIT_LIST_HEAD(&dev->sound.codec_list); for (i = aoa_resource_i2smmio; i <= aoa_resource_rxdbdma; i++) { dev->interrupts[i] = -1; snprintf(dev->rnames[i], sizeof(dev->rnames[i]), - rnames[i], np->name); + rnames[i], np); } for (i = aoa_resource_i2smmio; i <= aoa_resource_rxdbdma; i++) { int irq = irq_of_parse_and_map(np, i); diff --git a/sound/arm/Kconfig b/sound/arm/Kconfig index 5fbd47a9177e..28867732a318 100644 --- a/sound/arm/Kconfig +++ b/sound/arm/Kconfig @@ -31,7 +31,6 @@ endif # SND_ARM config SND_PXA2XX_LIB tristate - select SND_AC97_CODEC if SND_PXA2XX_LIB_AC97 select SND_DMAENGINE_PCM config SND_PXA2XX_LIB_AC97 diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index 753d5fc4b284..59a4adc286ed 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -25,6 +25,9 @@ #include #include #include +#ifdef CONFIG_X86 +#include +#endif #include /* @@ -82,31 +85,32 @@ EXPORT_SYMBOL(snd_free_pages); #ifdef CONFIG_HAS_DMA /* allocate the coherent DMA pages */ -static void *snd_malloc_dev_pages(struct device *dev, size_t size, dma_addr_t *dma) +static void snd_malloc_dev_pages(struct snd_dma_buffer *dmab, size_t size) { - int pg; gfp_t gfp_flags; - if (WARN_ON(!dma)) - return NULL; - pg = get_order(size); gfp_flags = GFP_KERNEL | __GFP_COMP /* compound page lets parts be mapped */ | __GFP_NORETRY /* don't trigger OOM-killer */ | __GFP_NOWARN; /* no stack trace print - this call is non-critical */ - return dma_alloc_coherent(dev, PAGE_SIZE << pg, dma, gfp_flags); + dmab->area = dma_alloc_coherent(dmab->dev.dev, size, &dmab->addr, + gfp_flags); +#ifdef CONFIG_X86 + if (dmab->area && dmab->dev.type == SNDRV_DMA_TYPE_DEV_UC) + set_memory_wc((unsigned long)dmab->area, + PAGE_ALIGN(size) >> PAGE_SHIFT); +#endif } /* free the coherent DMA pages */ -static void snd_free_dev_pages(struct device *dev, size_t size, void *ptr, - dma_addr_t dma) +static void snd_free_dev_pages(struct snd_dma_buffer *dmab) { - int pg; - - if (ptr == NULL) - return; - pg = get_order(size); - dma_free_coherent(dev, PAGE_SIZE << pg, ptr, dma); +#ifdef CONFIG_X86 + if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_UC) + set_memory_wb((unsigned long)dmab->area, + PAGE_ALIGN(dmab->bytes) >> PAGE_SHIFT); +#endif + dma_free_coherent(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr); } #ifdef CONFIG_GENERIC_ALLOCATOR @@ -199,12 +203,15 @@ int snd_dma_alloc_pages(int type, struct device *device, size_t size, */ dmab->dev.type = SNDRV_DMA_TYPE_DEV; #endif /* CONFIG_GENERIC_ALLOCATOR */ + /* fall through */ case SNDRV_DMA_TYPE_DEV: - dmab->area = snd_malloc_dev_pages(device, size, &dmab->addr); + case SNDRV_DMA_TYPE_DEV_UC: + snd_malloc_dev_pages(dmab, size); break; #endif #ifdef CONFIG_SND_DMA_SGBUF case SNDRV_DMA_TYPE_DEV_SG: + case SNDRV_DMA_TYPE_DEV_UC_SG: snd_malloc_sgbuf_pages(device, size, dmab, NULL); break; #endif @@ -275,11 +282,13 @@ void snd_dma_free_pages(struct snd_dma_buffer *dmab) break; #endif /* CONFIG_GENERIC_ALLOCATOR */ case SNDRV_DMA_TYPE_DEV: - snd_free_dev_pages(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr); + case SNDRV_DMA_TYPE_DEV_UC: + snd_free_dev_pages(dmab); break; #endif #ifdef CONFIG_SND_DMA_SGBUF case SNDRV_DMA_TYPE_DEV_SG: + case SNDRV_DMA_TYPE_DEV_UC_SG: snd_free_sgbuf_pages(dmab); break; #endif diff --git a/sound/core/oss/pcm_plugin.c b/sound/core/oss/pcm_plugin.c index 0391cb1a4f19..141c5f3a9575 100644 --- a/sound/core/oss/pcm_plugin.c +++ b/sound/core/oss/pcm_plugin.c @@ -111,7 +111,7 @@ int snd_pcm_plug_alloc(struct snd_pcm_substream *plug, snd_pcm_uframes_t frames) while (plugin->next) { if (plugin->dst_frames) frames = plugin->dst_frames(plugin, frames); - if (snd_BUG_ON(frames <= 0)) + if (snd_BUG_ON((snd_pcm_sframes_t)frames <= 0)) return -ENXIO; plugin = plugin->next; err = snd_pcm_plugin_alloc(plugin, frames); @@ -123,7 +123,7 @@ int snd_pcm_plug_alloc(struct snd_pcm_substream *plug, snd_pcm_uframes_t frames) while (plugin->prev) { if (plugin->src_frames) frames = plugin->src_frames(plugin, frames); - if (snd_BUG_ON(frames <= 0)) + if (snd_BUG_ON((snd_pcm_sframes_t)frames <= 0)) return -ENXIO; plugin = plugin->prev; err = snd_pcm_plugin_alloc(plugin, frames); diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index 4e6110d778bd..40013b26f671 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -2172,18 +2172,25 @@ snd_pcm_sframes_t __snd_pcm_lib_xfer(struct snd_pcm_substream *substream, if (err < 0) goto _end_unlock; + runtime->twake = runtime->control->avail_min ? : 1; + if (runtime->status->state == SNDRV_PCM_STATE_RUNNING) + snd_pcm_update_hw_ptr(substream); + if (!is_playback && - runtime->status->state == SNDRV_PCM_STATE_PREPARED && - size >= runtime->start_threshold) { - err = snd_pcm_start(substream); - if (err < 0) + runtime->status->state == SNDRV_PCM_STATE_PREPARED) { + if (size >= runtime->start_threshold) { + err = snd_pcm_start(substream); + if (err < 0) + goto _end_unlock; + } else { + /* nothing to do */ + err = 0; goto _end_unlock; + } } - runtime->twake = runtime->control->avail_min ? : 1; - if (runtime->status->state == SNDRV_PCM_STATE_RUNNING) - snd_pcm_update_hw_ptr(substream); avail = snd_pcm_avail(substream); + while (size > 0) { snd_pcm_uframes_t frames, appl_ptr, appl_ofs; snd_pcm_uframes_t cont; diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c index 08d5662039e3..ee601d7f0926 100644 --- a/sound/core/rawmidi.c +++ b/sound/core/rawmidi.c @@ -1236,6 +1236,28 @@ int snd_rawmidi_transmit(struct snd_rawmidi_substream *substream, } EXPORT_SYMBOL(snd_rawmidi_transmit); +/** + * snd_rawmidi_proceed - Discard the all pending bytes and proceed + * @substream: rawmidi substream + * + * Return: the number of discarded bytes + */ +int snd_rawmidi_proceed(struct snd_rawmidi_substream *substream) +{ + struct snd_rawmidi_runtime *runtime = substream->runtime; + unsigned long flags; + int count = 0; + + spin_lock_irqsave(&runtime->lock, flags); + if (runtime->avail < runtime->buffer_size) { + count = runtime->buffer_size - runtime->avail; + __snd_rawmidi_transmit_ack(substream, count); + } + spin_unlock_irqrestore(&runtime->lock, flags); + return count; +} +EXPORT_SYMBOL(snd_rawmidi_proceed); + static long snd_rawmidi_kernel_write1(struct snd_rawmidi_substream *substream, const unsigned char __user *userbuf, const unsigned char *kernelbuf, diff --git a/sound/core/seq/oss/seq_oss_timer.c b/sound/core/seq/oss/seq_oss_timer.c index ba127c22539a..0778d28421da 100644 --- a/sound/core/seq/oss/seq_oss_timer.c +++ b/sound/core/seq/oss/seq_oss_timer.c @@ -92,7 +92,7 @@ snd_seq_oss_process_timer_event(struct seq_oss_timer *rec, union evrec *ev) case TMR_WAIT_REL: parm += rec->cur_tick; rec->realtime = 0; - /* fall through and continue to next */ + /* fall through */ case TMR_WAIT_ABS: if (parm == 0) { rec->realtime = 1; diff --git a/sound/core/seq/seq_system.c b/sound/core/seq/seq_system.c index 8ce1d0b40dce..0dc5d5a45ecc 100644 --- a/sound/core/seq/seq_system.c +++ b/sound/core/seq/seq_system.c @@ -123,6 +123,7 @@ int __init snd_seq_system_client_init(void) { struct snd_seq_port_callback pcallbacks; struct snd_seq_port_info *port; + int err; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) @@ -134,6 +135,10 @@ int __init snd_seq_system_client_init(void) /* register client */ sysclient = snd_seq_create_kernel_client(NULL, 0, "System"); + if (sysclient < 0) { + kfree(port); + return sysclient; + } /* register timer */ strcpy(port->name, "Timer"); @@ -144,7 +149,10 @@ int __init snd_seq_system_client_init(void) port->flags = SNDRV_SEQ_PORT_FLG_GIVEN_PORT; port->addr.client = sysclient; port->addr.port = SNDRV_SEQ_PORT_SYSTEM_TIMER; - snd_seq_kernel_client_ctl(sysclient, SNDRV_SEQ_IOCTL_CREATE_PORT, port); + err = snd_seq_kernel_client_ctl(sysclient, SNDRV_SEQ_IOCTL_CREATE_PORT, + port); + if (err < 0) + goto error_port; /* register announcement port */ strcpy(port->name, "Announce"); @@ -154,16 +162,24 @@ int __init snd_seq_system_client_init(void) port->flags = SNDRV_SEQ_PORT_FLG_GIVEN_PORT; port->addr.client = sysclient; port->addr.port = SNDRV_SEQ_PORT_SYSTEM_ANNOUNCE; - snd_seq_kernel_client_ctl(sysclient, SNDRV_SEQ_IOCTL_CREATE_PORT, port); + err = snd_seq_kernel_client_ctl(sysclient, SNDRV_SEQ_IOCTL_CREATE_PORT, + port); + if (err < 0) + goto error_port; announce_port = port->addr.port; kfree(port); return 0; + + error_port: + snd_seq_system_client_done(); + kfree(port); + return err; } /* unregister our internal client */ -void __exit snd_seq_system_client_done(void) +void snd_seq_system_client_done(void) { int oldsysclient = sysclient; diff --git a/sound/core/seq/seq_virmidi.c b/sound/core/seq/seq_virmidi.c index cb988efd1ed0..e5a40795914a 100644 --- a/sound/core/seq/seq_virmidi.c +++ b/sound/core/seq/seq_virmidi.c @@ -149,9 +149,7 @@ static void snd_vmidi_output_work(struct work_struct *work) /* discard the outputs in dispatch mode unless subscribed */ if (vmidi->seq_mode == SNDRV_VIRMIDI_SEQ_DISPATCH && !(vmidi->rdev->flags & SNDRV_VIRMIDI_SUBSCRIBE)) { - char buf[32]; - while (snd_rawmidi_transmit(substream, buf, sizeof(buf)) > 0) - ; /* ignored */ + snd_rawmidi_proceed(substream); return; } diff --git a/sound/core/sgbuf.c b/sound/core/sgbuf.c index 84fffabdd129..c1cfaa01a5cb 100644 --- a/sound/core/sgbuf.c +++ b/sound/core/sgbuf.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -43,6 +44,8 @@ int snd_free_sgbuf_pages(struct snd_dma_buffer *dmab) dmab->area = NULL; tmpb.dev.type = SNDRV_DMA_TYPE_DEV; + if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_UC_SG) + tmpb.dev.type = SNDRV_DMA_TYPE_DEV_UC; tmpb.dev.dev = sgbuf->dev; for (i = 0; i < sgbuf->pages; i++) { if (!(sgbuf->table[i].addr & ~PAGE_MASK)) @@ -72,12 +75,20 @@ void *snd_malloc_sgbuf_pages(struct device *device, struct snd_dma_buffer tmpb; struct snd_sg_page *table; struct page **pgtable; + int type = SNDRV_DMA_TYPE_DEV; + pgprot_t prot = PAGE_KERNEL; dmab->area = NULL; dmab->addr = 0; dmab->private_data = sgbuf = kzalloc(sizeof(*sgbuf), GFP_KERNEL); if (! sgbuf) return NULL; + if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_UC_SG) { + type = SNDRV_DMA_TYPE_DEV_UC; +#ifdef pgprot_noncached + prot = pgprot_noncached(PAGE_KERNEL); +#endif + } sgbuf->dev = device; pages = snd_sgbuf_aligned_pages(size); sgbuf->tblsize = sgbuf_align_table(pages); @@ -98,7 +109,7 @@ void *snd_malloc_sgbuf_pages(struct device *device, if (chunk > maxpages) chunk = maxpages; chunk <<= PAGE_SHIFT; - if (snd_dma_alloc_pages_fallback(SNDRV_DMA_TYPE_DEV, device, + if (snd_dma_alloc_pages_fallback(type, device, chunk, &tmpb) < 0) { if (!sgbuf->pages) goto _failed; @@ -125,7 +136,7 @@ void *snd_malloc_sgbuf_pages(struct device *device, } sgbuf->size = size; - dmab->area = vmap(sgbuf->page_table, sgbuf->pages, VM_MAP, PAGE_KERNEL); + dmab->area = vmap(sgbuf->page_table, sgbuf->pages, VM_MAP, prot); if (! dmab->area) goto _failed; if (res_size) diff --git a/sound/firewire/Kconfig b/sound/firewire/Kconfig index 529d9f405fa9..8a146b039276 100644 --- a/sound/firewire/Kconfig +++ b/sound/firewire/Kconfig @@ -147,7 +147,9 @@ config SND_FIREWIRE_MOTU help Say Y here to enable support for FireWire devices which MOTU produced: * 828mk2 + * Traveler * 828mk3 + * Audio Express To compile this driver as a module, choose M here: the module will be called snd-firewire-motu. diff --git a/sound/firewire/amdtp-stream.c b/sound/firewire/amdtp-stream.c index cb9acfe60f6a..9be76c808fcc 100644 --- a/sound/firewire/amdtp-stream.c +++ b/sound/firewire/amdtp-stream.c @@ -140,6 +140,28 @@ const unsigned int amdtp_rate_table[CIP_SFC_COUNT] = { }; EXPORT_SYMBOL(amdtp_rate_table); +static int apply_constraint_to_size(struct snd_pcm_hw_params *params, + struct snd_pcm_hw_rule *rule) +{ + struct snd_interval *s = hw_param_interval(params, rule->var); + const struct snd_interval *r = + hw_param_interval_c(params, SNDRV_PCM_HW_PARAM_RATE); + struct snd_interval t = {0}; + unsigned int step = 0; + int i; + + for (i = 0; i < CIP_SFC_COUNT; ++i) { + if (snd_interval_test(r, amdtp_rate_table[i])) + step = max(step, amdtp_syt_intervals[i]); + } + + t.min = roundup(s->min, step); + t.max = rounddown(s->max, step); + t.integer = 1; + + return snd_interval_refine(s, &t); +} + /** * amdtp_stream_add_pcm_hw_constraints - add hw constraints for PCM substream * @s: the AMDTP stream, which must be initialized. @@ -194,16 +216,19 @@ int amdtp_stream_add_pcm_hw_constraints(struct amdtp_stream *s, * number equals to SYT_INTERVAL. So the number is 8, 16 or 32, * depending on its sampling rate. For accurate period interrupt, it's * preferrable to align period/buffer sizes to current SYT_INTERVAL. - * - * TODO: These constraints can be improved with proper rules. - * Currently apply LCM of SYT_INTERVALs. */ - err = snd_pcm_hw_constraint_step(runtime, 0, - SNDRV_PCM_HW_PARAM_PERIOD_SIZE, 32); + err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, + apply_constraint_to_size, NULL, + SNDRV_PCM_HW_PARAM_PERIOD_SIZE, + SNDRV_PCM_HW_PARAM_RATE, -1); + if (err < 0) + goto end; + err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, + apply_constraint_to_size, NULL, + SNDRV_PCM_HW_PARAM_BUFFER_SIZE, + SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) goto end; - err = snd_pcm_hw_constraint_step(runtime, 0, - SNDRV_PCM_HW_PARAM_BUFFER_SIZE, 32); end: return err; } diff --git a/sound/firewire/bebob/bebob.c b/sound/firewire/bebob/bebob.c index 93676354f87f..672d13488454 100644 --- a/sound/firewire/bebob/bebob.c +++ b/sound/firewire/bebob/bebob.c @@ -126,23 +126,6 @@ end: return err; } -static void bebob_free(struct snd_bebob *bebob) -{ - snd_bebob_stream_destroy_duplex(bebob); - fw_unit_put(bebob->unit); - - kfree(bebob->maudio_special_quirk); - - mutex_destroy(&bebob->mutex); - kfree(bebob); -} - -/* - * This module releases the FireWire unit data after all ALSA character devices - * are released by applications. This is for releasing stream data or finishing - * transactions safely. Thus at returning from .remove(), this module still keep - * references for the unit. - */ static void bebob_card_free(struct snd_card *card) { @@ -152,7 +135,7 @@ bebob_card_free(struct snd_card *card) clear_bit(bebob->card_index, devices_used); mutex_unlock(&devices_mutex); - bebob_free(card->private_data); + snd_bebob_stream_destroy_duplex(bebob); } static const struct snd_bebob_spec * @@ -192,7 +175,6 @@ do_registration(struct work_struct *work) return; mutex_lock(&devices_mutex); - for (card_index = 0; card_index < SNDRV_CARDS; card_index++) { if (!test_bit(card_index, devices_used) && enable[card_index]) break; @@ -208,6 +190,11 @@ do_registration(struct work_struct *work) mutex_unlock(&devices_mutex); return; } + set_bit(card_index, devices_used); + mutex_unlock(&devices_mutex); + + bebob->card->private_free = bebob_card_free; + bebob->card->private_data = bebob; err = name_device(bebob); if (err < 0) @@ -248,23 +235,10 @@ do_registration(struct work_struct *work) if (err < 0) goto error; - set_bit(card_index, devices_used); - mutex_unlock(&devices_mutex); - - /* - * After registered, bebob instance can be released corresponding to - * releasing the sound card instance. - */ - bebob->card->private_free = bebob_card_free; - bebob->card->private_data = bebob; bebob->registered = true; return; error: - mutex_unlock(&devices_mutex); - snd_bebob_stream_destroy_duplex(bebob); - kfree(bebob->maudio_special_quirk); - bebob->maudio_special_quirk = NULL; snd_card_free(bebob->card); dev_info(&bebob->unit->device, "Sound card registration failed: %d\n", err); @@ -295,15 +269,15 @@ bebob_probe(struct fw_unit *unit, const struct ieee1394_device_id *entry) } /* Allocate this independent of sound card instance. */ - bebob = kzalloc(sizeof(struct snd_bebob), GFP_KERNEL); - if (bebob == NULL) + bebob = devm_kzalloc(&unit->device, sizeof(struct snd_bebob), + GFP_KERNEL); + if (!bebob) return -ENOMEM; - bebob->unit = fw_unit_get(unit); - bebob->entry = entry; - bebob->spec = spec; dev_set_drvdata(&unit->device, bebob); + bebob->entry = entry; + bebob->spec = spec; mutex_init(&bebob->mutex); spin_lock_init(&bebob->lock); init_waitqueue_head(&bebob->hwdep_wait); @@ -379,12 +353,12 @@ static void bebob_remove(struct fw_unit *unit) cancel_delayed_work_sync(&bebob->dwork); if (bebob->registered) { - /* No need to wait for releasing card object in this context. */ - snd_card_free_when_closed(bebob->card); - } else { - /* Don't forget this case. */ - bebob_free(bebob); + // Block till all of ALSA character devices are released. + snd_card_free(bebob->card); } + + mutex_destroy(&bebob->mutex); + fw_unit_put(bebob->unit); } static const struct snd_bebob_rate_spec normal_rate_spec = { diff --git a/sound/firewire/bebob/bebob_maudio.c b/sound/firewire/bebob/bebob_maudio.c index c266997ad299..51152ca4af57 100644 --- a/sound/firewire/bebob/bebob_maudio.c +++ b/sound/firewire/bebob/bebob_maudio.c @@ -261,8 +261,9 @@ snd_bebob_maudio_special_discover(struct snd_bebob *bebob, bool is1814) struct special_params *params; int err; - params = kzalloc(sizeof(struct special_params), GFP_KERNEL); - if (params == NULL) + params = devm_kzalloc(&bebob->card->card_dev, + sizeof(struct special_params), GFP_KERNEL); + if (!params) return -ENOMEM; mutex_lock(&bebob->mutex); diff --git a/sound/firewire/dice/dice.c b/sound/firewire/dice/dice.c index 774eb2205668..ed50b222d36e 100644 --- a/sound/firewire/dice/dice.c +++ b/sound/firewire/dice/dice.c @@ -122,25 +122,12 @@ static void dice_card_strings(struct snd_dice *dice) strcpy(card->mixername, "DICE"); } -static void dice_free(struct snd_dice *dice) +static void dice_card_free(struct snd_card *card) { + struct snd_dice *dice = card->private_data; + snd_dice_stream_destroy_duplex(dice); snd_dice_transaction_destroy(dice); - fw_unit_put(dice->unit); - - mutex_destroy(&dice->mutex); - kfree(dice); -} - -/* - * This module releases the FireWire unit data after all ALSA character devices - * are released by applications. This is for releasing stream data or finishing - * transactions safely. Thus at returning from .remove(), this module still keep - * references for the unit. - */ -static void dice_card_free(struct snd_card *card) -{ - dice_free(card->private_data); } static void do_registration(struct work_struct *work) @@ -155,6 +142,8 @@ static void do_registration(struct work_struct *work) &dice->card); if (err < 0) return; + dice->card->private_free = dice_card_free; + dice->card->private_data = dice; err = snd_dice_transaction_init(dice); if (err < 0) @@ -192,19 +181,10 @@ static void do_registration(struct work_struct *work) if (err < 0) goto error; - /* - * After registered, dice instance can be released corresponding to - * releasing the sound card instance. - */ - dice->card->private_free = dice_card_free; - dice->card->private_data = dice; dice->registered = true; return; error: - snd_dice_stream_destroy_duplex(dice); - snd_dice_transaction_destroy(dice); - snd_dice_stream_destroy_duplex(dice); snd_card_free(dice->card); dev_info(&dice->unit->device, "Sound card registration failed: %d\n", err); @@ -223,10 +203,9 @@ static int dice_probe(struct fw_unit *unit, } /* Allocate this independent of sound card instance. */ - dice = kzalloc(sizeof(struct snd_dice), GFP_KERNEL); - if (dice == NULL) + dice = devm_kzalloc(&unit->device, sizeof(struct snd_dice), GFP_KERNEL); + if (!dice) return -ENOMEM; - dice->unit = fw_unit_get(unit); dev_set_drvdata(&unit->device, dice); @@ -261,12 +240,12 @@ static void dice_remove(struct fw_unit *unit) cancel_delayed_work_sync(&dice->dwork); if (dice->registered) { - /* No need to wait for releasing card object in this context. */ - snd_card_free_when_closed(dice->card); - } else { - /* Don't forget this case. */ - dice_free(dice); + // Block till all of ALSA character devices are released. + snd_card_free(dice->card); } + + mutex_destroy(&dice->mutex); + fw_unit_put(dice->unit); } static void dice_bus_reset(struct fw_unit *unit) diff --git a/sound/firewire/digi00x/digi00x.c b/sound/firewire/digi00x/digi00x.c index ef689997d6a5..6c6ea149ef6b 100644 --- a/sound/firewire/digi00x/digi00x.c +++ b/sound/firewire/digi00x/digi00x.c @@ -41,20 +41,12 @@ static int name_card(struct snd_dg00x *dg00x) return 0; } -static void dg00x_free(struct snd_dg00x *dg00x) +static void dg00x_card_free(struct snd_card *card) { + struct snd_dg00x *dg00x = card->private_data; + snd_dg00x_stream_destroy_duplex(dg00x); snd_dg00x_transaction_unregister(dg00x); - - fw_unit_put(dg00x->unit); - - mutex_destroy(&dg00x->mutex); - kfree(dg00x); -} - -static void dg00x_card_free(struct snd_card *card) -{ - dg00x_free(card->private_data); } static void do_registration(struct work_struct *work) @@ -70,6 +62,8 @@ static void do_registration(struct work_struct *work) &dg00x->card); if (err < 0) return; + dg00x->card->private_free = dg00x_card_free; + dg00x->card->private_data = dg00x; err = name_card(dg00x); if (err < 0) @@ -101,14 +95,10 @@ static void do_registration(struct work_struct *work) if (err < 0) goto error; - dg00x->card->private_free = dg00x_card_free; - dg00x->card->private_data = dg00x; dg00x->registered = true; return; error: - snd_dg00x_transaction_unregister(dg00x); - snd_dg00x_stream_destroy_duplex(dg00x); snd_card_free(dg00x->card); dev_info(&dg00x->unit->device, "Sound card registration failed: %d\n", err); @@ -120,8 +110,9 @@ static int snd_dg00x_probe(struct fw_unit *unit, struct snd_dg00x *dg00x; /* Allocate this independent of sound card instance. */ - dg00x = kzalloc(sizeof(struct snd_dg00x), GFP_KERNEL); - if (dg00x == NULL) + dg00x = devm_kzalloc(&unit->device, sizeof(struct snd_dg00x), + GFP_KERNEL); + if (!dg00x) return -ENOMEM; dg00x->unit = fw_unit_get(unit); @@ -173,12 +164,12 @@ static void snd_dg00x_remove(struct fw_unit *unit) cancel_delayed_work_sync(&dg00x->dwork); if (dg00x->registered) { - /* No need to wait for releasing card object in this context. */ - snd_card_free_when_closed(dg00x->card); - } else { - /* Don't forget this case. */ - dg00x_free(dg00x); + // Block till all of ALSA character devices are released. + snd_card_free(dg00x->card); } + + mutex_destroy(&dg00x->mutex); + fw_unit_put(dg00x->unit); } static const struct ieee1394_device_id snd_dg00x_id_table[] = { diff --git a/sound/firewire/fireface/ff.c b/sound/firewire/fireface/ff.c index 4974bc7980e9..3f61cfeace69 100644 --- a/sound/firewire/fireface/ff.c +++ b/sound/firewire/fireface/ff.c @@ -27,20 +27,12 @@ static void name_card(struct snd_ff *ff) dev_name(&ff->unit->device), 100 << fw_dev->max_speed); } -static void ff_free(struct snd_ff *ff) +static void ff_card_free(struct snd_card *card) { + struct snd_ff *ff = card->private_data; + snd_ff_stream_destroy_duplex(ff); snd_ff_transaction_unregister(ff); - - fw_unit_put(ff->unit); - - mutex_destroy(&ff->mutex); - kfree(ff); -} - -static void ff_card_free(struct snd_card *card) -{ - ff_free(card->private_data); } static void do_registration(struct work_struct *work) @@ -55,6 +47,8 @@ static void do_registration(struct work_struct *work) &ff->card); if (err < 0) return; + ff->card->private_free = ff_card_free; + ff->card->private_data = ff; err = snd_ff_transaction_register(ff); if (err < 0) @@ -84,14 +78,10 @@ static void do_registration(struct work_struct *work) if (err < 0) goto error; - ff->card->private_free = ff_card_free; - ff->card->private_data = ff; ff->registered = true; return; error: - snd_ff_transaction_unregister(ff); - snd_ff_stream_destroy_duplex(ff); snd_card_free(ff->card); dev_info(&ff->unit->device, "Sound card registration failed: %d\n", err); @@ -102,11 +92,9 @@ static int snd_ff_probe(struct fw_unit *unit, { struct snd_ff *ff; - ff = kzalloc(sizeof(struct snd_ff), GFP_KERNEL); - if (ff == NULL) + ff = devm_kzalloc(&unit->device, sizeof(struct snd_ff), GFP_KERNEL); + if (!ff) return -ENOMEM; - - /* initialize myself */ ff->unit = fw_unit_get(unit); dev_set_drvdata(&unit->device, ff); @@ -149,12 +137,12 @@ static void snd_ff_remove(struct fw_unit *unit) cancel_work_sync(&ff->dwork.work); if (ff->registered) { - /* No need to wait for releasing card object in this context. */ - snd_card_free_when_closed(ff->card); - } else { - /* Don't forget this case. */ - ff_free(ff); + // Block till all of ALSA character devices are released. + snd_card_free(ff->card); } + + mutex_destroy(&ff->mutex); + fw_unit_put(ff->unit); } static const struct snd_ff_spec spec_ff400 = { diff --git a/sound/firewire/fireworks/fireworks.c b/sound/firewire/fireworks/fireworks.c index f2d073365cf6..faf0e001c4c5 100644 --- a/sound/firewire/fireworks/fireworks.c +++ b/sound/firewire/fireworks/fireworks.c @@ -184,36 +184,17 @@ end: return err; } -static void efw_free(struct snd_efw *efw) -{ - snd_efw_stream_destroy_duplex(efw); - snd_efw_transaction_remove_instance(efw); - fw_unit_put(efw->unit); - - kfree(efw->resp_buf); - - mutex_destroy(&efw->mutex); - kfree(efw); -} - -/* - * This module releases the FireWire unit data after all ALSA character devices - * are released by applications. This is for releasing stream data or finishing - * transactions safely. Thus at returning from .remove(), this module still keep - * references for the unit. - */ static void efw_card_free(struct snd_card *card) { struct snd_efw *efw = card->private_data; - if (efw->card_index >= 0) { - mutex_lock(&devices_mutex); - clear_bit(efw->card_index, devices_used); - mutex_unlock(&devices_mutex); - } + mutex_lock(&devices_mutex); + clear_bit(efw->card_index, devices_used); + mutex_unlock(&devices_mutex); - efw_free(card->private_data); + snd_efw_stream_destroy_duplex(efw); + snd_efw_transaction_remove_instance(efw); } static void @@ -226,9 +207,8 @@ do_registration(struct work_struct *work) if (efw->registered) return; - mutex_lock(&devices_mutex); - /* check registered cards */ + mutex_lock(&devices_mutex); for (card_index = 0; card_index < SNDRV_CARDS; ++card_index) { if (!test_bit(card_index, devices_used) && enable[card_index]) break; @@ -244,12 +224,18 @@ do_registration(struct work_struct *work) mutex_unlock(&devices_mutex); return; } + set_bit(card_index, devices_used); + mutex_unlock(&devices_mutex); + + efw->card->private_free = efw_card_free; + efw->card->private_data = efw; /* prepare response buffer */ snd_efw_resp_buf_size = clamp(snd_efw_resp_buf_size, SND_EFW_RESPONSE_MAXIMUM_BYTES, 4096U); - efw->resp_buf = kzalloc(snd_efw_resp_buf_size, GFP_KERNEL); - if (efw->resp_buf == NULL) { + efw->resp_buf = devm_kzalloc(&efw->card->card_dev, + snd_efw_resp_buf_size, GFP_KERNEL); + if (!efw->resp_buf) { err = -ENOMEM; goto error; } @@ -284,25 +270,11 @@ do_registration(struct work_struct *work) if (err < 0) goto error; - set_bit(card_index, devices_used); - mutex_unlock(&devices_mutex); - - /* - * After registered, efw instance can be released corresponding to - * releasing the sound card instance. - */ - efw->card->private_free = efw_card_free; - efw->card->private_data = efw; efw->registered = true; return; error: - mutex_unlock(&devices_mutex); - snd_efw_transaction_remove_instance(efw); - snd_efw_stream_destroy_duplex(efw); snd_card_free(efw->card); - kfree(efw->resp_buf); - efw->resp_buf = NULL; dev_info(&efw->unit->device, "Sound card registration failed: %d\n", err); } @@ -312,10 +284,9 @@ efw_probe(struct fw_unit *unit, const struct ieee1394_device_id *entry) { struct snd_efw *efw; - efw = kzalloc(sizeof(struct snd_efw), GFP_KERNEL); + efw = devm_kzalloc(&unit->device, sizeof(struct snd_efw), GFP_KERNEL); if (efw == NULL) return -ENOMEM; - efw->unit = fw_unit_get(unit); dev_set_drvdata(&unit->device, efw); @@ -363,12 +334,12 @@ static void efw_remove(struct fw_unit *unit) cancel_delayed_work_sync(&efw->dwork); if (efw->registered) { - /* No need to wait for releasing card object in this context. */ - snd_card_free_when_closed(efw->card); - } else { - /* Don't forget this case. */ - efw_free(efw); + // Block till all of ALSA character devices are released. + snd_card_free(efw->card); } + + mutex_destroy(&efw->mutex); + fw_unit_put(efw->unit); } static const struct ieee1394_device_id efw_id_table[] = { diff --git a/sound/firewire/isight.c b/sound/firewire/isight.c index 30957477e005..9ebe510ea26b 100644 --- a/sound/firewire/isight.c +++ b/sound/firewire/isight.c @@ -602,8 +602,6 @@ static void isight_card_free(struct snd_card *card) struct isight *isight = card->private_data; fw_iso_resources_destroy(&isight->resources); - fw_unit_put(isight->unit); - mutex_destroy(&isight->mutex); } static u64 get_unit_base(struct fw_unit *unit) @@ -640,7 +638,7 @@ static int isight_probe(struct fw_unit *unit, if (!isight->audio_base) { dev_err(&unit->device, "audio unit base not found\n"); err = -ENXIO; - goto err_unit; + goto error; } fw_iso_resources_init(&isight->resources, unit); @@ -669,12 +667,12 @@ static int isight_probe(struct fw_unit *unit, dev_set_drvdata(&unit->device, isight); return 0; - -err_unit: - fw_unit_put(isight->unit); - mutex_destroy(&isight->mutex); error: snd_card_free(card); + + mutex_destroy(&isight->mutex); + fw_unit_put(isight->unit); + return err; } @@ -703,7 +701,11 @@ static void isight_remove(struct fw_unit *unit) isight_stop_streaming(isight); mutex_unlock(&isight->mutex); - snd_card_free_when_closed(isight->card); + // Block till all of ALSA character devices are released. + snd_card_free(isight->card); + + mutex_destroy(&isight->mutex); + fw_unit_put(isight->unit); } static const struct ieee1394_device_id isight_id_table[] = { diff --git a/sound/firewire/motu/motu.c b/sound/firewire/motu/motu.c index 300d31b6f191..220e61926ea4 100644 --- a/sound/firewire/motu/motu.c +++ b/sound/firewire/motu/motu.c @@ -52,26 +52,12 @@ static void name_card(struct snd_motu *motu) dev_name(&motu->unit->device), 100 << fw_dev->max_speed); } -static void motu_free(struct snd_motu *motu) +static void motu_card_free(struct snd_card *card) { - snd_motu_transaction_unregister(motu); + struct snd_motu *motu = card->private_data; + snd_motu_transaction_unregister(motu); snd_motu_stream_destroy_duplex(motu); - fw_unit_put(motu->unit); - - mutex_destroy(&motu->mutex); - kfree(motu); -} - -/* - * This module releases the FireWire unit data after all ALSA character devices - * are released by applications. This is for releasing stream data or finishing - * transactions safely. Thus at returning from .remove(), this module still keep - * references for the unit. - */ -static void motu_card_free(struct snd_card *card) -{ - motu_free(card->private_data); } static void do_registration(struct work_struct *work) @@ -86,6 +72,8 @@ static void do_registration(struct work_struct *work) &motu->card); if (err < 0) return; + motu->card->private_free = motu_card_free; + motu->card->private_data = motu; name_card(motu); @@ -120,18 +108,10 @@ static void do_registration(struct work_struct *work) if (err < 0) goto error; - /* - * After registered, motu instance can be released corresponding to - * releasing the sound card instance. - */ - motu->card->private_free = motu_card_free; - motu->card->private_data = motu; motu->registered = true; return; error: - snd_motu_transaction_unregister(motu); - snd_motu_stream_destroy_duplex(motu); snd_card_free(motu->card); dev_info(&motu->unit->device, "Sound card registration failed: %d\n", err); @@ -143,14 +123,13 @@ static int motu_probe(struct fw_unit *unit, struct snd_motu *motu; /* Allocate this independently of sound card instance. */ - motu = kzalloc(sizeof(struct snd_motu), GFP_KERNEL); - if (motu == NULL) + motu = devm_kzalloc(&unit->device, sizeof(struct snd_motu), GFP_KERNEL); + if (!motu) return -ENOMEM; - - motu->spec = (const struct snd_motu_spec *)entry->driver_data; motu->unit = fw_unit_get(unit); dev_set_drvdata(&unit->device, motu); + motu->spec = (const struct snd_motu_spec *)entry->driver_data; mutex_init(&motu->mutex); spin_lock_init(&motu->lock); init_waitqueue_head(&motu->hwdep_wait); @@ -174,12 +153,12 @@ static void motu_remove(struct fw_unit *unit) cancel_delayed_work_sync(&motu->dwork); if (motu->registered) { - /* No need to wait for releasing card object in this context. */ - snd_card_free_when_closed(motu->card); - } else { - /* Don't forget this case. */ - motu_free(motu); + // Block till all of ALSA character devices are released. + snd_card_free(motu->card); } + + mutex_destroy(&motu->mutex); + fw_unit_put(motu->unit); } static void motu_bus_update(struct fw_unit *unit) diff --git a/sound/firewire/oxfw/oxfw-scs1x.c b/sound/firewire/oxfw/oxfw-scs1x.c index f33497cdc706..9d9545880a28 100644 --- a/sound/firewire/oxfw/oxfw-scs1x.c +++ b/sound/firewire/oxfw/oxfw-scs1x.c @@ -372,8 +372,9 @@ int snd_oxfw_scs1x_add(struct snd_oxfw *oxfw) struct fw_scs1x *scs; int err; - scs = kzalloc(sizeof(struct fw_scs1x), GFP_KERNEL); - if (scs == NULL) + scs = devm_kzalloc(&oxfw->card->card_dev, sizeof(struct fw_scs1x), + GFP_KERNEL); + if (!scs) return -ENOMEM; scs->fw_dev = fw_parent_device(oxfw->unit); oxfw->spec = scs; diff --git a/sound/firewire/oxfw/oxfw-spkr.c b/sound/firewire/oxfw/oxfw-spkr.c index cb905af0660d..66d4b1f73f0f 100644 --- a/sound/firewire/oxfw/oxfw-spkr.c +++ b/sound/firewire/oxfw/oxfw-spkr.c @@ -270,8 +270,9 @@ int snd_oxfw_add_spkr(struct snd_oxfw *oxfw, bool is_lacie) unsigned int i, first_ch; int err; - spkr = kzalloc(sizeof(struct fw_spkr), GFP_KERNEL); - if (spkr == NULL) + spkr = devm_kzalloc(&oxfw->card->card_dev, sizeof(struct fw_spkr), + GFP_KERNEL); + if (!spkr) return -ENOMEM; oxfw->spec = spkr; diff --git a/sound/firewire/oxfw/oxfw-stream.c b/sound/firewire/oxfw/oxfw-stream.c index d9361f352133..f230a9e44c3c 100644 --- a/sound/firewire/oxfw/oxfw-stream.c +++ b/sound/firewire/oxfw/oxfw-stream.c @@ -517,8 +517,9 @@ assume_stream_formats(struct snd_oxfw *oxfw, enum avc_general_plug_dir dir, if (err < 0) goto end; - formats[eid] = kmemdup(buf, *len, GFP_KERNEL); - if (formats[eid] == NULL) { + formats[eid] = devm_kmemdup(&oxfw->card->card_dev, buf, *len, + GFP_KERNEL); + if (!formats[eid]) { err = -ENOMEM; goto end; } @@ -535,7 +536,8 @@ assume_stream_formats(struct snd_oxfw *oxfw, enum avc_general_plug_dir dir, continue; eid++; - formats[eid] = kmemdup(buf, *len, GFP_KERNEL); + formats[eid] = devm_kmemdup(&oxfw->card->card_dev, buf, *len, + GFP_KERNEL); if (formats[eid] == NULL) { err = -ENOMEM; goto end; @@ -597,8 +599,9 @@ static int fill_stream_formats(struct snd_oxfw *oxfw, if (err < 0) break; - formats[eid] = kmemdup(buf, len, GFP_KERNEL); - if (formats[eid] == NULL) { + formats[eid] = devm_kmemdup(&oxfw->card->card_dev, buf, len, + GFP_KERNEL); + if (!formats[eid]) { err = -ENOMEM; break; } diff --git a/sound/firewire/oxfw/oxfw.c b/sound/firewire/oxfw/oxfw.c index 2ea8be6c8584..afb78d90384b 100644 --- a/sound/firewire/oxfw/oxfw.c +++ b/sound/firewire/oxfw/oxfw.c @@ -113,35 +113,13 @@ end: return err; } -static void oxfw_free(struct snd_oxfw *oxfw) +static void oxfw_card_free(struct snd_card *card) { - unsigned int i; + struct snd_oxfw *oxfw = card->private_data; snd_oxfw_stream_destroy_simplex(oxfw, &oxfw->rx_stream); if (oxfw->has_output) snd_oxfw_stream_destroy_simplex(oxfw, &oxfw->tx_stream); - - fw_unit_put(oxfw->unit); - - for (i = 0; i < SND_OXFW_STREAM_FORMAT_ENTRIES; i++) { - kfree(oxfw->tx_stream_formats[i]); - kfree(oxfw->rx_stream_formats[i]); - } - - kfree(oxfw->spec); - mutex_destroy(&oxfw->mutex); - kfree(oxfw); -} - -/* - * This module releases the FireWire unit data after all ALSA character devices - * are released by applications. This is for releasing stream data or finishing - * transactions safely. Thus at returning from .remove(), this module still keep - * references for the unit. - */ -static void oxfw_card_free(struct snd_card *card) -{ - oxfw_free(card->private_data); } static int detect_quirks(struct snd_oxfw *oxfw) @@ -208,7 +186,6 @@ static int detect_quirks(struct snd_oxfw *oxfw) static void do_registration(struct work_struct *work) { struct snd_oxfw *oxfw = container_of(work, struct snd_oxfw, dwork.work); - int i; int err; if (oxfw->registered) @@ -218,6 +195,8 @@ static void do_registration(struct work_struct *work) &oxfw->card); if (err < 0) return; + oxfw->card->private_free = oxfw_card_free; + oxfw->card->private_data = oxfw; err = name_card(oxfw); if (err < 0) @@ -258,28 +237,11 @@ static void do_registration(struct work_struct *work) if (err < 0) goto error; - /* - * After registered, oxfw instance can be released corresponding to - * releasing the sound card instance. - */ - oxfw->card->private_free = oxfw_card_free; - oxfw->card->private_data = oxfw; oxfw->registered = true; return; error: - snd_oxfw_stream_destroy_simplex(oxfw, &oxfw->rx_stream); - if (oxfw->has_output) - snd_oxfw_stream_destroy_simplex(oxfw, &oxfw->tx_stream); - for (i = 0; i < SND_OXFW_STREAM_FORMAT_ENTRIES; ++i) { - kfree(oxfw->tx_stream_formats[i]); - oxfw->tx_stream_formats[i] = NULL; - kfree(oxfw->rx_stream_formats[i]); - oxfw->rx_stream_formats[i] = NULL; - } snd_card_free(oxfw->card); - kfree(oxfw->spec); - oxfw->spec = NULL; dev_info(&oxfw->unit->device, "Sound card registration failed: %d\n", err); } @@ -293,14 +255,13 @@ static int oxfw_probe(struct fw_unit *unit, return -ENODEV; /* Allocate this independent of sound card instance. */ - oxfw = kzalloc(sizeof(struct snd_oxfw), GFP_KERNEL); - if (oxfw == NULL) + oxfw = devm_kzalloc(&unit->device, sizeof(struct snd_oxfw), GFP_KERNEL); + if (!oxfw) return -ENOMEM; - - oxfw->entry = entry; oxfw->unit = fw_unit_get(unit); dev_set_drvdata(&unit->device, oxfw); + oxfw->entry = entry; mutex_init(&oxfw->mutex); spin_lock_init(&oxfw->lock); init_waitqueue_head(&oxfw->hwdep_wait); @@ -347,12 +308,12 @@ static void oxfw_remove(struct fw_unit *unit) cancel_delayed_work_sync(&oxfw->dwork); if (oxfw->registered) { - /* No need to wait for releasing card object in this context. */ - snd_card_free_when_closed(oxfw->card); - } else { - /* Don't forget this case. */ - oxfw_free(oxfw); + // Block till all of ALSA character devices are released. + snd_card_free(oxfw->card); } + + mutex_destroy(&oxfw->mutex); + fw_unit_put(oxfw->unit); } static const struct compat_info griffin_firewave = { diff --git a/sound/firewire/tascam/tascam.c b/sound/firewire/tascam/tascam.c index d3fdc463a884..ef57fa4db323 100644 --- a/sound/firewire/tascam/tascam.c +++ b/sound/firewire/tascam/tascam.c @@ -85,20 +85,12 @@ static int identify_model(struct snd_tscm *tscm) return 0; } -static void tscm_free(struct snd_tscm *tscm) +static void tscm_card_free(struct snd_card *card) { + struct snd_tscm *tscm = card->private_data; + snd_tscm_transaction_unregister(tscm); snd_tscm_stream_destroy_duplex(tscm); - - fw_unit_put(tscm->unit); - - mutex_destroy(&tscm->mutex); - kfree(tscm); -} - -static void tscm_card_free(struct snd_card *card) -{ - tscm_free(card->private_data); } static void do_registration(struct work_struct *work) @@ -110,6 +102,8 @@ static void do_registration(struct work_struct *work) &tscm->card); if (err < 0) return; + tscm->card->private_free = tscm_card_free; + tscm->card->private_data = tscm; err = identify_model(tscm); if (err < 0) @@ -141,18 +135,10 @@ static void do_registration(struct work_struct *work) if (err < 0) goto error; - /* - * After registered, tscm instance can be released corresponding to - * releasing the sound card instance. - */ - tscm->card->private_free = tscm_card_free; - tscm->card->private_data = tscm; tscm->registered = true; return; error: - snd_tscm_transaction_unregister(tscm); - snd_tscm_stream_destroy_duplex(tscm); snd_card_free(tscm->card); dev_info(&tscm->unit->device, "Sound card registration failed: %d\n", err); @@ -164,11 +150,9 @@ static int snd_tscm_probe(struct fw_unit *unit, struct snd_tscm *tscm; /* Allocate this independent of sound card instance. */ - tscm = kzalloc(sizeof(struct snd_tscm), GFP_KERNEL); - if (tscm == NULL) + tscm = devm_kzalloc(&unit->device, sizeof(struct snd_tscm), GFP_KERNEL); + if (!tscm) return -ENOMEM; - - /* initialize myself */ tscm->unit = fw_unit_get(unit); dev_set_drvdata(&unit->device, tscm); @@ -216,12 +200,12 @@ static void snd_tscm_remove(struct fw_unit *unit) cancel_delayed_work_sync(&tscm->dwork); if (tscm->registered) { - /* No need to wait for releasing card object in this context. */ - snd_card_free_when_closed(tscm->card); - } else { - /* Don't forget this case. */ - tscm_free(tscm); + // Block till all of ALSA character devices are released. + snd_card_free(tscm->card); } + + mutex_destroy(&tscm->mutex); + fw_unit_put(tscm->unit); } static const struct ieee1394_device_id snd_tscm_id_table[] = { diff --git a/sound/hda/ext/hdac_ext_controller.c b/sound/hda/ext/hdac_ext_controller.c index 5bc4a1d587d4..60cb00fd0c69 100644 --- a/sound/hda/ext/hdac_ext_controller.c +++ b/sound/hda/ext/hdac_ext_controller.c @@ -48,9 +48,11 @@ void snd_hdac_ext_bus_ppcap_enable(struct hdac_bus *bus, bool enable) } if (enable) - snd_hdac_updatel(bus->ppcap, AZX_REG_PP_PPCTL, 0, AZX_PPCTL_GPROCEN); + snd_hdac_updatel(bus->ppcap, AZX_REG_PP_PPCTL, + AZX_PPCTL_GPROCEN, AZX_PPCTL_GPROCEN); else - snd_hdac_updatel(bus->ppcap, AZX_REG_PP_PPCTL, AZX_PPCTL_GPROCEN, 0); + snd_hdac_updatel(bus->ppcap, AZX_REG_PP_PPCTL, + AZX_PPCTL_GPROCEN, 0); } EXPORT_SYMBOL_GPL(snd_hdac_ext_bus_ppcap_enable); @@ -68,9 +70,11 @@ void snd_hdac_ext_bus_ppcap_int_enable(struct hdac_bus *bus, bool enable) } if (enable) - snd_hdac_updatel(bus->ppcap, AZX_REG_PP_PPCTL, 0, AZX_PPCTL_PIE); + snd_hdac_updatel(bus->ppcap, AZX_REG_PP_PPCTL, + AZX_PPCTL_PIE, AZX_PPCTL_PIE); else - snd_hdac_updatel(bus->ppcap, AZX_REG_PP_PPCTL, AZX_PPCTL_PIE, 0); + snd_hdac_updatel(bus->ppcap, AZX_REG_PP_PPCTL, + AZX_PPCTL_PIE, 0); } EXPORT_SYMBOL_GPL(snd_hdac_ext_bus_ppcap_int_enable); @@ -194,7 +198,8 @@ static int check_hdac_link_power_active(struct hdac_ext_link *link, bool enable) */ int snd_hdac_ext_bus_link_power_up(struct hdac_ext_link *link) { - snd_hdac_updatel(link->ml_addr, AZX_REG_ML_LCTL, 0, AZX_MLCTL_SPA); + snd_hdac_updatel(link->ml_addr, AZX_REG_ML_LCTL, + AZX_MLCTL_SPA, AZX_MLCTL_SPA); return check_hdac_link_power_active(link, true); } @@ -222,8 +227,8 @@ int snd_hdac_ext_bus_link_power_up_all(struct hdac_bus *bus) int ret; list_for_each_entry(hlink, &bus->hlink_list, list) { - snd_hdac_updatel(hlink->ml_addr, - AZX_REG_ML_LCTL, 0, AZX_MLCTL_SPA); + snd_hdac_updatel(hlink->ml_addr, AZX_REG_ML_LCTL, + AZX_MLCTL_SPA, AZX_MLCTL_SPA); ret = check_hdac_link_power_active(hlink, true); if (ret < 0) return ret; @@ -243,7 +248,8 @@ int snd_hdac_ext_bus_link_power_down_all(struct hdac_bus *bus) int ret; list_for_each_entry(hlink, &bus->hlink_list, list) { - snd_hdac_updatel(hlink->ml_addr, AZX_REG_ML_LCTL, AZX_MLCTL_SPA, 0); + snd_hdac_updatel(hlink->ml_addr, AZX_REG_ML_LCTL, + AZX_MLCTL_SPA, 0); ret = check_hdac_link_power_active(hlink, false); if (ret < 0) return ret; diff --git a/sound/hda/hdac_i915.c b/sound/hda/hdac_i915.c index b5282cbbe489..617ff1aa818f 100644 --- a/sound/hda/hdac_i915.c +++ b/sound/hda/hdac_i915.c @@ -145,9 +145,11 @@ int snd_hdac_i915_init(struct hdac_bus *bus) if (!acomp->ops) { request_module("i915"); /* 10s timeout */ - wait_for_completion_timeout(&bind_complete, 10 * 1000); + wait_for_completion_timeout(&bind_complete, + msecs_to_jiffies(10 * 1000)); } if (!acomp->ops) { + dev_info(bus->dev, "couldn't bind with audio component\n"); snd_hdac_acomp_exit(bus); return -ENODEV; } diff --git a/sound/hda/hdac_regmap.c b/sound/hda/hdac_regmap.c index 419e285e0226..996dbc850224 100644 --- a/sound/hda/hdac_regmap.c +++ b/sound/hda/hdac_regmap.c @@ -359,7 +359,8 @@ static const struct regmap_config hda_regmap_cfg = { .cache_type = REGCACHE_RBTREE, .reg_read = hda_reg_read, .reg_write = hda_reg_write, - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, }; /** diff --git a/sound/i2c/cs8427.c b/sound/i2c/cs8427.c index 2647309bc675..8afa2f888466 100644 --- a/sound/i2c/cs8427.c +++ b/sound/i2c/cs8427.c @@ -118,7 +118,7 @@ static int snd_cs8427_send_corudata(struct snd_i2c_device *device, struct cs8427 *chip = device->private_data; char *hw_data = udata ? chip->playback.hw_udata : chip->playback.hw_status; - char data[32]; + unsigned char data[32]; int err, idx; if (!memcmp(hw_data, ndata, count)) diff --git a/sound/isa/opti9xx/opti92x-ad1848.c b/sound/isa/opti9xx/opti92x-ad1848.c index ac0ab6eb40f0..47e0b2820ace 100644 --- a/sound/isa/opti9xx/opti92x-ad1848.c +++ b/sound/isa/opti9xx/opti92x-ad1848.c @@ -389,7 +389,8 @@ static int snd_opti9xx_configure(struct snd_opti9xx *chip, case OPTi9XX_HW_82C931: /* disable 3D sound (set GPIO1 as output, low) */ snd_opti9xx_write_mask(chip, OPTi9XX_MC_REG(20), 0x04, 0x0c); - case OPTi9XX_HW_82C933: /* FALL THROUGH */ + /* fall through */ + case OPTi9XX_HW_82C933: /* * The BTC 1817DW has QS1000 wavetable which is connected * to the serial digital input of the OPTI931. @@ -400,7 +401,8 @@ static int snd_opti9xx_configure(struct snd_opti9xx *chip, * or digital input signal. */ snd_opti9xx_write_mask(chip, OPTi9XX_MC_REG(26), 0x01, 0x01); - case OPTi9XX_HW_82C930: /* FALL THROUGH */ + /* fall through */ + case OPTi9XX_HW_82C930: snd_opti9xx_write_mask(chip, OPTi9XX_MC_REG(6), 0x02, 0x03); snd_opti9xx_write_mask(chip, OPTi9XX_MC_REG(3), 0x00, 0xff); snd_opti9xx_write_mask(chip, OPTi9XX_MC_REG(4), 0x10 | diff --git a/sound/isa/sb/sb8_main.c b/sound/isa/sb/sb8_main.c index 481797744b3c..8288fae90085 100644 --- a/sound/isa/sb/sb8_main.c +++ b/sound/isa/sb/sb8_main.c @@ -130,13 +130,13 @@ static int snd_sb8_playback_prepare(struct snd_pcm_substream *substream) chip->playback_format = SB_DSP_HI_OUTPUT_AUTO; break; } - /* fallthru */ + /* fall through */ case SB_HW_201: if (rate > 23000) { chip->playback_format = SB_DSP_HI_OUTPUT_AUTO; break; } - /* fallthru */ + /* fall through */ case SB_HW_20: chip->playback_format = SB_DSP_LO_OUTPUT_AUTO; break; @@ -287,7 +287,7 @@ static int snd_sb8_capture_prepare(struct snd_pcm_substream *substream) chip->capture_format = SB_DSP_HI_INPUT_AUTO; break; } - /* fallthru */ + /* fall through */ case SB_HW_20: chip->capture_format = SB_DSP_LO_INPUT_AUTO; break; @@ -387,7 +387,7 @@ irqreturn_t snd_sb8dsp_interrupt(struct snd_sb *chip) case SB_MODE_PLAYBACK_16: /* ok.. playback is active */ if (chip->hardware != SB_HW_JAZZ16) break; - /* fallthru */ + /* fall through */ case SB_MODE_PLAYBACK_8: substream = chip->playback_substream; if (chip->playback_format == SB_DSP_OUTPUT) @@ -397,7 +397,7 @@ irqreturn_t snd_sb8dsp_interrupt(struct snd_sb *chip) case SB_MODE_CAPTURE_16: if (chip->hardware != SB_HW_JAZZ16) break; - /* fallthru */ + /* fall through */ case SB_MODE_CAPTURE_8: substream = chip->capture_substream; if (chip->capture_format == SB_DSP_INPUT) diff --git a/sound/mips/hal2.c b/sound/mips/hal2.c index c8904e732aaa..a4ed54aeaf1d 100644 --- a/sound/mips/hal2.c +++ b/sound/mips/hal2.c @@ -500,7 +500,8 @@ static const struct snd_pcm_hardware hal2_pcm_hw = { .info = (SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_MMAP_VALID | SNDRV_PCM_INFO_INTERLEAVED | - SNDRV_PCM_INFO_BLOCK_TRANSFER), + SNDRV_PCM_INFO_BLOCK_TRANSFER | + SNDRV_PCM_INFO_SYNC_APPLPTR), .formats = SNDRV_PCM_FMTBIT_S16_BE, .rates = SNDRV_PCM_RATE_8000_48000, .rate_min = 8000, @@ -563,6 +564,8 @@ static int hal2_playback_prepare(struct snd_pcm_substream *substream) dac->sample_rate = hal2_compute_rate(dac, runtime->rate); memset(&dac->pcm_indirect, 0, sizeof(dac->pcm_indirect)); dac->pcm_indirect.hw_buffer_size = H2_BUF_SIZE; + dac->pcm_indirect.hw_queue_size = H2_BUF_SIZE / 2; + dac->pcm_indirect.hw_io = dac->buffer_dma; dac->pcm_indirect.sw_buffer_size = snd_pcm_lib_buffer_bytes(substream); dac->substream = substream; hal2_setup_dac(hal2); @@ -575,9 +578,6 @@ static int hal2_playback_trigger(struct snd_pcm_substream *substream, int cmd) switch (cmd) { case SNDRV_PCM_TRIGGER_START: - hal2->dac.pcm_indirect.hw_io = hal2->dac.buffer_dma; - hal2->dac.pcm_indirect.hw_data = 0; - substream->ops->ack(substream); hal2_start_dac(hal2); break; case SNDRV_PCM_TRIGGER_STOP: @@ -615,7 +615,6 @@ static int hal2_playback_ack(struct snd_pcm_substream *substream) struct snd_hal2 *hal2 = snd_pcm_substream_chip(substream); struct hal2_codec *dac = &hal2->dac; - dac->pcm_indirect.hw_queue_size = H2_BUF_SIZE / 2; return snd_pcm_indirect_playback_transfer(substream, &dac->pcm_indirect, hal2_playback_transfer); @@ -655,6 +654,7 @@ static int hal2_capture_prepare(struct snd_pcm_substream *substream) memset(&adc->pcm_indirect, 0, sizeof(adc->pcm_indirect)); adc->pcm_indirect.hw_buffer_size = H2_BUF_SIZE; adc->pcm_indirect.hw_queue_size = H2_BUF_SIZE / 2; + adc->pcm_indirect.hw_io = adc->buffer_dma; adc->pcm_indirect.sw_buffer_size = snd_pcm_lib_buffer_bytes(substream); adc->substream = substream; hal2_setup_adc(hal2); @@ -667,9 +667,6 @@ static int hal2_capture_trigger(struct snd_pcm_substream *substream, int cmd) switch (cmd) { case SNDRV_PCM_TRIGGER_START: - hal2->adc.pcm_indirect.hw_io = hal2->adc.buffer_dma; - hal2->adc.pcm_indirect.hw_data = 0; - printk(KERN_DEBUG "buffer_dma %x\n", hal2->adc.buffer_dma); hal2_start_adc(hal2); break; case SNDRV_PCM_TRIGGER_STOP: diff --git a/sound/pci/asihpi/hpios.c b/sound/pci/asihpi/hpios.c index 5ef4fe964366..7c91330af719 100644 --- a/sound/pci/asihpi/hpios.c +++ b/sound/pci/asihpi/hpios.c @@ -49,7 +49,7 @@ u16 hpios_locked_mem_alloc(struct consistent_dma_area *p_mem_area, u32 size, /*?? any benefit in using managed dmam_alloc_coherent? */ p_mem_area->vaddr = dma_alloc_coherent(&pdev->dev, size, &p_mem_area->dma_handle, - GFP_DMA32 | GFP_KERNEL); + GFP_KERNEL); if (p_mem_area->vaddr) { HPI_DEBUG_LOG(DEBUG, "allocated %d bytes, dma 0x%x vma %p\n", diff --git a/sound/pci/atiixp.c b/sound/pci/atiixp.c index a1e4944dcfe8..1a41f8c80243 100644 --- a/sound/pci/atiixp.c +++ b/sound/pci/atiixp.c @@ -903,15 +903,15 @@ static int snd_atiixp_playback_prepare(struct snd_pcm_substream *substream) case 8: data |= ATI_REG_OUT_DMA_SLOT_BIT(10) | ATI_REG_OUT_DMA_SLOT_BIT(11); - /* fallthru */ + /* fall through */ case 6: data |= ATI_REG_OUT_DMA_SLOT_BIT(7) | ATI_REG_OUT_DMA_SLOT_BIT(8); - /* fallthru */ + /* fall through */ case 4: data |= ATI_REG_OUT_DMA_SLOT_BIT(6) | ATI_REG_OUT_DMA_SLOT_BIT(9); - /* fallthru */ + /* fall through */ default: data |= ATI_REG_OUT_DMA_SLOT_BIT(3) | ATI_REG_OUT_DMA_SLOT_BIT(4); diff --git a/sound/pci/au88x0/au88x0_core.c b/sound/pci/au88x0/au88x0_core.c index 2e5b460a847c..96ece1a71cf1 100644 --- a/sound/pci/au88x0/au88x0_core.c +++ b/sound/pci/au88x0/au88x0_core.c @@ -1115,6 +1115,7 @@ vortex_adbdma_setbuffers(vortex_t * vortex, int adbdma, hwwrite(vortex->mmio, VORTEX_ADBDMA_BUFBASE + (adbdma << 4) + 0xc, snd_pcm_sgbuf_get_addr(dma->substream, psize * 3)); + /* fall through */ /* 3 pages */ case 3: dma->cfg0 |= 0x12000000; @@ -1122,12 +1123,14 @@ vortex_adbdma_setbuffers(vortex_t * vortex, int adbdma, hwwrite(vortex->mmio, VORTEX_ADBDMA_BUFBASE + (adbdma << 4) + 0x8, snd_pcm_sgbuf_get_addr(dma->substream, psize * 2)); + /* fall through */ /* 2 pages */ case 2: dma->cfg0 |= 0x88000000 | 0x44000000 | 0x10000000 | (psize - 1); hwwrite(vortex->mmio, VORTEX_ADBDMA_BUFBASE + (adbdma << 4) + 0x4, snd_pcm_sgbuf_get_addr(dma->substream, psize)); + /* fall through */ /* 1 page */ case 1: dma->cfg0 |= 0x80000000 | 0x40000000 | ((psize - 1) << 0xc); @@ -1390,17 +1393,20 @@ vortex_wtdma_setbuffers(vortex_t * vortex, int wtdma, dma->cfg1 |= 0x88000000 | 0x44000000 | 0x30000000 | (psize-1); hwwrite(vortex->mmio, VORTEX_WTDMA_BUFBASE + (wtdma << 4) + 0xc, snd_pcm_sgbuf_get_addr(dma->substream, psize * 3)); + /* fall through */ /* 3 pages */ case 3: dma->cfg0 |= 0x12000000; dma->cfg1 |= 0x80000000 | 0x40000000 | ((psize-1) << 0xc); hwwrite(vortex->mmio, VORTEX_WTDMA_BUFBASE + (wtdma << 4) + 0x8, snd_pcm_sgbuf_get_addr(dma->substream, psize * 2)); + /* fall through */ /* 2 pages */ case 2: dma->cfg0 |= 0x88000000 | 0x44000000 | 0x10000000 | (psize-1); hwwrite(vortex->mmio, VORTEX_WTDMA_BUFBASE + (wtdma << 4) + 0x4, snd_pcm_sgbuf_get_addr(dma->substream, psize)); + /* fall through */ /* 1 page */ case 1: dma->cfg0 |= 0x80000000 | 0x40000000 | ((psize-1) << 0xc); diff --git a/sound/pci/ca0106/ca0106.h b/sound/pci/ca0106/ca0106.h index 04402c14cb23..9847b669cf3c 100644 --- a/sound/pci/ca0106/ca0106.h +++ b/sound/pci/ca0106/ca0106.h @@ -582,7 +582,7 @@ #define SPI_PL_BIT_R_R (2<<7) /* right channel = right */ #define SPI_PL_BIT_R_C (3<<7) /* right channel = (L+R)/2 */ #define SPI_IZD_REG 2 -#define SPI_IZD_BIT (1<<4) /* infinite zero detect */ +#define SPI_IZD_BIT (0<<4) /* infinite zero detect */ #define SPI_FMT_REG 3 #define SPI_FMT_BIT_RJ (0<<0) /* right justified mode */ diff --git a/sound/pci/cs46xx/cs46xx_lib.c b/sound/pci/cs46xx/cs46xx_lib.c index 146e1a3498c7..750eec437a79 100644 --- a/sound/pci/cs46xx/cs46xx_lib.c +++ b/sound/pci/cs46xx/cs46xx_lib.c @@ -1443,7 +1443,8 @@ static const struct snd_pcm_hardware snd_cs46xx_playback = .info = (SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_BLOCK_TRANSFER /*|*/ - /*SNDRV_PCM_INFO_RESUME*/), + /*SNDRV_PCM_INFO_RESUME*/ | + SNDRV_PCM_INFO_SYNC_APPLPTR), .formats = (SNDRV_PCM_FMTBIT_S8 | SNDRV_PCM_FMTBIT_U8 | SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S16_BE | SNDRV_PCM_FMTBIT_U16_LE | SNDRV_PCM_FMTBIT_U16_BE), @@ -1465,7 +1466,8 @@ static const struct snd_pcm_hardware snd_cs46xx_capture = .info = (SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_BLOCK_TRANSFER /*|*/ - /*SNDRV_PCM_INFO_RESUME*/), + /*SNDRV_PCM_INFO_RESUME*/ | + SNDRV_PCM_INFO_SYNC_APPLPTR), .formats = SNDRV_PCM_FMTBIT_S16_LE, .rates = SNDRV_PCM_RATE_CONTINUOUS | SNDRV_PCM_RATE_8000_48000, .rate_min = 5500, diff --git a/sound/pci/emu10k1/emupcm.c b/sound/pci/emu10k1/emupcm.c index 9f2b6097f486..30b3472d0b75 100644 --- a/sound/pci/emu10k1/emupcm.c +++ b/sound/pci/emu10k1/emupcm.c @@ -1753,7 +1753,8 @@ static const struct snd_pcm_hardware snd_emu10k1_fx8010_playback = { .info = (SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_RESUME | - /* SNDRV_PCM_INFO_MMAP_VALID | */ SNDRV_PCM_INFO_PAUSE), + /* SNDRV_PCM_INFO_MMAP_VALID | */ SNDRV_PCM_INFO_PAUSE | + SNDRV_PCM_INFO_SYNC_APPLPTR), .formats = SNDRV_PCM_FMTBIT_U8 | SNDRV_PCM_FMTBIT_S16_LE, .rates = SNDRV_PCM_RATE_48000, .rate_min = 48000, diff --git a/sound/pci/hda/hda_auto_parser.c b/sound/pci/hda/hda_auto_parser.c index b9a6b66aeb0e..df0d636145f8 100644 --- a/sound/pci/hda/hda_auto_parser.c +++ b/sound/pci/hda/hda_auto_parser.c @@ -13,7 +13,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" #include "hda_auto_parser.h" diff --git a/sound/pci/hda/hda_beep.h b/sound/pci/hda/hda_beep.h index d1a6a9c1329a..f1457c6b3969 100644 --- a/sound/pci/hda/hda_beep.h +++ b/sound/pci/hda/hda_beep.h @@ -9,7 +9,7 @@ #ifndef __SOUND_HDA_BEEP_H #define __SOUND_HDA_BEEP_H -#include "hda_codec.h" +#include #define HDA_BEEP_MODE_OFF 0 #define HDA_BEEP_MODE_ON 1 diff --git a/sound/pci/hda/hda_bind.c b/sound/pci/hda/hda_bind.c index d361bb77ca00..9174f1b3a987 100644 --- a/sound/pci/hda/hda_bind.c +++ b/sound/pci/hda/hda_bind.c @@ -11,7 +11,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" /* @@ -81,6 +81,12 @@ static int hda_codec_driver_probe(struct device *dev) hda_codec_patch_t patch; int err; + if (codec->bus->core.ext_ops) { + if (WARN_ON(!codec->bus->core.ext_ops->hdev_attach)) + return -EINVAL; + return codec->bus->core.ext_ops->hdev_attach(&codec->core); + } + if (WARN_ON(!codec->preset)) return -EINVAL; @@ -134,6 +140,12 @@ static int hda_codec_driver_remove(struct device *dev) { struct hda_codec *codec = dev_to_hda_codec(dev); + if (codec->bus->core.ext_ops) { + if (WARN_ON(!codec->bus->core.ext_ops->hdev_detach)) + return -EINVAL; + return codec->bus->core.ext_ops->hdev_detach(&codec->core); + } + if (codec->patch_ops.free) codec->patch_ops.free(codec); snd_hda_codec_cleanup_for_unbind(codec); diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index 26d348b47867..0957813939e5 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -27,7 +27,7 @@ #include #include #include -#include "hda_codec.h" +#include #include #include #include diff --git a/sound/pci/hda/hda_controller.c b/sound/pci/hda/hda_controller.c index a12e594d4e3b..fe2506672a72 100644 --- a/sound/pci/hda/hda_controller.c +++ b/sound/pci/hda/hda_controller.c @@ -130,8 +130,9 @@ static int azx_pcm_hw_params(struct snd_pcm_substream *substream, azx_dev->core.bufsize = 0; azx_dev->core.period_bytes = 0; azx_dev->core.format_val = 0; - ret = chip->ops->substream_alloc_pages(chip, substream, - params_buffer_bytes(hw_params)); + ret = snd_pcm_lib_malloc_pages(substream, + params_buffer_bytes(hw_params)); + unlock: dsp_unlock(azx_dev); return ret; @@ -141,7 +142,6 @@ static int azx_pcm_hw_free(struct snd_pcm_substream *substream) { struct azx_pcm *apcm = snd_pcm_substream_chip(substream); struct azx_dev *azx_dev = get_azx_dev(substream); - struct azx *chip = apcm->chip; struct hda_pcm_stream *hinfo = to_hda_pcm_stream(substream); int err; @@ -152,7 +152,7 @@ static int azx_pcm_hw_free(struct snd_pcm_substream *substream) snd_hda_codec_cleanup(apcm->codec, hinfo, substream); - err = chip->ops->substream_free_pages(chip, substream); + err = snd_pcm_lib_free_pages(substream); azx_stream(azx_dev)->prepared = 0; dsp_unlock(azx_dev); return err; @@ -732,6 +732,7 @@ int snd_hda_attach_pcm_stream(struct hda_bus *_bus, struct hda_codec *codec, int pcm_dev = cpcm->device; unsigned int size; int s, err; + int type = SNDRV_DMA_TYPE_DEV_SG; list_for_each_entry(apcm, &chip->pcm_list, list) { if (apcm->pcm->device == pcm_dev) { @@ -770,7 +771,9 @@ int snd_hda_attach_pcm_stream(struct hda_bus *_bus, struct hda_codec *codec, size = CONFIG_SND_HDA_PREALLOC_SIZE * 1024; if (size > MAX_PREALLOC_SIZE) size = MAX_PREALLOC_SIZE; - snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_DEV_SG, + if (chip->uc_buffer) + type = SNDRV_DMA_TYPE_DEV_UC_SG; + snd_pcm_lib_preallocate_pages_for_all(pcm, type, chip->card->dev, size, MAX_PREALLOC_SIZE); return 0; @@ -1220,27 +1223,6 @@ void snd_hda_bus_reset(struct hda_bus *bus) bus->in_reset = 0; } -static int get_jackpoll_interval(struct azx *chip) -{ - int i; - unsigned int j; - - if (!chip->jackpoll_ms) - return 0; - - i = chip->jackpoll_ms[chip->dev_index]; - if (i == 0) - return 0; - if (i < 50 || i > 60000) - j = 0; - else - j = msecs_to_jiffies(i); - if (j == 0) - dev_warn(chip->card->dev, - "jackpoll_ms value out of range: %d\n", i); - return j; -} - /* HD-audio bus initialization */ int azx_bus_init(struct azx *chip, const char *model, const struct hdac_io_ops *io_ops) @@ -1323,7 +1305,7 @@ int azx_probe_codecs(struct azx *chip, unsigned int max_slots) err = snd_hda_codec_new(&chip->bus, chip->card, c, &codec); if (err < 0) continue; - codec->jackpoll_interval = get_jackpoll_interval(chip); + codec->jackpoll_interval = chip->jackpoll_interval; codec->beep_mode = chip->beep_mode; codecs++; } diff --git a/sound/pci/hda/hda_controller.h b/sound/pci/hda/hda_controller.h index a68e75b00ea3..c95097bb5a0c 100644 --- a/sound/pci/hda/hda_controller.h +++ b/sound/pci/hda/hda_controller.h @@ -20,7 +20,7 @@ #include #include #include -#include "hda_codec.h" +#include #include #define AZX_MAX_CODECS HDA_MAX_CODECS @@ -76,7 +76,6 @@ struct azx_dev { * when link position is not greater than FIFO size */ unsigned int insufficient:1; - unsigned int wc_marked:1; }; #define azx_stream(dev) (&(dev)->core) @@ -88,11 +87,6 @@ struct azx; struct hda_controller_ops { /* Disable msi if supported, PCI only */ int (*disable_msi_reset_irq)(struct azx *); - int (*substream_alloc_pages)(struct azx *chip, - struct snd_pcm_substream *substream, - size_t size); - int (*substream_free_pages)(struct azx *chip, - struct snd_pcm_substream *substream); void (*pcm_mmap_prepare)(struct snd_pcm_substream *substream, struct vm_area_struct *area); /* Check if current position is acceptable */ @@ -127,7 +121,7 @@ struct azx { int capture_streams; int capture_index_offset; int num_streams; - const int *jackpoll_ms; /* per-card jack poll interval */ + int jackpoll_interval; /* jack poll interval in jiffies */ /* Register interaction. */ const struct hda_controller_ops *ops; @@ -160,6 +154,7 @@ struct azx { unsigned int msi:1; unsigned int probing:1; /* codec probing phase */ unsigned int snoop:1; + unsigned int uc_buffer:1; /* non-cached pages for stream buffers */ unsigned int align_buffer_size:1; unsigned int region_requested:1; unsigned int disabled:1; /* disabled by vga_switcheroo */ @@ -175,11 +170,10 @@ struct azx { #define azx_bus(chip) (&(chip)->bus.core) #define bus_to_azx(_bus) container_of(_bus, struct azx, bus.core) -#ifdef CONFIG_X86 -#define azx_snoop(chip) ((chip)->snoop) -#else -#define azx_snoop(chip) true -#endif +static inline bool azx_snoop(struct azx *chip) +{ + return !IS_ENABLED(CONFIG_X86) || chip->snoop; +} /* * macros for easy use diff --git a/sound/pci/hda/hda_eld.c b/sound/pci/hda/hda_eld.c index ba7fe9b6655c..806b12ed44a2 100644 --- a/sound/pci/hda/hda_eld.c +++ b/sound/pci/hda/hda_eld.c @@ -27,7 +27,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" enum eld_versions { diff --git a/sound/pci/hda/hda_generic.c b/sound/pci/hda/hda_generic.c index 579984ecdec3..276150f29cda 100644 --- a/sound/pci/hda/hda_generic.c +++ b/sound/pci/hda/hda_generic.c @@ -32,7 +32,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" #include "hda_auto_parser.h" #include "hda_jack.h" diff --git a/sound/pci/hda/hda_hwdep.c b/sound/pci/hda/hda_hwdep.c index cc009a4a3d1d..268bba6ec985 100644 --- a/sound/pci/hda/hda_hwdep.c +++ b/sound/pci/hda/hda_hwdep.c @@ -23,7 +23,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" #include #include diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index aa4c672dbaf7..d8eb2b5f51ae 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -63,7 +63,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_controller.h" #include "hda_intel.h" @@ -399,61 +399,6 @@ static char *driver_short_names[] = { [AZX_DRIVER_GENERIC] = "HD-Audio Generic", }; -#ifdef CONFIG_X86 -static void __mark_pages_wc(struct azx *chip, struct snd_dma_buffer *dmab, bool on) -{ - int pages; - - if (azx_snoop(chip)) - return; - if (!dmab || !dmab->area || !dmab->bytes) - return; - -#ifdef CONFIG_SND_DMA_SGBUF - if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_SG) { - struct snd_sg_buf *sgbuf = dmab->private_data; - if (chip->driver_type == AZX_DRIVER_CMEDIA) - return; /* deal with only CORB/RIRB buffers */ - if (on) - set_pages_array_wc(sgbuf->page_table, sgbuf->pages); - else - set_pages_array_wb(sgbuf->page_table, sgbuf->pages); - return; - } -#endif - - pages = (dmab->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (on) - set_memory_wc((unsigned long)dmab->area, pages); - else - set_memory_wb((unsigned long)dmab->area, pages); -} - -static inline void mark_pages_wc(struct azx *chip, struct snd_dma_buffer *buf, - bool on) -{ - __mark_pages_wc(chip, buf, on); -} -static inline void mark_runtime_wc(struct azx *chip, struct azx_dev *azx_dev, - struct snd_pcm_substream *substream, bool on) -{ - if (azx_dev->wc_marked != on) { - __mark_pages_wc(chip, snd_pcm_get_dma_buf(substream), on); - azx_dev->wc_marked = on; - } -} -#else -/* NOP for other archs */ -static inline void mark_pages_wc(struct azx *chip, struct snd_dma_buffer *buf, - bool on) -{ -} -static inline void mark_runtime_wc(struct azx *chip, struct azx_dev *azx_dev, - struct snd_pcm_substream *substream, bool on) -{ -} -#endif - static int azx_acquire_irq(struct azx *chip, int do_disconnect); static void set_default_power_save(struct azx *chip); @@ -1678,6 +1623,7 @@ static void azx_check_snoop_available(struct azx *chip) dev_info(chip->card->dev, "Force to %s mode by module option\n", snoop ? "snoop" : "non-snoop"); chip->snoop = snoop; + chip->uc_buffer = !snoop; return; } @@ -1698,8 +1644,12 @@ static void azx_check_snoop_available(struct azx *chip) snoop = false; chip->snoop = snoop; - if (!snoop) + if (!snoop) { dev_info(chip->card->dev, "Force to non-snoop mode\n"); + /* C-Media requires non-cached pages only for CORB/RIRB */ + if (chip->driver_type != AZX_DRIVER_CMEDIA) + chip->uc_buffer = true; + } } static void azx_probe_work(struct work_struct *work) @@ -1767,7 +1717,8 @@ static int azx_create(struct snd_card *card, struct pci_dev *pci, chip->driver_type = driver_caps & 0xff; check_msi(chip); chip->dev_index = dev; - chip->jackpoll_ms = jackpoll_ms; + if (jackpoll_ms[dev] >= 50 && jackpoll_ms[dev] <= 60000) + chip->jackpoll_interval = msecs_to_jiffies(jackpoll_ms[dev]); INIT_LIST_HEAD(&chip->pcm_list); INIT_WORK(&hda->irq_pending_work, azx_irq_pending_work); INIT_LIST_HEAD(&hda->list); @@ -2090,55 +2041,24 @@ static int dma_alloc_pages(struct hdac_bus *bus, struct snd_dma_buffer *buf) { struct azx *chip = bus_to_azx(bus); - int err; - err = snd_dma_alloc_pages(type, - bus->dev, - size, buf); - if (err < 0) - return err; - mark_pages_wc(chip, buf, true); - return 0; + if (!azx_snoop(chip) && type == SNDRV_DMA_TYPE_DEV) + type = SNDRV_DMA_TYPE_DEV_UC; + return snd_dma_alloc_pages(type, bus->dev, size, buf); } static void dma_free_pages(struct hdac_bus *bus, struct snd_dma_buffer *buf) { - struct azx *chip = bus_to_azx(bus); - - mark_pages_wc(chip, buf, false); snd_dma_free_pages(buf); } -static int substream_alloc_pages(struct azx *chip, - struct snd_pcm_substream *substream, - size_t size) -{ - struct azx_dev *azx_dev = get_azx_dev(substream); - int ret; - - mark_runtime_wc(chip, azx_dev, substream, false); - ret = snd_pcm_lib_malloc_pages(substream, size); - if (ret < 0) - return ret; - mark_runtime_wc(chip, azx_dev, substream, true); - return 0; -} - -static int substream_free_pages(struct azx *chip, - struct snd_pcm_substream *substream) -{ - struct azx_dev *azx_dev = get_azx_dev(substream); - mark_runtime_wc(chip, azx_dev, substream, false); - return snd_pcm_lib_free_pages(substream); -} - static void pcm_mmap_prepare(struct snd_pcm_substream *substream, struct vm_area_struct *area) { #ifdef CONFIG_X86 struct azx_pcm *apcm = snd_pcm_substream_chip(substream); struct azx *chip = apcm->chip; - if (!azx_snoop(chip) && chip->driver_type != AZX_DRIVER_CMEDIA) + if (chip->uc_buffer) area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); #endif } @@ -2156,8 +2076,6 @@ static const struct hdac_io_ops pci_hda_io_ops = { static const struct hda_controller_ops pci_hda_ops = { .disable_msi_reset_irq = disable_msi_reset_irq, - .substream_alloc_pages = substream_alloc_pages, - .substream_free_pages = substream_free_pages, .pcm_mmap_prepare = pcm_mmap_prepare, .position_check = azx_position_check, .link_power = azx_intel_link_power, @@ -2257,8 +2175,12 @@ static struct snd_pci_quirk power_save_blacklist[] = { /* https://bugzilla.redhat.com/show_bug.cgi?id=1581607 */ SND_PCI_QUIRK(0x1558, 0x3501, "Clevo W35xSS_370SS", 0), /* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */ + SND_PCI_QUIRK(0x1028, 0x0497, "Dell Precision T3600", 0), + /* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */ /* Note the P55A-UD3 and Z87-D3HP share the subsys id for the HDA dev */ SND_PCI_QUIRK(0x1458, 0xa002, "Gigabyte P55A-UD3 / Z87-D3HP", 0), + /* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */ + SND_PCI_QUIRK(0x8086, 0x2040, "Intel DZ77BH-55K", 0), /* https://bugzilla.kernel.org/show_bug.cgi?id=199607 */ SND_PCI_QUIRK(0x8086, 0x2057, "Intel NUC5i7RYB", 0), /* https://bugzilla.redhat.com/show_bug.cgi?id=1520902 */ diff --git a/sound/pci/hda/hda_jack.c b/sound/pci/hda/hda_jack.c index a33234e04d4f..c499727920e6 100644 --- a/sound/pci/hda/hda_jack.c +++ b/sound/pci/hda/hda_jack.c @@ -15,7 +15,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" #include "hda_auto_parser.h" #include "hda_jack.h" diff --git a/sound/pci/hda/hda_proc.c b/sound/pci/hda/hda_proc.c index c6b778b2580c..a65740419650 100644 --- a/sound/pci/hda/hda_proc.c +++ b/sound/pci/hda/hda_proc.c @@ -25,7 +25,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" static int dump_coef = -1; diff --git a/sound/pci/hda/hda_sysfs.c b/sound/pci/hda/hda_sysfs.c index 6ec79c58d48d..c154b19a0c45 100644 --- a/sound/pci/hda/hda_sysfs.c +++ b/sound/pci/hda/hda_sysfs.c @@ -14,7 +14,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" #include #include diff --git a/sound/pci/hda/hda_tegra.c b/sound/pci/hda/hda_tegra.c index 0621920f7617..dd7d4242d6d2 100644 --- a/sound/pci/hda/hda_tegra.c +++ b/sound/pci/hda/hda_tegra.c @@ -35,7 +35,7 @@ #include #include -#include "hda_codec.h" +#include #include "hda_controller.h" /* Defines for Nvidia Tegra HDA support */ @@ -99,19 +99,6 @@ static void dma_free_pages(struct hdac_bus *bus, struct snd_dma_buffer *buf) snd_dma_free_pages(buf); } -static int substream_alloc_pages(struct azx *chip, - struct snd_pcm_substream *substream, - size_t size) -{ - return snd_pcm_lib_malloc_pages(substream, size); -} - -static int substream_free_pages(struct azx *chip, - struct snd_pcm_substream *substream) -{ - return snd_pcm_lib_free_pages(substream); -} - /* * Register access ops. Tegra HDA register access is DWORD only. */ @@ -180,10 +167,7 @@ static const struct hdac_io_ops hda_tegra_io_ops = { .dma_free_pages = dma_free_pages, }; -static const struct hda_controller_ops hda_tegra_ops = { - .substream_alloc_pages = substream_alloc_pages, - .substream_free_pages = substream_free_pages, -}; +static const struct hda_controller_ops hda_tegra_ops; /* nothing special */ static void hda_tegra_init(struct hda_tegra *hda) { diff --git a/sound/pci/hda/patch_analog.c b/sound/pci/hda/patch_analog.c index fd476fb40e1b..ebfd0be885b3 100644 --- a/sound/pci/hda/patch_analog.c +++ b/sound/pci/hda/patch_analog.c @@ -24,7 +24,7 @@ #include #include -#include "hda_codec.h" +#include #include "hda_local.h" #include "hda_auto_parser.h" #include "hda_beep.h" diff --git a/sound/pci/hda/patch_ca0110.c b/sound/pci/hda/patch_ca0110.c index c2d9ee9cfdc0..21d0f0610913 100644 --- a/sound/pci/hda/patch_ca0110.c +++ b/sound/pci/hda/patch_ca0110.c @@ -22,7 +22,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" #include "hda_auto_parser.h" #include "hda_jack.h" diff --git a/sound/pci/hda/patch_ca0132.c b/sound/pci/hda/patch_ca0132.c index 0166a3d7cd55..0a24037184c3 100644 --- a/sound/pci/hda/patch_ca0132.c +++ b/sound/pci/hda/patch_ca0132.c @@ -31,8 +31,9 @@ #include #include #include +#include #include -#include "hda_codec.h" +#include #include "hda_local.h" #include "hda_auto_parser.h" #include "hda_jack.h" @@ -81,12 +82,12 @@ #define SCP_GET 1 #define EFX_FILE "ctefx.bin" -#define SBZ_EFX_FILE "ctefx-sbz.bin" +#define DESKTOP_EFX_FILE "ctefx-desktop.bin" #define R3DI_EFX_FILE "ctefx-r3di.bin" #ifdef CONFIG_SND_HDA_CODEC_CA0132_DSP MODULE_FIRMWARE(EFX_FILE); -MODULE_FIRMWARE(SBZ_EFX_FILE); +MODULE_FIRMWARE(DESKTOP_EFX_FILE); MODULE_FIRMWARE(R3DI_EFX_FILE); #endif @@ -152,7 +153,10 @@ enum { XBASS_XOVER, EQ_PRESET_ENUM, SMART_VOLUME_ENUM, - MIC_BOOST_ENUM + MIC_BOOST_ENUM, + AE5_HEADPHONE_GAIN_ENUM, + AE5_SOUND_FILTER_ENUM, + ZXR_HEADPHONE_GAIN #define EFFECTS_COUNT (EFFECT_END_NID - EFFECT_START_NID) }; @@ -666,6 +670,65 @@ static const struct ct_dsp_volume_ctl ca0132_alt_vol_ctls[] = { } }; +/* Values for ca0113_mmio_command_set for selecting output. */ +#define AE5_CA0113_OUT_SET_COMMANDS 6 +struct ae5_ca0113_output_set { + unsigned int group[AE5_CA0113_OUT_SET_COMMANDS]; + unsigned int target[AE5_CA0113_OUT_SET_COMMANDS]; + unsigned int vals[AE5_CA0113_OUT_SET_COMMANDS]; +}; + +static const struct ae5_ca0113_output_set ae5_ca0113_output_presets[] = { + { .group = { 0x30, 0x30, 0x48, 0x48, 0x48, 0x30 }, + .target = { 0x2e, 0x30, 0x0d, 0x17, 0x19, 0x32 }, + .vals = { 0x00, 0x00, 0x40, 0x00, 0x00, 0x3f } + }, + { .group = { 0x30, 0x30, 0x48, 0x48, 0x48, 0x30 }, + .target = { 0x2e, 0x30, 0x0d, 0x17, 0x19, 0x32 }, + .vals = { 0x3f, 0x3f, 0x00, 0x00, 0x00, 0x00 } + }, + { .group = { 0x30, 0x30, 0x48, 0x48, 0x48, 0x30 }, + .target = { 0x2e, 0x30, 0x0d, 0x17, 0x19, 0x32 }, + .vals = { 0x00, 0x00, 0x40, 0x00, 0x00, 0x3f } + } +}; + +/* ae5 ca0113 command sequences to set headphone gain levels. */ +#define AE5_HEADPHONE_GAIN_PRESET_MAX_COMMANDS 4 +struct ae5_headphone_gain_set { + char *name; + unsigned int vals[AE5_HEADPHONE_GAIN_PRESET_MAX_COMMANDS]; +}; + +static const struct ae5_headphone_gain_set ae5_headphone_gain_presets[] = { + { .name = "Low (16-31", + .vals = { 0xff, 0x2c, 0xf5, 0x32 } + }, + { .name = "Medium (32-149", + .vals = { 0x38, 0xa8, 0x3e, 0x4c } + }, + { .name = "High (150-600", + .vals = { 0xff, 0xff, 0xff, 0x7f } + } +}; + +struct ae5_filter_set { + char *name; + unsigned int val; +}; + +static const struct ae5_filter_set ae5_filter_presets[] = { + { .name = "Slow Roll Off", + .val = 0xa0 + }, + { .name = "Minimum Phase", + .val = 0xc0 + }, + { .name = "Fast Roll Off", + .val = 0x80 + } +}; + enum hda_cmd_vendor_io { /* for DspIO node */ VENDOR_DSPIO_SCP_WRITE_DATA_LOW = 0x000, @@ -685,6 +748,9 @@ enum hda_cmd_vendor_io { VENDOR_CHIPIO_DATA_LOW = 0x300, VENDOR_CHIPIO_DATA_HIGH = 0x400, + VENDOR_CHIPIO_8051_WRITE_DIRECT = 0x500, + VENDOR_CHIPIO_8051_READ_DIRECT = 0xD00, + VENDOR_CHIPIO_GET_PARAMETER = 0xF00, VENDOR_CHIPIO_STATUS = 0xF01, VENDOR_CHIPIO_HIC_POST_READ = 0x702, @@ -692,6 +758,9 @@ enum hda_cmd_vendor_io { VENDOR_CHIPIO_8051_DATA_WRITE = 0x707, VENDOR_CHIPIO_8051_DATA_READ = 0xF07, + VENDOR_CHIPIO_8051_PMEM_READ = 0xF08, + VENDOR_CHIPIO_8051_IRAM_WRITE = 0x709, + VENDOR_CHIPIO_8051_IRAM_READ = 0xF09, VENDOR_CHIPIO_CT_EXTENSIONS_ENABLE = 0x70A, VENDOR_CHIPIO_CT_EXTENSIONS_GET = 0xF0A, @@ -798,6 +867,12 @@ enum control_param_id { * impedance is selected*/ CONTROL_PARAM_PORTD_160OHM_GAIN = 10, + /* + * This control param name was found in the 8051 memory, and makes + * sense given the fact the AE-5 uses it and has the ASI flag set. + */ + CONTROL_PARAM_ASI = 23, + /* Stream Control */ /* Select stream with the given ID */ @@ -955,7 +1030,11 @@ struct ca0132_spec { long eq_preset_val; unsigned int tlv[4]; struct hda_vmaster_mute_hook vmaster_mute; - + /* AE-5 Control values */ + unsigned char ae5_headphone_gain_val; + unsigned char ae5_filter_val; + /* ZxR Control Values */ + unsigned char zxr_gain_set; struct hda_codec *codec; struct delayed_work unsol_hp_work; @@ -995,8 +1074,11 @@ enum { QUIRK_ALIENWARE, QUIRK_ALIENWARE_M17XR4, QUIRK_SBZ, + QUIRK_ZXR, + QUIRK_ZXR_DBPRO, QUIRK_R3DI, QUIRK_R3D, + QUIRK_AE5, }; static const struct hda_pintbl alienware_pincfgs[] = { @@ -1028,6 +1110,21 @@ static const struct hda_pintbl sbz_pincfgs[] = { {} }; +/* Sound Blaster ZxR pin configs taken from Windows Driver */ +static const struct hda_pintbl zxr_pincfgs[] = { + { 0x0b, 0x01047110 }, /* Port G -- Lineout FRONT L/R */ + { 0x0c, 0x414510f0 }, /* SPDIF Out 1 - Disabled*/ + { 0x0d, 0x014510f0 }, /* Digital Out */ + { 0x0e, 0x41c520f0 }, /* SPDIF In - Disabled*/ + { 0x0f, 0x0122711f }, /* Port A -- BackPanel HP */ + { 0x10, 0x01017111 }, /* Port D -- Center/LFE */ + { 0x11, 0x01017114 }, /* Port B -- LineMicIn2 / Rear L/R */ + { 0x12, 0x01a271f0 }, /* Port C -- LineIn1 */ + { 0x13, 0x908700f0 }, /* What U Hear In*/ + { 0x18, 0x50d000f0 }, /* N/A */ + {} +}; + /* Recon3D pin configs taken from Windows Driver */ static const struct hda_pintbl r3d_pincfgs[] = { { 0x0b, 0x01014110 }, /* Port G -- Lineout FRONT L/R */ @@ -1043,6 +1140,21 @@ static const struct hda_pintbl r3d_pincfgs[] = { {} }; +/* Sound Blaster AE-5 pin configs taken from Windows Driver */ +static const struct hda_pintbl ae5_pincfgs[] = { + { 0x0b, 0x01017010 }, /* Port G -- Lineout FRONT L/R */ + { 0x0c, 0x014510f0 }, /* SPDIF Out 1 */ + { 0x0d, 0x014510f0 }, /* Digital Out */ + { 0x0e, 0x01c510f0 }, /* SPDIF In */ + { 0x0f, 0x01017114 }, /* Port A -- Rear L/R. */ + { 0x10, 0x01017012 }, /* Port D -- Center/LFE or FP Hp */ + { 0x11, 0x01a170ff }, /* Port B -- LineMicIn2 / Rear Headphone */ + { 0x12, 0x01a170f0 }, /* Port C -- LineIn1 */ + { 0x13, 0x908700f0 }, /* What U Hear In*/ + { 0x18, 0x50d000f0 }, /* N/A */ + {} +}; + /* Recon3D integrated pin configs taken from Windows Driver */ static const struct hda_pintbl r3di_pincfgs[] = { { 0x0b, 0x01014110 }, /* Port G -- Lineout FRONT L/R */ @@ -1069,6 +1181,7 @@ static const struct snd_pci_quirk ca0132_quirks[] = { SND_PCI_QUIRK(0x1458, 0xA026, "Gigabyte G1.Sniper Z97", QUIRK_R3DI), SND_PCI_QUIRK(0x1458, 0xA036, "Gigabyte GA-Z170X-Gaming 7", QUIRK_R3DI), SND_PCI_QUIRK(0x1102, 0x0013, "Recon3D", QUIRK_R3D), + SND_PCI_QUIRK(0x1102, 0x0051, "Sound Blaster AE-5", QUIRK_AE5), {} }; @@ -1453,6 +1566,20 @@ static void chipio_set_conn_rate(struct hda_codec *codec, rate); } +/* + * Writes to the 8051's internal address space directly instead of indirectly, + * giving access to the special function registers located at addresses + * 0x80-0xFF. + */ +static void chipio_8051_write_direct(struct hda_codec *codec, + unsigned int addr, unsigned int data) +{ + unsigned int verb; + + verb = VENDOR_CHIPIO_8051_WRITE_DIRECT | data; + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, verb, addr); +} + /* * Enable clocks. */ @@ -3088,7 +3215,9 @@ static bool dspload_wait_loaded(struct hda_codec *codec) } /* - * Setup GPIO for the other variants of Core3D. + * ca0113 related functions. The ca0113 acts as the HDA bus for the pci-e + * based cards, and has a second mmio region, region2, that's used for special + * commands. */ /* @@ -3096,8 +3225,11 @@ static bool dspload_wait_loaded(struct hda_codec *codec) * the mmio address 0x320 is used to set GPIO pins. The format for the data * The first eight bits are just the number of the pin. So far, I've only seen * this number go to 7. + * AE-5 note: The AE-5 seems to use pins 2 and 3 to somehow set the color value + * of the on-card LED. It seems to use pin 2 for data, then toggles 3 to on and + * then off to send that bit. */ -static void ca0132_mmio_gpio_set(struct hda_codec *codec, unsigned int gpio_pin, +static void ca0113_mmio_gpio_set(struct hda_codec *codec, unsigned int gpio_pin, bool enable) { struct ca0132_spec *spec = codec->spec; @@ -3109,6 +3241,89 @@ static void ca0132_mmio_gpio_set(struct hda_codec *codec, unsigned int gpio_pin, writew(gpio_data, spec->mem_base + 0x320); } +/* + * Special pci region2 commands that are only used by the AE-5. They follow + * a set format, and require reads at certain points to seemingly 'clear' + * the response data. My first tests didn't do these reads, and would cause + * the card to get locked up until the memory was read. These commands + * seem to work with three distinct values that I've taken to calling group, + * target-id, and value. + */ +static void ca0113_mmio_command_set(struct hda_codec *codec, unsigned int group, + unsigned int target, unsigned int value) +{ + struct ca0132_spec *spec = codec->spec; + unsigned int write_val; + + writel(0x0000007e, spec->mem_base + 0x210); + readl(spec->mem_base + 0x210); + writel(0x0000005a, spec->mem_base + 0x210); + readl(spec->mem_base + 0x210); + readl(spec->mem_base + 0x210); + + writel(0x00800005, spec->mem_base + 0x20c); + writel(group, spec->mem_base + 0x804); + + writel(0x00800005, spec->mem_base + 0x20c); + write_val = (target & 0xff); + write_val |= (value << 8); + + + writel(write_val, spec->mem_base + 0x204); + /* + * Need delay here or else it goes too fast and works inconsistently. + */ + msleep(20); + + readl(spec->mem_base + 0x860); + readl(spec->mem_base + 0x854); + readl(spec->mem_base + 0x840); + + writel(0x00800004, spec->mem_base + 0x20c); + writel(0x00000000, spec->mem_base + 0x210); + readl(spec->mem_base + 0x210); + readl(spec->mem_base + 0x210); +} + +/* + * This second type of command is used for setting the sound filter type. + */ +static void ca0113_mmio_command_set_type2(struct hda_codec *codec, + unsigned int group, unsigned int target, unsigned int value) +{ + struct ca0132_spec *spec = codec->spec; + unsigned int write_val; + + writel(0x0000007e, spec->mem_base + 0x210); + readl(spec->mem_base + 0x210); + writel(0x0000005a, spec->mem_base + 0x210); + readl(spec->mem_base + 0x210); + readl(spec->mem_base + 0x210); + + writel(0x00800003, spec->mem_base + 0x20c); + writel(group, spec->mem_base + 0x804); + + writel(0x00800005, spec->mem_base + 0x20c); + write_val = (target & 0xff); + write_val |= (value << 8); + + + writel(write_val, spec->mem_base + 0x204); + msleep(20); + readl(spec->mem_base + 0x860); + readl(spec->mem_base + 0x854); + readl(spec->mem_base + 0x840); + + writel(0x00800004, spec->mem_base + 0x20c); + writel(0x00000000, spec->mem_base + 0x210); + readl(spec->mem_base + 0x210); + readl(spec->mem_base + 0x210); +} + +/* + * Setup GPIO for the other variants of Core3D. + */ + /* * Sets up the GPIO pins so that they are discoverable. If this isn't done, * the card shows as having no GPIO pins. @@ -3119,6 +3334,7 @@ static void ca0132_gpio_init(struct hda_codec *codec) switch (spec->quirk) { case QUIRK_SBZ: + case QUIRK_AE5: snd_hda_codec_write(codec, 0x01, 0, 0x793, 0x00); snd_hda_codec_write(codec, 0x01, 0, 0x794, 0x53); snd_hda_codec_write(codec, 0x01, 0, 0x790, 0x23); @@ -3928,6 +4144,138 @@ exit: return err < 0 ? err : 0; } +static int ae5_headphone_gain_set(struct hda_codec *codec, long val); +static int zxr_headphone_gain_set(struct hda_codec *codec, long val); +static int ca0132_effects_set(struct hda_codec *codec, hda_nid_t nid, long val); + +static void ae5_mmio_select_out(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + unsigned int i; + + for (i = 0; i < AE5_CA0113_OUT_SET_COMMANDS; i++) + ca0113_mmio_command_set(codec, + ae5_ca0113_output_presets[spec->cur_out_type].group[i], + ae5_ca0113_output_presets[spec->cur_out_type].target[i], + ae5_ca0113_output_presets[spec->cur_out_type].vals[i]); +} + +/* + * These are the commands needed to setup output on each of the different card + * types. + */ +static void ca0132_alt_select_out_quirk_handler(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + unsigned int tmp; + + switch (spec->cur_out_type) { + case SPEAKER_OUT: + switch (spec->quirk) { + case QUIRK_SBZ: + ca0113_mmio_gpio_set(codec, 7, false); + ca0113_mmio_gpio_set(codec, 4, true); + ca0113_mmio_gpio_set(codec, 1, true); + chipio_set_control_param(codec, 0x0d, 0x18); + break; + case QUIRK_ZXR: + ca0113_mmio_gpio_set(codec, 2, true); + ca0113_mmio_gpio_set(codec, 3, true); + ca0113_mmio_gpio_set(codec, 5, false); + zxr_headphone_gain_set(codec, 0); + chipio_set_control_param(codec, 0x0d, 0x24); + break; + case QUIRK_R3DI: + chipio_set_control_param(codec, 0x0d, 0x24); + r3di_gpio_out_set(codec, R3DI_LINE_OUT); + break; + case QUIRK_R3D: + chipio_set_control_param(codec, 0x0d, 0x24); + ca0113_mmio_gpio_set(codec, 1, true); + break; + case QUIRK_AE5: + ae5_mmio_select_out(codec); + ae5_headphone_gain_set(codec, 2); + tmp = FLOAT_ZERO; + dspio_set_uint_param(codec, 0x96, 0x29, tmp); + dspio_set_uint_param(codec, 0x96, 0x2a, tmp); + chipio_set_control_param(codec, 0x0d, 0xa4); + chipio_write(codec, 0x18b03c, 0x00000012); + break; + } + break; + case HEADPHONE_OUT: + switch (spec->quirk) { + case QUIRK_SBZ: + ca0113_mmio_gpio_set(codec, 7, true); + ca0113_mmio_gpio_set(codec, 4, true); + ca0113_mmio_gpio_set(codec, 1, false); + chipio_set_control_param(codec, 0x0d, 0x12); + break; + case QUIRK_ZXR: + ca0113_mmio_gpio_set(codec, 2, false); + ca0113_mmio_gpio_set(codec, 3, false); + ca0113_mmio_gpio_set(codec, 5, true); + zxr_headphone_gain_set(codec, spec->zxr_gain_set); + chipio_set_control_param(codec, 0x0d, 0x21); + break; + case QUIRK_R3DI: + chipio_set_control_param(codec, 0x0d, 0x21); + r3di_gpio_out_set(codec, R3DI_HEADPHONE_OUT); + break; + case QUIRK_R3D: + chipio_set_control_param(codec, 0x0d, 0x21); + ca0113_mmio_gpio_set(codec, 0x1, false); + break; + case QUIRK_AE5: + ae5_mmio_select_out(codec); + ae5_headphone_gain_set(codec, + spec->ae5_headphone_gain_val); + tmp = FLOAT_ONE; + dspio_set_uint_param(codec, 0x96, 0x29, tmp); + dspio_set_uint_param(codec, 0x96, 0x2a, tmp); + chipio_set_control_param(codec, 0x0d, 0xa1); + chipio_write(codec, 0x18b03c, 0x00000012); + break; + } + break; + case SURROUND_OUT: + switch (spec->quirk) { + case QUIRK_SBZ: + ca0113_mmio_gpio_set(codec, 7, false); + ca0113_mmio_gpio_set(codec, 4, true); + ca0113_mmio_gpio_set(codec, 1, true); + chipio_set_control_param(codec, 0x0d, 0x18); + break; + case QUIRK_ZXR: + ca0113_mmio_gpio_set(codec, 2, true); + ca0113_mmio_gpio_set(codec, 3, true); + ca0113_mmio_gpio_set(codec, 5, false); + zxr_headphone_gain_set(codec, 0); + chipio_set_control_param(codec, 0x0d, 0x24); + break; + case QUIRK_R3DI: + chipio_set_control_param(codec, 0x0d, 0x24); + r3di_gpio_out_set(codec, R3DI_LINE_OUT); + break; + case QUIRK_R3D: + ca0113_mmio_gpio_set(codec, 1, true); + chipio_set_control_param(codec, 0x0d, 0x24); + break; + case QUIRK_AE5: + ae5_mmio_select_out(codec); + ae5_headphone_gain_set(codec, 2); + tmp = FLOAT_ZERO; + dspio_set_uint_param(codec, 0x96, 0x29, tmp); + dspio_set_uint_param(codec, 0x96, 0x2a, tmp); + chipio_set_control_param(codec, 0x0d, 0xa4); + chipio_write(codec, 0x18b03c, 0x00000012); + break; + } + break; + } +} + /* * This function behaves similarly to the ca0132_select_out funciton above, * except with a few differences. It adds the ability to select the current @@ -3978,26 +4326,11 @@ static int ca0132_alt_select_out(struct hda_codec *codec) if (err < 0) goto exit; + ca0132_alt_select_out_quirk_handler(codec); + switch (spec->cur_out_type) { case SPEAKER_OUT: codec_dbg(codec, "%s speaker\n", __func__); - /*speaker out config*/ - switch (spec->quirk) { - case QUIRK_SBZ: - ca0132_mmio_gpio_set(codec, 7, false); - ca0132_mmio_gpio_set(codec, 4, true); - ca0132_mmio_gpio_set(codec, 1, true); - chipio_set_control_param(codec, 0x0D, 0x18); - break; - case QUIRK_R3DI: - chipio_set_control_param(codec, 0x0D, 0x24); - r3di_gpio_out_set(codec, R3DI_LINE_OUT); - break; - case QUIRK_R3D: - chipio_set_control_param(codec, 0x0D, 0x24); - ca0132_mmio_gpio_set(codec, 1, true); - break; - } /* disable headphone node */ pin_ctl = snd_hda_codec_read(codec, spec->out_pins[1], 0, @@ -4021,23 +4354,6 @@ static int ca0132_alt_select_out(struct hda_codec *codec) break; case HEADPHONE_OUT: codec_dbg(codec, "%s hp\n", __func__); - /* Headphone out config*/ - switch (spec->quirk) { - case QUIRK_SBZ: - ca0132_mmio_gpio_set(codec, 7, true); - ca0132_mmio_gpio_set(codec, 4, true); - ca0132_mmio_gpio_set(codec, 1, false); - chipio_set_control_param(codec, 0x0D, 0x12); - break; - case QUIRK_R3DI: - chipio_set_control_param(codec, 0x0D, 0x21); - r3di_gpio_out_set(codec, R3DI_HEADPHONE_OUT); - break; - case QUIRK_R3D: - chipio_set_control_param(codec, 0x0D, 0x21); - ca0132_mmio_gpio_set(codec, 0x1, false); - break; - } snd_hda_codec_write(codec, spec->out_pins[0], 0, AC_VERB_SET_EAPD_BTLENABLE, 0x00); @@ -4067,23 +4383,7 @@ static int ca0132_alt_select_out(struct hda_codec *codec) break; case SURROUND_OUT: codec_dbg(codec, "%s surround\n", __func__); - /* Surround out config*/ - switch (spec->quirk) { - case QUIRK_SBZ: - ca0132_mmio_gpio_set(codec, 7, false); - ca0132_mmio_gpio_set(codec, 4, true); - ca0132_mmio_gpio_set(codec, 1, true); - chipio_set_control_param(codec, 0x0D, 0x18); - break; - case QUIRK_R3DI: - chipio_set_control_param(codec, 0x0D, 0x24); - r3di_gpio_out_set(codec, R3DI_LINE_OUT); - break; - case QUIRK_R3D: - ca0132_mmio_gpio_set(codec, 1, true); - chipio_set_control_param(codec, 0x0D, 0x24); - break; - } + /* enable line out node */ pin_ctl = snd_hda_codec_read(codec, spec->out_pins[0], 0, AC_VERB_GET_PIN_WIDGET_CONTROL, 0); @@ -4108,14 +4408,21 @@ static int ca0132_alt_select_out(struct hda_codec *codec) snd_hda_set_pin_ctl(codec, spec->out_pins[3], pin_ctl | PIN_OUT); - if (spec->effects_switch[PLAY_ENHANCEMENT - EFFECT_START_NID]) - dspio_set_uint_param(codec, 0x80, 0x04, FLOAT_ONE); - else - dspio_set_uint_param(codec, 0x80, 0x04, FLOAT_EIGHT); + dspio_set_uint_param(codec, 0x80, 0x04, FLOAT_EIGHT); break; } + /* + * Surround always sets it's scp command to req 0x04 to FLOAT_EIGHT. + * With this set though, X_BASS cannot be enabled. So, if we have OutFX + * enabled, we need to make sure X_BASS is off, otherwise everything + * sounds all muffled. Running ca0132_effects_set with X_BASS as the + * effect should sort this out. + */ + if (spec->effects_switch[PLAY_ENHANCEMENT - EFFECT_START_NID]) + ca0132_effects_set(codec, X_BASS, + spec->effects_switch[X_BASS - EFFECT_START_NID]); - /* run through the output dsp commands for line-out */ + /* run through the output dsp commands for the selected output. */ for (i = 0; i < alt_out_presets[spec->cur_out_type].commands; i++) { err = dspio_set_uint_param(codec, alt_out_presets[spec->cur_out_type].mids[i], @@ -4152,7 +4459,6 @@ static void ca0132_unsol_hp_delayed(struct work_struct *work) static void ca0132_set_dmic(struct hda_codec *codec, int enable); static int ca0132_mic_boost_set(struct hda_codec *codec, long val); -static int ca0132_effects_set(struct hda_codec *codec, hda_nid_t nid, long val); static void resume_mic1(struct hda_codec *codec, unsigned int oldval); static int stop_mic1(struct hda_codec *codec); static int ca0132_cvoice_switch_set(struct hda_codec *codec); @@ -4341,13 +4647,20 @@ static int ca0132_alt_select_in(struct hda_codec *codec) switch (spec->quirk) { case QUIRK_SBZ: case QUIRK_R3D: - ca0132_mmio_gpio_set(codec, 0, false); + ca0113_mmio_gpio_set(codec, 0, false); + tmp = FLOAT_THREE; + break; + case QUIRK_ZXR: tmp = FLOAT_THREE; break; case QUIRK_R3DI: r3di_gpio_mic_set(codec, R3DI_REAR_MIC); tmp = FLOAT_ONE; break; + case QUIRK_AE5: + ca0113_mmio_command_set(codec, 0x48, 0x28, 0x00); + tmp = FLOAT_THREE; + break; default: tmp = FLOAT_ONE; break; @@ -4362,10 +4675,19 @@ static int ca0132_alt_select_in(struct hda_codec *codec) chipio_set_stream_control(codec, 0x03, 1); chipio_set_stream_control(codec, 0x04, 1); - - if (spec->quirk == QUIRK_SBZ) { + switch (spec->quirk) { + case QUIRK_SBZ: chipio_write(codec, 0x18B098, 0x0000000C); chipio_write(codec, 0x18B09C, 0x0000000C); + break; + case QUIRK_ZXR: + chipio_write(codec, 0x18B098, 0x0000000C); + chipio_write(codec, 0x18B09C, 0x000000CC); + break; + case QUIRK_AE5: + chipio_write(codec, 0x18B098, 0x0000000C); + chipio_write(codec, 0x18B09C, 0x0000004C); + break; } ca0132_alt_mic_boost_set(codec, spec->mic_boost_enum_val); break; @@ -4374,11 +4696,14 @@ static int ca0132_alt_select_in(struct hda_codec *codec) switch (spec->quirk) { case QUIRK_SBZ: case QUIRK_R3D: - ca0132_mmio_gpio_set(codec, 0, false); + ca0113_mmio_gpio_set(codec, 0, false); break; case QUIRK_R3DI: r3di_gpio_mic_set(codec, R3DI_REAR_MIC); break; + case QUIRK_AE5: + ca0113_mmio_command_set(codec, 0x48, 0x28, 0x00); + break; } chipio_set_conn_rate(codec, MEM_CONNID_MICIN1, SR_96_000); @@ -4389,11 +4714,13 @@ static int ca0132_alt_select_in(struct hda_codec *codec) tmp = FLOAT_ZERO; dspio_set_uint_param(codec, 0x80, 0x00, tmp); - if (spec->quirk == QUIRK_SBZ) { + switch (spec->quirk) { + case QUIRK_SBZ: + case QUIRK_AE5: chipio_write(codec, 0x18B098, 0x00000000); chipio_write(codec, 0x18B09C, 0x00000000); + break; } - chipio_set_stream_control(codec, 0x03, 1); chipio_set_stream_control(codec, 0x04, 1); break; @@ -4401,14 +4728,18 @@ static int ca0132_alt_select_in(struct hda_codec *codec) switch (spec->quirk) { case QUIRK_SBZ: case QUIRK_R3D: - ca0132_mmio_gpio_set(codec, 0, true); - ca0132_mmio_gpio_set(codec, 5, false); + ca0113_mmio_gpio_set(codec, 0, true); + ca0113_mmio_gpio_set(codec, 5, false); tmp = FLOAT_THREE; break; case QUIRK_R3DI: r3di_gpio_mic_set(codec, R3DI_FRONT_MIC); tmp = FLOAT_ONE; break; + case QUIRK_AE5: + ca0113_mmio_command_set(codec, 0x48, 0x28, 0x3f); + tmp = FLOAT_THREE; + break; default: tmp = FLOAT_ONE; break; @@ -4424,9 +4755,15 @@ static int ca0132_alt_select_in(struct hda_codec *codec) chipio_set_stream_control(codec, 0x03, 1); chipio_set_stream_control(codec, 0x04, 1); - if (spec->quirk == QUIRK_SBZ) { + switch (spec->quirk) { + case QUIRK_SBZ: chipio_write(codec, 0x18B098, 0x0000000C); chipio_write(codec, 0x18B09C, 0x000000CC); + break; + case QUIRK_AE5: + chipio_write(codec, 0x18B098, 0x0000000C); + chipio_write(codec, 0x18B09C, 0x0000004C); + break; } ca0132_alt_mic_boost_set(codec, spec->mic_boost_enum_val); break; @@ -4435,7 +4772,6 @@ static int ca0132_alt_select_in(struct hda_codec *codec) snd_hda_power_down_pm(codec); return 0; - } /* @@ -4507,6 +4843,8 @@ static int ca0132_effects_set(struct hda_codec *codec, hda_nid_t nid, long val) /* if PE if off, turn off out effects. */ if (!spec->effects_switch[PLAY_ENHANCEMENT - EFFECT_START_NID]) val = 0; + if (spec->cur_out_type == SURROUND_OUT && nid == X_BASS) + val = 0; } /* for in effect, qualify with CrystalVoice */ @@ -4520,7 +4858,7 @@ static int ca0132_effects_set(struct hda_codec *codec, hda_nid_t nid, long val) val = 0; /* If Voice Focus on SBZ, set to two channel. */ - if ((nid == VOICE_FOCUS) && (spec->quirk == QUIRK_SBZ) + if ((nid == VOICE_FOCUS) && (spec->use_pci_mmio) && (spec->cur_mic_type != REAR_LINE_IN)) { if (spec->effects_switch[CRYSTAL_VOICE - EFFECT_START_NID]) { @@ -4539,7 +4877,7 @@ static int ca0132_effects_set(struct hda_codec *codec, hda_nid_t nid, long val) * For SBZ noise reduction, there's an extra command * to module ID 0x47. No clue why. */ - if ((nid == NOISE_REDUCTION) && (spec->quirk == QUIRK_SBZ) + if ((nid == NOISE_REDUCTION) && (spec->use_pci_mmio) && (spec->cur_mic_type != REAR_LINE_IN)) { if (spec->effects_switch[CRYSTAL_VOICE - EFFECT_START_NID]) { @@ -4678,6 +5016,27 @@ static int ca0132_alt_mic_boost_set(struct hda_codec *codec, long val) return ret; } +static int ae5_headphone_gain_set(struct hda_codec *codec, long val) +{ + unsigned int i; + + for (i = 0; i < 4; i++) + ca0113_mmio_command_set(codec, 0x48, 0x11 + i, + ae5_headphone_gain_presets[val].vals[i]); + return 0; +} + +/* + * gpio pin 1 is a relay that switches on/off, apparently setting the headphone + * amplifier to handle a 600 ohm load. + */ +static int zxr_headphone_gain_set(struct hda_codec *codec, long val) +{ + ca0113_mmio_gpio_set(codec, 1, val); + + return 0; +} + static int ca0132_vnode_switch_set(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { @@ -4942,66 +5301,172 @@ static int ca0132_alt_mic_boost_put(struct snd_kcontrol *kcontrol, return 1; } - /* - * Input Select Control for alternative ca0132 codecs. This exists because - * front microphone has no auto-detect, and we need a way to set the rear - * as line-in + * Sound BlasterX AE-5 Headphone Gain Controls. */ -static int ca0132_alt_input_source_info(struct snd_kcontrol *kcontrol, +#define AE5_HEADPHONE_GAIN_MAX 3 +static int ae5_headphone_gain_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { + char *sfx = " Ohms)"; + char namestr[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; + uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED; uinfo->count = 1; - uinfo->value.enumerated.items = IN_SRC_NUM_OF_INPUTS; - if (uinfo->value.enumerated.item >= IN_SRC_NUM_OF_INPUTS) - uinfo->value.enumerated.item = IN_SRC_NUM_OF_INPUTS - 1; - strcpy(uinfo->value.enumerated.name, - in_src_str[uinfo->value.enumerated.item]); + uinfo->value.enumerated.items = AE5_HEADPHONE_GAIN_MAX; + if (uinfo->value.enumerated.item >= AE5_HEADPHONE_GAIN_MAX) + uinfo->value.enumerated.item = AE5_HEADPHONE_GAIN_MAX - 1; + sprintf(namestr, "%s %s", + ae5_headphone_gain_presets[uinfo->value.enumerated.item].name, + sfx); + strcpy(uinfo->value.enumerated.name, namestr); return 0; } -static int ca0132_alt_input_source_get(struct snd_kcontrol *kcontrol, +static int ae5_headphone_gain_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct hda_codec *codec = snd_kcontrol_chip(kcontrol); struct ca0132_spec *spec = codec->spec; - ucontrol->value.enumerated.item[0] = spec->in_enum_val; + ucontrol->value.enumerated.item[0] = spec->ae5_headphone_gain_val; return 0; } -static int ca0132_alt_input_source_put(struct snd_kcontrol *kcontrol, +static int ae5_headphone_gain_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct hda_codec *codec = snd_kcontrol_chip(kcontrol); struct ca0132_spec *spec = codec->spec; int sel = ucontrol->value.enumerated.item[0]; - unsigned int items = IN_SRC_NUM_OF_INPUTS; + unsigned int items = AE5_HEADPHONE_GAIN_MAX; if (sel >= items) return 0; - codec_dbg(codec, "ca0132_alt_input_select: sel=%d, preset=%s\n", - sel, in_src_str[sel]); + codec_dbg(codec, "ae5_headphone_gain: boost=%d\n", + sel); - spec->in_enum_val = sel; + spec->ae5_headphone_gain_val = sel; - ca0132_alt_select_in(codec); + if (spec->out_enum_val == HEADPHONE_OUT) + ae5_headphone_gain_set(codec, spec->ae5_headphone_gain_val); return 1; } -/* Sound Blaster Z Output Select Control */ -static int ca0132_alt_output_select_get_info(struct snd_kcontrol *kcontrol, +/* + * Sound BlasterX AE-5 sound filter enumerated control. + */ +#define AE5_SOUND_FILTER_MAX 3 + +static int ae5_sound_filter_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { + char namestr[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; + uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED; uinfo->count = 1; - uinfo->value.enumerated.items = NUM_OF_OUTPUTS; - if (uinfo->value.enumerated.item >= NUM_OF_OUTPUTS) - uinfo->value.enumerated.item = NUM_OF_OUTPUTS - 1; - strcpy(uinfo->value.enumerated.name, + uinfo->value.enumerated.items = AE5_SOUND_FILTER_MAX; + if (uinfo->value.enumerated.item >= AE5_SOUND_FILTER_MAX) + uinfo->value.enumerated.item = AE5_SOUND_FILTER_MAX - 1; + sprintf(namestr, "%s", + ae5_filter_presets[uinfo->value.enumerated.item].name); + strcpy(uinfo->value.enumerated.name, namestr); + return 0; +} + +static int ae5_sound_filter_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct hda_codec *codec = snd_kcontrol_chip(kcontrol); + struct ca0132_spec *spec = codec->spec; + + ucontrol->value.enumerated.item[0] = spec->ae5_filter_val; + return 0; +} + +static int ae5_sound_filter_put(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct hda_codec *codec = snd_kcontrol_chip(kcontrol); + struct ca0132_spec *spec = codec->spec; + int sel = ucontrol->value.enumerated.item[0]; + unsigned int items = AE5_SOUND_FILTER_MAX; + + if (sel >= items) + return 0; + + codec_dbg(codec, "ae5_sound_filter: %s\n", + ae5_filter_presets[sel].name); + + spec->ae5_filter_val = sel; + + ca0113_mmio_command_set_type2(codec, 0x48, 0x07, + ae5_filter_presets[sel].val); + + return 1; +} + +/* + * Input Select Control for alternative ca0132 codecs. This exists because + * front microphone has no auto-detect, and we need a way to set the rear + * as line-in + */ +static int ca0132_alt_input_source_info(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED; + uinfo->count = 1; + uinfo->value.enumerated.items = IN_SRC_NUM_OF_INPUTS; + if (uinfo->value.enumerated.item >= IN_SRC_NUM_OF_INPUTS) + uinfo->value.enumerated.item = IN_SRC_NUM_OF_INPUTS - 1; + strcpy(uinfo->value.enumerated.name, + in_src_str[uinfo->value.enumerated.item]); + return 0; +} + +static int ca0132_alt_input_source_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct hda_codec *codec = snd_kcontrol_chip(kcontrol); + struct ca0132_spec *spec = codec->spec; + + ucontrol->value.enumerated.item[0] = spec->in_enum_val; + return 0; +} + +static int ca0132_alt_input_source_put(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct hda_codec *codec = snd_kcontrol_chip(kcontrol); + struct ca0132_spec *spec = codec->spec; + int sel = ucontrol->value.enumerated.item[0]; + unsigned int items = IN_SRC_NUM_OF_INPUTS; + + if (sel >= items) + return 0; + + codec_dbg(codec, "ca0132_alt_input_select: sel=%d, preset=%s\n", + sel, in_src_str[sel]); + + spec->in_enum_val = sel; + + ca0132_alt_select_in(codec); + + return 1; +} + +/* Sound Blaster Z Output Select Control */ +static int ca0132_alt_output_select_get_info(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED; + uinfo->count = 1; + uinfo->value.enumerated.items = NUM_OF_OUTPUTS; + if (uinfo->value.enumerated.item >= NUM_OF_OUTPUTS) + uinfo->value.enumerated.item = NUM_OF_OUTPUTS - 1; + strcpy(uinfo->value.enumerated.name, alt_out_presets[uinfo->value.enumerated.item].name); return 0; } @@ -5330,6 +5795,16 @@ static int ca0132_switch_put(struct snd_kcontrol *kcontrol, goto exit; } + if (nid == ZXR_HEADPHONE_GAIN) { + spec->zxr_gain_set = *valp; + if (spec->cur_out_type == HEADPHONE_OUT) + changed = zxr_headphone_gain_set(codec, *valp); + else + changed = 0; + + goto exit; + } + exit: snd_hda_power_down(codec); return changed; @@ -5704,6 +6179,50 @@ static int ca0132_alt_add_mic_boost_enum(struct hda_codec *codec) } +/* + * Add headphone gain enumerated control for the AE-5. This switches between + * three modes, low, medium, and high. When non-headphone outputs are selected, + * it is automatically set to high. This is the same behavior as Windows. + */ +static int ae5_add_headphone_gain_enum(struct hda_codec *codec) +{ + struct snd_kcontrol_new knew = + HDA_CODEC_MUTE_MONO("AE-5: Headphone Gain", + AE5_HEADPHONE_GAIN_ENUM, 1, 0, HDA_OUTPUT); + knew.info = ae5_headphone_gain_info; + knew.get = ae5_headphone_gain_get; + knew.put = ae5_headphone_gain_put; + return snd_hda_ctl_add(codec, AE5_HEADPHONE_GAIN_ENUM, + snd_ctl_new1(&knew, codec)); +} + +/* + * Add sound filter enumerated control for the AE-5. This adds three different + * settings: Slow Roll Off, Minimum Phase, and Fast Roll Off. From what I've + * read into it, it changes the DAC's interpolation filter. + */ +static int ae5_add_sound_filter_enum(struct hda_codec *codec) +{ + struct snd_kcontrol_new knew = + HDA_CODEC_MUTE_MONO("AE-5: Sound Filter", + AE5_SOUND_FILTER_ENUM, 1, 0, HDA_OUTPUT); + knew.info = ae5_sound_filter_info; + knew.get = ae5_sound_filter_get; + knew.put = ae5_sound_filter_put; + return snd_hda_ctl_add(codec, AE5_SOUND_FILTER_ENUM, + snd_ctl_new1(&knew, codec)); +} + +static int zxr_add_headphone_gain_switch(struct hda_codec *codec) +{ + struct snd_kcontrol_new knew = + CA0132_CODEC_MUTE_MONO("ZxR: 600 Ohm Gain", + ZXR_HEADPHONE_GAIN, 1, HDA_OUTPUT); + + return snd_hda_ctl_add(codec, ZXR_HEADPHONE_GAIN, + snd_ctl_new1(&knew, codec)); +} + /* * Need to create slave controls for the alternate codecs that have surround * capabilities. @@ -5847,7 +6366,8 @@ static int ca0132_build_controls(struct hda_codec *codec) NULL, ca0132_alt_slave_pfxs, "Playback Switch", true, &spec->vmaster_mute.sw_kctl); - + if (err < 0) + return err; } /* Add in and out effects controls. @@ -5855,8 +6375,8 @@ static int ca0132_build_controls(struct hda_codec *codec) */ num_fx = OUT_EFFECTS_COUNT + IN_EFFECTS_COUNT; for (i = 0; i < num_fx; i++) { - /* SBZ and R3D break if Echo Cancellation is used. */ - if (spec->quirk == QUIRK_SBZ || spec->quirk == QUIRK_R3D) { + /* Desktop cards break if Echo Cancellation is used. */ + if (spec->use_pci_mmio) { if (i == (ECHO_CANCELLATION - IN_EFFECT_START_NID + OUT_EFFECTS_COUNT)) continue; @@ -5874,8 +6394,14 @@ static int ca0132_build_controls(struct hda_codec *codec) * prefix, and change PlayEnhancement and CrystalVoice to match. */ if (spec->use_alt_controls) { - ca0132_alt_add_svm_enum(codec); - add_ca0132_alt_eq_presets(codec); + err = ca0132_alt_add_svm_enum(codec); + if (err < 0) + return err; + + err = add_ca0132_alt_eq_presets(codec); + if (err < 0) + return err; + err = add_fx_switch(codec, PLAY_ENHANCEMENT, "Enable OutFX", 0); if (err < 0) @@ -5912,7 +6438,9 @@ static int ca0132_build_controls(struct hda_codec *codec) if (err < 0) return err; } - add_voicefx(codec); + err = add_voicefx(codec); + if (err < 0) + return err; /* * If the codec uses alt_functions, you need the enumerated controls @@ -5920,9 +6448,36 @@ static int ca0132_build_controls(struct hda_codec *codec) * setting control. */ if (spec->use_alt_functions) { - ca0132_alt_add_output_enum(codec); - ca0132_alt_add_input_enum(codec); - ca0132_alt_add_mic_boost_enum(codec); + err = ca0132_alt_add_output_enum(codec); + if (err < 0) + return err; + err = ca0132_alt_add_mic_boost_enum(codec); + if (err < 0) + return err; + /* + * ZxR only has microphone input, there is no front panel + * header on the card, and aux-in is handled by the DBPro board. + */ + if (spec->quirk != QUIRK_ZXR) { + err = ca0132_alt_add_input_enum(codec); + if (err < 0) + return err; + } + } + + if (spec->quirk == QUIRK_AE5) { + err = ae5_add_headphone_gain_enum(codec); + if (err < 0) + return err; + err = ae5_add_sound_filter_enum(codec); + if (err < 0) + return err; + } + + if (spec->quirk == QUIRK_ZXR) { + err = zxr_add_headphone_gain_switch(codec); + if (err < 0) + return err; } #ifdef ENABLE_TUNING_CONTROLS add_tuning_ctls(codec); @@ -5955,6 +6510,27 @@ static int ca0132_build_controls(struct hda_codec *codec) return 0; } +static int dbpro_build_controls(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + int err = 0; + + if (spec->dig_out) { + err = snd_hda_create_spdif_out_ctls(codec, spec->dig_out, + spec->dig_out); + if (err < 0) + return err; + } + + if (spec->dig_in) { + err = snd_hda_create_spdif_in_ctls(codec, spec->dig_in); + if (err < 0) + return err; + } + + return 0; +} + /* * PCM */ @@ -6058,6 +6634,40 @@ static int ca0132_build_pcms(struct hda_codec *codec) return 0; } +static int dbpro_build_pcms(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + struct hda_pcm *info; + + info = snd_hda_codec_pcm_new(codec, "CA0132 Alt Analog"); + if (!info) + return -ENOMEM; + info->stream[SNDRV_PCM_STREAM_CAPTURE] = ca0132_pcm_analog_capture; + info->stream[SNDRV_PCM_STREAM_CAPTURE].substreams = 1; + info->stream[SNDRV_PCM_STREAM_CAPTURE].nid = spec->adcs[0]; + + + if (!spec->dig_out && !spec->dig_in) + return 0; + + info = snd_hda_codec_pcm_new(codec, "CA0132 Digital"); + if (!info) + return -ENOMEM; + info->pcm_type = HDA_PCM_TYPE_SPDIF; + if (spec->dig_out) { + info->stream[SNDRV_PCM_STREAM_PLAYBACK] = + ca0132_pcm_digital_playback; + info->stream[SNDRV_PCM_STREAM_PLAYBACK].nid = spec->dig_out; + } + if (spec->dig_in) { + info->stream[SNDRV_PCM_STREAM_CAPTURE] = + ca0132_pcm_digital_capture; + info->stream[SNDRV_PCM_STREAM_CAPTURE].nid = spec->dig_in; + } + + return 0; +} + static void init_output(struct hda_codec *codec, hda_nid_t pin, hda_nid_t dac) { if (pin) { @@ -6238,69 +6848,48 @@ static void ca0132_refresh_widget_caps(struct hda_codec *codec) } /* - * Recon3D r3d_setup_defaults sub functions. + * Creates a dummy stream to bind the output to. This seems to have to be done + * after changing the main outputs source and destination streams. */ - -static void r3d_dsp_scp_startup(struct hda_codec *codec) +static void ca0132_alt_create_dummy_stream(struct hda_codec *codec) { - unsigned int tmp; - - tmp = 0x00000000; - dspio_set_uint_param_no_source(codec, 0x80, 0x0A, tmp); - - tmp = 0x00000001; - dspio_set_uint_param_no_source(codec, 0x80, 0x0B, tmp); - - tmp = 0x00000004; - dspio_set_uint_param_no_source(codec, 0x80, 0x0C, tmp); - - tmp = 0x00000005; - dspio_set_uint_param_no_source(codec, 0x80, 0x0C, tmp); - - tmp = 0x00000000; - dspio_set_uint_param_no_source(codec, 0x80, 0x0C, tmp); - -} + struct ca0132_spec *spec = codec->spec; + unsigned int stream_format; -static void r3d_dsp_initial_mic_setup(struct hda_codec *codec) -{ - unsigned int tmp; + stream_format = snd_hdac_calc_stream_format(48000, 2, + SNDRV_PCM_FORMAT_S32_LE, 32, 0); - /* Mic 1 Setup */ - chipio_set_conn_rate(codec, MEM_CONNID_MICIN1, SR_96_000); - chipio_set_conn_rate(codec, MEM_CONNID_MICOUT1, SR_96_000); - /* This ConnPointID is unique to Recon3Di. Haven't seen it elsewhere */ - chipio_set_conn_rate(codec, 0x0F, SR_96_000); - tmp = FLOAT_ONE; - dspio_set_uint_param(codec, 0x80, 0x00, tmp); + snd_hda_codec_setup_stream(codec, spec->dacs[0], spec->dsp_stream_id, + 0, stream_format); - /* Mic 2 Setup, even though it isn't connected on SBZ */ - chipio_set_conn_rate(codec, MEM_CONNID_MICIN2, SR_96_000); - chipio_set_conn_rate(codec, MEM_CONNID_MICOUT2, SR_96_000); - chipio_set_conn_rate(codec, 0x0F, SR_96_000); - tmp = FLOAT_ZERO; - dspio_set_uint_param(codec, 0x80, 0x01, tmp); + snd_hda_codec_cleanup_stream(codec, spec->dacs[0]); } /* - * Initialize Sound Blaster Z analog microphones. + * Initialize mic for non-chromebook ca0132 implementations. */ -static void sbz_init_analog_mics(struct hda_codec *codec) +static void ca0132_alt_init_analog_mics(struct hda_codec *codec) { + struct ca0132_spec *spec = codec->spec; unsigned int tmp; /* Mic 1 Setup */ chipio_set_conn_rate(codec, MEM_CONNID_MICIN1, SR_96_000); chipio_set_conn_rate(codec, MEM_CONNID_MICOUT1, SR_96_000); - tmp = FLOAT_THREE; + if (spec->quirk == QUIRK_R3DI) { + chipio_set_conn_rate(codec, 0x0F, SR_96_000); + tmp = FLOAT_ONE; + } else + tmp = FLOAT_THREE; dspio_set_uint_param(codec, 0x80, 0x00, tmp); - /* Mic 2 Setup, even though it isn't connected on SBZ */ + /* Mic 2 setup (not present on desktop cards) */ chipio_set_conn_rate(codec, MEM_CONNID_MICIN2, SR_96_000); chipio_set_conn_rate(codec, MEM_CONNID_MICOUT2, SR_96_000); + if (spec->quirk == QUIRK_R3DI) + chipio_set_conn_rate(codec, 0x0F, SR_96_000); tmp = FLOAT_ZERO; dspio_set_uint_param(codec, 0x80, 0x01, tmp); - } /* @@ -6333,7 +6922,6 @@ static void sbz_connect_streams(struct hda_codec *codec) codec_dbg(codec, "Connect Streams exited, mutex released.\n"); mutex_unlock(&spec->chipio_mutex); - } /* @@ -6360,19 +6948,29 @@ static void sbz_chipio_startup_data(struct hda_codec *codec) chipio_set_stream_channels(codec, 0x0C, 6); chipio_set_stream_control(codec, 0x0C, 1); /* No clue what these control */ - chipio_write_no_mutex(codec, 0x190030, 0x0001e0c0); - chipio_write_no_mutex(codec, 0x190034, 0x0001e1c1); - chipio_write_no_mutex(codec, 0x190038, 0x0001e4c2); - chipio_write_no_mutex(codec, 0x19003c, 0x0001e5c3); - chipio_write_no_mutex(codec, 0x190040, 0x0001e2c4); - chipio_write_no_mutex(codec, 0x190044, 0x0001e3c5); - chipio_write_no_mutex(codec, 0x190048, 0x0001e8c6); - chipio_write_no_mutex(codec, 0x19004c, 0x0001e9c7); - chipio_write_no_mutex(codec, 0x190050, 0x0001ecc8); - chipio_write_no_mutex(codec, 0x190054, 0x0001edc9); - chipio_write_no_mutex(codec, 0x190058, 0x0001eaca); - chipio_write_no_mutex(codec, 0x19005c, 0x0001ebcb); - + if (spec->quirk == QUIRK_SBZ) { + chipio_write_no_mutex(codec, 0x190030, 0x0001e0c0); + chipio_write_no_mutex(codec, 0x190034, 0x0001e1c1); + chipio_write_no_mutex(codec, 0x190038, 0x0001e4c2); + chipio_write_no_mutex(codec, 0x19003c, 0x0001e5c3); + chipio_write_no_mutex(codec, 0x190040, 0x0001e2c4); + chipio_write_no_mutex(codec, 0x190044, 0x0001e3c5); + chipio_write_no_mutex(codec, 0x190048, 0x0001e8c6); + chipio_write_no_mutex(codec, 0x19004c, 0x0001e9c7); + chipio_write_no_mutex(codec, 0x190050, 0x0001ecc8); + chipio_write_no_mutex(codec, 0x190054, 0x0001edc9); + chipio_write_no_mutex(codec, 0x190058, 0x0001eaca); + chipio_write_no_mutex(codec, 0x19005c, 0x0001ebcb); + } else if (spec->quirk == QUIRK_ZXR) { + chipio_write_no_mutex(codec, 0x190038, 0x000140c2); + chipio_write_no_mutex(codec, 0x19003c, 0x000141c3); + chipio_write_no_mutex(codec, 0x190040, 0x000150c4); + chipio_write_no_mutex(codec, 0x190044, 0x000151c5); + chipio_write_no_mutex(codec, 0x190050, 0x000142c8); + chipio_write_no_mutex(codec, 0x190054, 0x000143c9); + chipio_write_no_mutex(codec, 0x190058, 0x000152ca); + chipio_write_no_mutex(codec, 0x19005c, 0x000153cb); + } chipio_write_no_mutex(codec, 0x19042c, 0x00000001); codec_dbg(codec, "Startup Data exited, mutex released.\n"); @@ -6380,35 +6978,56 @@ static void sbz_chipio_startup_data(struct hda_codec *codec) } /* - * Sound Blaster Z uses these after DSP is loaded. Weird SCP commands - * without a 0x20 source like normal. + * Custom DSP SCP commands where the src value is 0x00 instead of 0x20. This is + * done after the DSP is loaded. */ -static void sbz_dsp_scp_startup(struct hda_codec *codec) +static void ca0132_alt_dsp_scp_startup(struct hda_codec *codec) { - unsigned int tmp; - - tmp = 0x00000003; - dspio_set_uint_param_no_source(codec, 0x80, 0x0C, tmp); - - tmp = 0x00000000; - dspio_set_uint_param_no_source(codec, 0x80, 0x0A, tmp); - - tmp = 0x00000001; - dspio_set_uint_param_no_source(codec, 0x80, 0x0B, tmp); - - tmp = 0x00000004; - dspio_set_uint_param_no_source(codec, 0x80, 0x0C, tmp); - - tmp = 0x00000005; - dspio_set_uint_param_no_source(codec, 0x80, 0x0C, tmp); - - tmp = 0x00000000; - dspio_set_uint_param_no_source(codec, 0x80, 0x0C, tmp); + struct ca0132_spec *spec = codec->spec; + unsigned int tmp, i; + /* + * Gotta run these twice, or else mic works inconsistently. Not clear + * why this is, but multiple tests have confirmed it. + */ + for (i = 0; i < 2; i++) { + switch (spec->quirk) { + case QUIRK_SBZ: + case QUIRK_AE5: + tmp = 0x00000003; + dspio_set_uint_param_no_source(codec, 0x80, 0x0C, tmp); + tmp = 0x00000000; + dspio_set_uint_param_no_source(codec, 0x80, 0x0A, tmp); + tmp = 0x00000001; + dspio_set_uint_param_no_source(codec, 0x80, 0x0B, tmp); + tmp = 0x00000004; + dspio_set_uint_param_no_source(codec, 0x80, 0x0C, tmp); + tmp = 0x00000005; + dspio_set_uint_param_no_source(codec, 0x80, 0x0C, tmp); + tmp = 0x00000000; + dspio_set_uint_param_no_source(codec, 0x80, 0x0C, tmp); + break; + case QUIRK_R3D: + case QUIRK_R3DI: + tmp = 0x00000000; + dspio_set_uint_param_no_source(codec, 0x80, 0x0A, tmp); + tmp = 0x00000001; + dspio_set_uint_param_no_source(codec, 0x80, 0x0B, tmp); + tmp = 0x00000004; + dspio_set_uint_param_no_source(codec, 0x80, 0x0C, tmp); + tmp = 0x00000005; + dspio_set_uint_param_no_source(codec, 0x80, 0x0C, tmp); + tmp = 0x00000000; + dspio_set_uint_param_no_source(codec, 0x80, 0x0C, tmp); + break; + } + msleep(100); + } } -static void sbz_dsp_initial_mic_setup(struct hda_codec *codec) +static void ca0132_alt_dsp_initial_mic_setup(struct hda_codec *codec) { + struct ca0132_spec *spec = codec->spec; unsigned int tmp; chipio_set_stream_control(codec, 0x03, 0); @@ -6423,17 +7042,170 @@ static void sbz_dsp_initial_mic_setup(struct hda_codec *codec) chipio_set_stream_control(codec, 0x03, 1); chipio_set_stream_control(codec, 0x04, 1); - chipio_write(codec, 0x18b098, 0x0000000c); - chipio_write(codec, 0x18b09C, 0x0000000c); + switch (spec->quirk) { + case QUIRK_SBZ: + chipio_write(codec, 0x18b098, 0x0000000c); + chipio_write(codec, 0x18b09C, 0x0000000c); + break; + case QUIRK_AE5: + chipio_write(codec, 0x18b098, 0x0000000c); + chipio_write(codec, 0x18b09c, 0x0000004c); + break; + } } -/* - * Setup default parameters for DSP - */ -static void ca0132_setup_defaults(struct hda_codec *codec) +static void ae5_post_dsp_register_set(struct hda_codec *codec) { struct ca0132_spec *spec = codec->spec; - unsigned int tmp; + + chipio_8051_write_direct(codec, 0x93, 0x10); + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_8051_ADDRESS_LOW, 0x44); + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_PLL_PMU_WRITE, 0xc2); + + writeb(0xff, spec->mem_base + 0x304); + writeb(0xff, spec->mem_base + 0x304); + writeb(0xff, spec->mem_base + 0x304); + writeb(0xff, spec->mem_base + 0x304); + writeb(0x00, spec->mem_base + 0x100); + writeb(0xff, spec->mem_base + 0x304); + writeb(0x00, spec->mem_base + 0x100); + writeb(0xff, spec->mem_base + 0x304); + writeb(0x00, spec->mem_base + 0x100); + writeb(0xff, spec->mem_base + 0x304); + writeb(0x00, spec->mem_base + 0x100); + writeb(0xff, spec->mem_base + 0x304); + + ca0113_mmio_command_set(codec, 0x30, 0x2b, 0x3f); + ca0113_mmio_command_set(codec, 0x30, 0x2d, 0x3f); + ca0113_mmio_command_set(codec, 0x48, 0x07, 0x83); +} + +static void ae5_post_dsp_param_setup(struct hda_codec *codec) +{ + /* + * Param3 in the 8051's memory is represented by the ascii string 'mch' + * which seems to be 'multichannel'. This is also mentioned in the + * AE-5's registry values in Windows. + */ + chipio_set_control_param(codec, 3, 0); + /* + * I believe ASI is 'audio serial interface' and that it's used to + * change colors on the external LED strip connected to the AE-5. + */ + chipio_set_control_flag(codec, CONTROL_FLAG_ASI_96KHZ, 1); + + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, 0x724, 0x83); + chipio_set_control_param(codec, CONTROL_PARAM_ASI, 0); + + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_8051_ADDRESS_LOW, 0x92); + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_8051_ADDRESS_HIGH, 0xfa); + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_8051_DATA_WRITE, 0x22); +} + +static void ae5_post_dsp_pll_setup(struct hda_codec *codec) +{ + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_8051_ADDRESS_LOW, 0x41); + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_PLL_PMU_WRITE, 0xc8); + + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_8051_ADDRESS_LOW, 0x45); + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_PLL_PMU_WRITE, 0xcc); + + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_8051_ADDRESS_LOW, 0x40); + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_PLL_PMU_WRITE, 0xcb); + + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_8051_ADDRESS_LOW, 0x43); + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_PLL_PMU_WRITE, 0xc7); + + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_8051_ADDRESS_LOW, 0x51); + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_PLL_PMU_WRITE, 0x8d); +} + +static void ae5_post_dsp_stream_setup(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + + mutex_lock(&spec->chipio_mutex); + + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, 0x725, 0x81); + + chipio_set_conn_rate_no_mutex(codec, 0x70, SR_96_000); + + chipio_set_stream_channels(codec, 0x0C, 6); + chipio_set_stream_control(codec, 0x0C, 1); + + chipio_set_stream_source_dest(codec, 0x5, 0x43, 0x0); + + chipio_set_stream_source_dest(codec, 0x18, 0x9, 0xd0); + chipio_set_conn_rate_no_mutex(codec, 0xd0, SR_96_000); + chipio_set_stream_channels(codec, 0x18, 6); + chipio_set_stream_control(codec, 0x18, 1); + + chipio_set_control_param_no_mutex(codec, CONTROL_PARAM_ASI, 4); + + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_8051_ADDRESS_LOW, 0x43); + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_PLL_PMU_WRITE, 0xc7); + + ca0113_mmio_command_set(codec, 0x48, 0x01, 0x80); + + mutex_unlock(&spec->chipio_mutex); +} + +static void ae5_post_dsp_startup_data(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + + mutex_lock(&spec->chipio_mutex); + + chipio_write_no_mutex(codec, 0x189000, 0x0001f101); + chipio_write_no_mutex(codec, 0x189004, 0x0001f101); + chipio_write_no_mutex(codec, 0x189024, 0x00014004); + chipio_write_no_mutex(codec, 0x189028, 0x0002000f); + + ca0113_mmio_command_set(codec, 0x48, 0x0a, 0x05); + chipio_set_control_param_no_mutex(codec, CONTROL_PARAM_ASI, 7); + ca0113_mmio_command_set(codec, 0x48, 0x0b, 0x12); + ca0113_mmio_command_set(codec, 0x48, 0x04, 0x00); + ca0113_mmio_command_set(codec, 0x48, 0x06, 0x48); + ca0113_mmio_command_set(codec, 0x48, 0x0a, 0x05); + ca0113_mmio_command_set(codec, 0x48, 0x07, 0x83); + ca0113_mmio_command_set(codec, 0x48, 0x0f, 0x00); + ca0113_mmio_command_set(codec, 0x48, 0x10, 0x00); + ca0113_mmio_gpio_set(codec, 0, true); + ca0113_mmio_gpio_set(codec, 1, true); + ca0113_mmio_command_set(codec, 0x48, 0x07, 0x80); + + chipio_write_no_mutex(codec, 0x18b03c, 0x00000012); + + ca0113_mmio_command_set(codec, 0x48, 0x0f, 0x00); + ca0113_mmio_command_set(codec, 0x48, 0x10, 0x00); + + mutex_unlock(&spec->chipio_mutex); +} + +/* + * Setup default parameters for DSP + */ +static void ca0132_setup_defaults(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + unsigned int tmp; int num_fx; int idx, i; @@ -6485,9 +7257,8 @@ static void r3d_setup_defaults(struct hda_codec *codec) if (spec->dsp_state != DSP_DOWNLOADED) return; - r3d_dsp_scp_startup(codec); - - r3d_dsp_initial_mic_setup(codec); + ca0132_alt_dsp_scp_startup(codec); + ca0132_alt_init_analog_mics(codec); /*remove DSP headroom*/ tmp = FLOAT_ZERO; @@ -6523,19 +7294,16 @@ static void r3d_setup_defaults(struct hda_codec *codec) static void sbz_setup_defaults(struct hda_codec *codec) { struct ca0132_spec *spec = codec->spec; - unsigned int tmp, stream_format; + unsigned int tmp; int num_fx; int idx, i; if (spec->dsp_state != DSP_DOWNLOADED) return; - sbz_dsp_scp_startup(codec); - - sbz_init_analog_mics(codec); - + ca0132_alt_dsp_scp_startup(codec); + ca0132_alt_init_analog_mics(codec); sbz_connect_streams(codec); - sbz_chipio_startup_data(codec); chipio_set_stream_control(codec, 0x03, 1); @@ -6561,8 +7329,7 @@ static void sbz_setup_defaults(struct hda_codec *codec) /* Set speaker source? */ dspio_set_uint_param(codec, 0x32, 0x00, tmp); - sbz_dsp_initial_mic_setup(codec); - + ca0132_alt_dsp_initial_mic_setup(codec); /* out, in effects + voicefx */ num_fx = OUT_EFFECTS_COUNT + IN_EFFECTS_COUNT + 1; @@ -6575,23 +7342,74 @@ static void sbz_setup_defaults(struct hda_codec *codec) } } - /* - * Have to make a stream to bind the sound output to, otherwise - * you'll get dead audio. Before I did this, it would bind to an - * audio input, and would never work - */ - stream_format = snd_hdac_calc_stream_format(48000, 2, - SNDRV_PCM_FORMAT_S32_LE, 32, 0); + ca0132_alt_create_dummy_stream(codec); +} - snd_hda_codec_setup_stream(codec, spec->dacs[0], spec->dsp_stream_id, - 0, stream_format); +/* + * Setup default parameters for the Sound BlasterX AE-5 DSP. + */ +static void ae5_setup_defaults(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + unsigned int tmp; + int num_fx; + int idx, i; - snd_hda_codec_cleanup_stream(codec, spec->dacs[0]); + if (spec->dsp_state != DSP_DOWNLOADED) + return; - snd_hda_codec_setup_stream(codec, spec->dacs[0], spec->dsp_stream_id, - 0, stream_format); + ca0132_alt_dsp_scp_startup(codec); + ca0132_alt_init_analog_mics(codec); + chipio_set_stream_control(codec, 0x03, 1); + chipio_set_stream_control(codec, 0x04, 1); - snd_hda_codec_cleanup_stream(codec, spec->dacs[0]); + /* New, unknown SCP req's */ + tmp = FLOAT_ZERO; + dspio_set_uint_param(codec, 0x96, 0x29, tmp); + dspio_set_uint_param(codec, 0x96, 0x2a, tmp); + dspio_set_uint_param(codec, 0x80, 0x0d, tmp); + dspio_set_uint_param(codec, 0x80, 0x0e, tmp); + + ca0113_mmio_command_set(codec, 0x30, 0x2e, 0x3f); + ca0113_mmio_gpio_set(codec, 0, false); + ca0113_mmio_command_set(codec, 0x30, 0x28, 0x00); + + /* Internal loopback off */ + tmp = FLOAT_ONE; + dspio_set_uint_param(codec, 0x37, 0x08, tmp); + dspio_set_uint_param(codec, 0x37, 0x10, tmp); + + /*remove DSP headroom*/ + tmp = FLOAT_ZERO; + dspio_set_uint_param(codec, 0x96, 0x3C, tmp); + + /* set WUH source */ + tmp = FLOAT_TWO; + dspio_set_uint_param(codec, 0x31, 0x00, tmp); + chipio_set_conn_rate(codec, MEM_CONNID_WUH, SR_48_000); + + /* Set speaker source? */ + dspio_set_uint_param(codec, 0x32, 0x00, tmp); + + ca0132_alt_dsp_initial_mic_setup(codec); + ae5_post_dsp_register_set(codec); + ae5_post_dsp_param_setup(codec); + ae5_post_dsp_pll_setup(codec); + ae5_post_dsp_stream_setup(codec); + ae5_post_dsp_startup_data(codec); + + /* out, in effects + voicefx */ + num_fx = OUT_EFFECTS_COUNT + IN_EFFECTS_COUNT + 1; + for (idx = 0; idx < num_fx; idx++) { + for (i = 0; i <= ca0132_effects[idx].params; i++) { + dspio_set_uint_param(codec, + ca0132_effects[idx].mid, + ca0132_effects[idx].reqs[i], + ca0132_effects[idx].def_vals[i]); + } + } + + ca0132_alt_create_dummy_stream(codec); } /* @@ -6673,12 +7491,14 @@ static bool ca0132_download_dsp_images(struct hda_codec *codec) */ switch (spec->quirk) { case QUIRK_SBZ: - if (request_firmware(&fw_entry, SBZ_EFX_FILE, + case QUIRK_R3D: + case QUIRK_AE5: + if (request_firmware(&fw_entry, DESKTOP_EFX_FILE, codec->card->dev) != 0) { - codec_dbg(codec, "SBZ alt firmware not detected. "); + codec_dbg(codec, "Desktop firmware not found."); spec->alt_firmware_present = false; } else { - codec_dbg(codec, "Sound Blaster Z firmware selected."); + codec_dbg(codec, "Desktop firmware selected."); spec->alt_firmware_present = true; } break; @@ -6921,6 +7741,14 @@ static void ca0132_init_chip(struct hda_codec *codec) spec->effects_switch[PLAY_ENHANCEMENT - EFFECT_START_NID] = 1; spec->effects_switch[CRYSTAL_VOICE - EFFECT_START_NID] = 0; + /* + * The ZxR doesn't have a front panel header, and it's line-in is on + * the daughter board. So, there is no input enum control, and we need + * to make sure that spec->in_enum_val is set properly. + */ + if (spec->quirk == QUIRK_ZXR) + spec->in_enum_val = REAR_MIC; + #ifdef ENABLE_TUNING_CONTROLS ca0132_init_tuning_defaults(codec); #endif @@ -6948,11 +7776,11 @@ static void sbz_region2_exit(struct hda_codec *codec) for (i = 0; i < 8; i++) writeb(0xb3, spec->mem_base + 0x304); - ca0132_mmio_gpio_set(codec, 0, false); - ca0132_mmio_gpio_set(codec, 1, false); - ca0132_mmio_gpio_set(codec, 4, true); - ca0132_mmio_gpio_set(codec, 5, false); - ca0132_mmio_gpio_set(codec, 7, false); + ca0113_mmio_gpio_set(codec, 0, false); + ca0113_mmio_gpio_set(codec, 1, false); + ca0113_mmio_gpio_set(codec, 4, true); + ca0113_mmio_gpio_set(codec, 5, false); + ca0113_mmio_gpio_set(codec, 7, false); } static void sbz_set_pin_ctl_default(struct hda_codec *codec) @@ -6995,6 +7823,16 @@ static void sbz_gpio_shutdown_commands(struct hda_codec *codec, int dir, AC_VERB_SET_GPIO_DATA, data); } +static void zxr_dbpro_power_state_shutdown(struct hda_codec *codec) +{ + hda_nid_t pins[7] = {0x05, 0x0c, 0x09, 0x0e, 0x08, 0x11, 0x01}; + unsigned int i; + + for (i = 0; i < 7; i++) + snd_hda_codec_write(codec, pins[i], 0, + AC_VERB_SET_POWER_STATE, 0x03); +} + static void sbz_exit_chip(struct hda_codec *codec) { chipio_set_stream_control(codec, 0x03, 0); @@ -7037,6 +7875,61 @@ static void r3d_exit_chip(struct hda_codec *codec) snd_hda_codec_write(codec, 0x01, 0, 0x794, 0x5b); } +static void ae5_exit_chip(struct hda_codec *codec) +{ + chipio_set_stream_control(codec, 0x03, 0); + chipio_set_stream_control(codec, 0x04, 0); + + ca0113_mmio_command_set(codec, 0x30, 0x32, 0x3f); + ca0113_mmio_command_set(codec, 0x48, 0x07, 0x83); + ca0113_mmio_command_set(codec, 0x48, 0x07, 0x83); + ca0113_mmio_command_set(codec, 0x30, 0x30, 0x00); + ca0113_mmio_command_set(codec, 0x30, 0x2b, 0x00); + ca0113_mmio_command_set(codec, 0x30, 0x2d, 0x00); + ca0113_mmio_gpio_set(codec, 0, false); + ca0113_mmio_gpio_set(codec, 1, false); + + snd_hda_codec_write(codec, 0x01, 0, 0x793, 0x00); + snd_hda_codec_write(codec, 0x01, 0, 0x794, 0x53); + + chipio_set_control_param(codec, CONTROL_PARAM_ASI, 0); + + chipio_set_stream_control(codec, 0x18, 0); + chipio_set_stream_control(codec, 0x0c, 0); + + snd_hda_codec_write(codec, 0x01, 0, 0x724, 0x83); +} + +static void zxr_exit_chip(struct hda_codec *codec) +{ + chipio_set_stream_control(codec, 0x03, 0); + chipio_set_stream_control(codec, 0x04, 0); + chipio_set_stream_control(codec, 0x14, 0); + chipio_set_stream_control(codec, 0x0C, 0); + + chipio_set_conn_rate(codec, 0x41, SR_192_000); + chipio_set_conn_rate(codec, 0x91, SR_192_000); + + chipio_write(codec, 0x18a020, 0x00000083); + + snd_hda_codec_write(codec, 0x01, 0, 0x793, 0x00); + snd_hda_codec_write(codec, 0x01, 0, 0x794, 0x53); + + ca0132_clear_unsolicited(codec); + sbz_set_pin_ctl_default(codec); + snd_hda_codec_write(codec, 0x0B, 0, AC_VERB_SET_EAPD_BTLENABLE, 0x00); + + ca0113_mmio_gpio_set(codec, 5, false); + ca0113_mmio_gpio_set(codec, 2, false); + ca0113_mmio_gpio_set(codec, 3, false); + ca0113_mmio_gpio_set(codec, 0, false); + ca0113_mmio_gpio_set(codec, 4, true); + ca0113_mmio_gpio_set(codec, 0, true); + ca0113_mmio_gpio_set(codec, 5, true); + ca0113_mmio_gpio_set(codec, 2, false); + ca0113_mmio_gpio_set(codec, 3, false); +} + static void ca0132_exit_chip(struct hda_codec *codec) { /* put any chip cleanup stuffs here. */ @@ -7140,11 +8033,6 @@ static void sbz_pre_dsp_setup(struct hda_codec *codec) writel(0x00820680, spec->mem_base + 0x01C); writel(0x00820680, spec->mem_base + 0x01C); - snd_hda_codec_write(codec, 0x15, 0, 0xd00, 0xfc); - snd_hda_codec_write(codec, 0x15, 0, 0xd00, 0xfd); - snd_hda_codec_write(codec, 0x15, 0, 0xd00, 0xfe); - snd_hda_codec_write(codec, 0x15, 0, 0xd00, 0xff); - chipio_write(codec, 0x18b0a4, 0x000000c2); snd_hda_codec_write(codec, 0x11, 0, @@ -7153,12 +8041,6 @@ static void sbz_pre_dsp_setup(struct hda_codec *codec) static void r3d_pre_dsp_setup(struct hda_codec *codec) { - - snd_hda_codec_write(codec, 0x15, 0, 0xd00, 0xfc); - snd_hda_codec_write(codec, 0x15, 0, 0xd00, 0xfd); - snd_hda_codec_write(codec, 0x15, 0, 0xd00, 0xfe); - snd_hda_codec_write(codec, 0x15, 0, 0xd00, 0xff); - chipio_write(codec, 0x18b0a4, 0x000000c2); snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, @@ -7205,23 +8087,116 @@ static void ca0132_mmio_init(struct hda_codec *codec) { struct ca0132_spec *spec = codec->spec; - writel(0x00000000, spec->mem_base + 0x400); - writel(0x00000000, spec->mem_base + 0x408); - writel(0x00000000, spec->mem_base + 0x40C); - writel(0x00880680, spec->mem_base + 0x01C); - writel(0x00000083, spec->mem_base + 0xC0C); + if (spec->quirk == QUIRK_AE5) + writel(0x00000001, spec->mem_base + 0x400); + else + writel(0x00000000, spec->mem_base + 0x400); + + if (spec->quirk == QUIRK_AE5) + writel(0x00000001, spec->mem_base + 0x408); + else + writel(0x00000000, spec->mem_base + 0x408); + + if (spec->quirk == QUIRK_AE5) + writel(0x00000001, spec->mem_base + 0x40c); + else + writel(0x00000000, spec->mem_base + 0x40C); + + if (spec->quirk == QUIRK_ZXR) + writel(0x00880640, spec->mem_base + 0x01C); + else + writel(0x00880680, spec->mem_base + 0x01C); + + if (spec->quirk == QUIRK_AE5) + writel(0x00000080, spec->mem_base + 0xC0C); + else + writel(0x00000083, spec->mem_base + 0xC0C); + writel(0x00000030, spec->mem_base + 0xC00); writel(0x00000000, spec->mem_base + 0xC04); + + if (spec->quirk == QUIRK_AE5) + writel(0x00000000, spec->mem_base + 0xC0C); + else + writel(0x00000003, spec->mem_base + 0xC0C); + writel(0x00000003, spec->mem_base + 0xC0C); writel(0x00000003, spec->mem_base + 0xC0C); writel(0x00000003, spec->mem_base + 0xC0C); - writel(0x00000003, spec->mem_base + 0xC0C); - writel(0x000000C1, spec->mem_base + 0xC08); + + if (spec->quirk == QUIRK_AE5) + writel(0x00000001, spec->mem_base + 0xC08); + else + writel(0x000000C1, spec->mem_base + 0xC08); + writel(0x000000F1, spec->mem_base + 0xC08); writel(0x00000001, spec->mem_base + 0xC08); writel(0x000000C7, spec->mem_base + 0xC08); writel(0x000000C1, spec->mem_base + 0xC08); writel(0x00000080, spec->mem_base + 0xC04); + + if (spec->quirk == QUIRK_AE5) { + writel(0x00000000, spec->mem_base + 0x42c); + writel(0x00000000, spec->mem_base + 0x46c); + writel(0x00000000, spec->mem_base + 0x4ac); + writel(0x00000000, spec->mem_base + 0x4ec); + writel(0x00000000, spec->mem_base + 0x43c); + writel(0x00000000, spec->mem_base + 0x47c); + writel(0x00000000, spec->mem_base + 0x4bc); + writel(0x00000000, spec->mem_base + 0x4fc); + writel(0x00000600, spec->mem_base + 0x100); + writel(0x00000014, spec->mem_base + 0x410); + writel(0x0000060f, spec->mem_base + 0x100); + writel(0x0000070f, spec->mem_base + 0x100); + writel(0x00000aff, spec->mem_base + 0x830); + writel(0x00000000, spec->mem_base + 0x86c); + writel(0x0000006b, spec->mem_base + 0x800); + writel(0x00000001, spec->mem_base + 0x86c); + writel(0x0000006b, spec->mem_base + 0x800); + writel(0x00000057, spec->mem_base + 0x804); + writel(0x00800000, spec->mem_base + 0x20c); + } +} + +/* + * This function writes to some SFR's, does some region2 writes, and then + * eventually resets the codec with the 0x7ff verb. Not quite sure why it does + * what it does. + */ +static void ae5_register_set(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + + chipio_8051_write_direct(codec, 0x93, 0x10); + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_8051_ADDRESS_LOW, 0x44); + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_PLL_PMU_WRITE, 0xc2); + + writeb(0x0f, spec->mem_base + 0x304); + writeb(0x0f, spec->mem_base + 0x304); + writeb(0x0f, spec->mem_base + 0x304); + writeb(0x0f, spec->mem_base + 0x304); + writeb(0x0e, spec->mem_base + 0x100); + writeb(0x1f, spec->mem_base + 0x304); + writeb(0x0c, spec->mem_base + 0x100); + writeb(0x3f, spec->mem_base + 0x304); + writeb(0x08, spec->mem_base + 0x100); + writeb(0x7f, spec->mem_base + 0x304); + writeb(0x00, spec->mem_base + 0x100); + writeb(0xff, spec->mem_base + 0x304); + + ca0113_mmio_command_set(codec, 0x30, 0x2d, 0x3f); + + chipio_8051_write_direct(codec, 0x90, 0x00); + chipio_8051_write_direct(codec, 0x90, 0x10); + + ca0113_mmio_command_set(codec, 0x48, 0x07, 0x83); + + chipio_write(codec, 0x18b0a4, 0x000000c2); + + snd_hda_codec_write(codec, 0x01, 0, 0x7ff, 0x00); + snd_hda_codec_write(codec, 0x01, 0, 0x7ff, 0x00); } /* @@ -7257,6 +8232,21 @@ static void ca0132_alt_init(struct hda_codec *codec) snd_hda_sequence_write(codec, spec->chip_init_verbs); snd_hda_sequence_write(codec, spec->desktop_init_verbs); break; + case QUIRK_AE5: + ca0132_gpio_init(codec); + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_8051_ADDRESS_LOW, 0x49); + snd_hda_codec_write(codec, WIDGET_CHIP_CTRL, 0, + VENDOR_CHIPIO_PLL_PMU_WRITE, 0x88); + chipio_write(codec, 0x18b030, 0x00000020); + snd_hda_sequence_write(codec, spec->chip_init_verbs); + snd_hda_sequence_write(codec, spec->desktop_init_verbs); + ca0113_mmio_command_set(codec, 0x30, 0x32, 0x3f); + break; + case QUIRK_ZXR: + snd_hda_sequence_write(codec, spec->chip_init_verbs); + snd_hda_sequence_write(codec, spec->desktop_init_verbs); + break; } } @@ -7298,6 +8288,9 @@ static int ca0132_init(struct hda_codec *codec) snd_hda_power_up_pm(codec); + if (spec->quirk == QUIRK_AE5) + ae5_register_set(codec); + ca0132_init_unsol(codec); ca0132_init_params(codec); ca0132_init_flags(codec); @@ -7317,8 +8310,12 @@ static int ca0132_init(struct hda_codec *codec) r3d_setup_defaults(codec); break; case QUIRK_SBZ: + case QUIRK_ZXR: sbz_setup_defaults(codec); break; + case QUIRK_AE5: + ae5_setup_defaults(codec); + break; default: ca0132_setup_defaults(codec); ca0132_init_analog_mic2(codec); @@ -7372,6 +8369,21 @@ static int ca0132_init(struct hda_codec *codec) return 0; } +static int dbpro_init(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + struct auto_pin_cfg *cfg = &spec->autocfg; + unsigned int i; + + init_output(codec, cfg->dig_out_pins[0], spec->dig_out); + init_input(codec, cfg->dig_in_pin, spec->dig_in); + + for (i = 0; i < spec->num_inputs; i++) + init_input(codec, spec->input_pins[i], spec->adcs[i]); + + return 0; +} + static void ca0132_free(struct hda_codec *codec) { struct ca0132_spec *spec = codec->spec; @@ -7382,9 +8394,15 @@ static void ca0132_free(struct hda_codec *codec) case QUIRK_SBZ: sbz_exit_chip(codec); break; + case QUIRK_ZXR: + zxr_exit_chip(codec); + break; case QUIRK_R3D: r3d_exit_chip(codec); break; + case QUIRK_AE5: + ae5_exit_chip(codec); + break; case QUIRK_R3DI: r3di_gpio_shutdown(codec); break; @@ -7400,6 +8418,16 @@ static void ca0132_free(struct hda_codec *codec) kfree(codec->spec); } +static void dbpro_free(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + + zxr_dbpro_power_state_shutdown(codec); + + kfree(spec->spec_init_verbs); + kfree(codec->spec); +} + static void ca0132_reboot_notify(struct hda_codec *codec) { codec->patch_ops.free(codec); @@ -7414,6 +8442,13 @@ static const struct hda_codec_ops ca0132_patch_ops = { .reboot_notify = ca0132_reboot_notify, }; +static const struct hda_codec_ops dbpro_patch_ops = { + .build_controls = dbpro_build_controls, + .build_pcms = dbpro_build_pcms, + .init = dbpro_init, + .free = dbpro_free, +}; + static void ca0132_config(struct hda_codec *codec) { struct ca0132_spec *spec = codec->spec; @@ -7432,9 +8467,33 @@ static void ca0132_config(struct hda_codec *codec) switch (spec->quirk) { case QUIRK_ALIENWARE: - codec_dbg(codec, "ca0132_config: QUIRK_ALIENWARE applied.\n"); + codec_dbg(codec, "%s: QUIRK_ALIENWARE applied.\n", __func__); snd_hda_apply_pincfgs(codec, alienware_pincfgs); + break; + case QUIRK_SBZ: + codec_dbg(codec, "%s: QUIRK_SBZ applied.\n", __func__); + snd_hda_apply_pincfgs(codec, sbz_pincfgs); + break; + case QUIRK_ZXR: + codec_dbg(codec, "%s: QUIRK_ZXR applied.\n", __func__); + snd_hda_apply_pincfgs(codec, zxr_pincfgs); + break; + case QUIRK_R3D: + codec_dbg(codec, "%s: QUIRK_R3D applied.\n", __func__); + snd_hda_apply_pincfgs(codec, r3d_pincfgs); + break; + case QUIRK_R3DI: + codec_dbg(codec, "%s: QUIRK_R3DI applied.\n", __func__); + snd_hda_apply_pincfgs(codec, r3di_pincfgs); + break; + case QUIRK_AE5: + codec_dbg(codec, "%s: QUIRK_AE5 applied.\n", __func__); + snd_hda_apply_pincfgs(codec, r3di_pincfgs); + break; + } + switch (spec->quirk) { + case QUIRK_ALIENWARE: spec->num_outputs = 2; spec->out_pins[0] = 0x0b; /* speaker out */ spec->out_pins[1] = 0x0f; @@ -7454,15 +8513,6 @@ static void ca0132_config(struct hda_codec *codec) break; case QUIRK_SBZ: case QUIRK_R3D: - if (spec->quirk == QUIRK_SBZ) { - codec_dbg(codec, "%s: QUIRK_SBZ applied.\n", __func__); - snd_hda_apply_pincfgs(codec, sbz_pincfgs); - } - if (spec->quirk == QUIRK_R3D) { - codec_dbg(codec, "%s: QUIRK_R3D applied.\n", __func__); - snd_hda_apply_pincfgs(codec, r3d_pincfgs); - } - spec->num_outputs = 2; spec->out_pins[0] = 0x0B; /* Line out */ spec->out_pins[1] = 0x0F; /* Rear headphone out */ @@ -7487,10 +8537,62 @@ static void ca0132_config(struct hda_codec *codec) spec->multiout.dig_out_nid = spec->dig_out; spec->dig_in = 0x09; break; - case QUIRK_R3DI: - codec_dbg(codec, "%s: QUIRK_R3DI applied.\n", __func__); - snd_hda_apply_pincfgs(codec, r3di_pincfgs); + case QUIRK_ZXR: + spec->num_outputs = 2; + spec->out_pins[0] = 0x0B; /* Line out */ + spec->out_pins[1] = 0x0F; /* Rear headphone out */ + spec->out_pins[2] = 0x10; /* Center/LFE */ + spec->out_pins[3] = 0x11; /* Rear surround */ + spec->shared_out_nid = 0x2; + spec->unsol_tag_hp = spec->out_pins[1]; + spec->unsol_tag_front_hp = spec->out_pins[2]; + spec->adcs[0] = 0x7; /* Rear Mic / Line-in */ + spec->adcs[1] = 0x8; /* Not connected, no front mic */ + spec->adcs[2] = 0xa; /* what u hear */ + + spec->num_inputs = 2; + spec->input_pins[0] = 0x12; /* Rear Mic / Line-in */ + spec->input_pins[1] = 0x13; /* What U Hear */ + spec->shared_mic_nid = 0x7; + spec->unsol_tag_amic1 = spec->input_pins[0]; + break; + case QUIRK_ZXR_DBPRO: + spec->adcs[0] = 0x8; /* ZxR DBPro Aux In */ + + spec->num_inputs = 1; + spec->input_pins[0] = 0x11; /* RCA Line-in */ + + spec->dig_out = 0x05; + spec->multiout.dig_out_nid = spec->dig_out; + + spec->dig_in = 0x09; + break; + case QUIRK_AE5: + spec->num_outputs = 2; + spec->out_pins[0] = 0x0B; /* Line out */ + spec->out_pins[1] = 0x11; /* Rear headphone out */ + spec->out_pins[2] = 0x10; /* Front Headphone / Center/LFE*/ + spec->out_pins[3] = 0x0F; /* Rear surround */ + spec->shared_out_nid = 0x2; + spec->unsol_tag_hp = spec->out_pins[1]; + spec->unsol_tag_front_hp = spec->out_pins[2]; + + spec->adcs[0] = 0x7; /* Rear Mic / Line-in */ + spec->adcs[1] = 0x8; /* Front Mic, but only if no DSP */ + spec->adcs[2] = 0xa; /* what u hear */ + + spec->num_inputs = 2; + spec->input_pins[0] = 0x12; /* Rear Mic / Line-in */ + spec->input_pins[1] = 0x13; /* What U Hear */ + spec->shared_mic_nid = 0x7; + spec->unsol_tag_amic1 = spec->input_pins[0]; + + /* SPDIF I/O */ + spec->dig_out = 0x05; + spec->multiout.dig_out_nid = spec->dig_out; + break; + case QUIRK_R3DI: spec->num_outputs = 2; spec->out_pins[0] = 0x0B; /* Line out */ spec->out_pins[1] = 0x0F; /* Rear headphone out */ @@ -7547,7 +8649,11 @@ static int ca0132_prepare_verbs(struct hda_codec *codec) struct ca0132_spec *spec = codec->spec; spec->chip_init_verbs = ca0132_init_verbs0; - if (spec->quirk == QUIRK_SBZ || spec->quirk == QUIRK_R3D) + /* + * Since desktop cards use pci_mmio, this can be used to determine + * whether or not to use these verbs instead of a separate bool. + */ + if (spec->use_pci_mmio) spec->desktop_init_verbs = ca0132_init_verbs1; spec->spec_init_verbs = kcalloc(NUM_SPEC_VERBS, sizeof(struct hda_verb), @@ -7579,6 +8685,29 @@ static int ca0132_prepare_verbs(struct hda_codec *codec) return 0; } +/* + * The Sound Blaster ZxR shares the same PCI subsystem ID as some regular + * Sound Blaster Z cards. However, they have different HDA codec subsystem + * ID's. So, we check for the ZxR's subsystem ID, as well as the DBPro + * daughter boards ID. + */ +static void sbz_detect_quirk(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + + switch (codec->core.subsystem_id) { + case 0x11020033: + spec->quirk = QUIRK_ZXR; + break; + case 0x1102003f: + spec->quirk = QUIRK_ZXR_DBPRO; + break; + default: + spec->quirk = QUIRK_SBZ; + break; + } +} + static int patch_ca0132(struct hda_codec *codec) { struct ca0132_spec *spec; @@ -7593,10 +8722,6 @@ static int patch_ca0132(struct hda_codec *codec) codec->spec = spec; spec->codec = codec; - codec->patch_ops = ca0132_patch_ops; - codec->pcm_format_first = 1; - codec->no_sticky_stream = 1; - /* Detect codec quirk */ quirk = snd_pci_quirk_lookup(codec->bus->pci, ca0132_quirks); if (quirk) @@ -7604,6 +8729,18 @@ static int patch_ca0132(struct hda_codec *codec) else spec->quirk = QUIRK_NONE; + if (spec->quirk == QUIRK_SBZ) + sbz_detect_quirk(codec); + + if (spec->quirk == QUIRK_ZXR_DBPRO) + codec->patch_ops = dbpro_patch_ops; + else + codec->patch_ops = ca0132_patch_ops; + + codec->pcm_format_first = 1; + codec->no_sticky_stream = 1; + + spec->dsp_state = DSP_DOWNLOAD_INIT; spec->num_mixers = 1; @@ -7613,6 +8750,12 @@ static int patch_ca0132(struct hda_codec *codec) spec->mixers[0] = desktop_mixer; snd_hda_codec_set_name(codec, "Sound Blaster Z"); break; + case QUIRK_ZXR: + spec->mixers[0] = desktop_mixer; + snd_hda_codec_set_name(codec, "Sound Blaster ZxR"); + break; + case QUIRK_ZXR_DBPRO: + break; case QUIRK_R3D: spec->mixers[0] = desktop_mixer; snd_hda_codec_set_name(codec, "Recon3D"); @@ -7621,6 +8764,10 @@ static int patch_ca0132(struct hda_codec *codec) spec->mixers[0] = r3di_mixer; snd_hda_codec_set_name(codec, "Recon3Di"); break; + case QUIRK_AE5: + spec->mixers[0] = desktop_mixer; + snd_hda_codec_set_name(codec, "Sound BlasterX AE-5"); + break; default: spec->mixers[0] = ca0132_mixer; break; @@ -7630,6 +8777,8 @@ static int patch_ca0132(struct hda_codec *codec) switch (spec->quirk) { case QUIRK_SBZ: case QUIRK_R3D: + case QUIRK_AE5: + case QUIRK_ZXR: spec->use_alt_controls = true; spec->use_alt_functions = true; spec->use_pci_mmio = true; diff --git a/sound/pci/hda/patch_cirrus.c b/sound/pci/hda/patch_cirrus.c index a7f91be45194..64fa5a82bb9f 100644 --- a/sound/pci/hda/patch_cirrus.c +++ b/sound/pci/hda/patch_cirrus.c @@ -23,7 +23,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" #include "hda_auto_parser.h" #include "hda_jack.h" diff --git a/sound/pci/hda/patch_cmedia.c b/sound/pci/hda/patch_cmedia.c index 1b2195dd2b26..52642ba3e2c0 100644 --- a/sound/pci/hda/patch_cmedia.c +++ b/sound/pci/hda/patch_cmedia.c @@ -25,7 +25,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" #include "hda_auto_parser.h" #include "hda_jack.h" diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index cfd4e4f97f8f..950e02e71766 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -27,7 +27,7 @@ #include #include -#include "hda_codec.h" +#include #include "hda_local.h" #include "hda_auto_parser.h" #include "hda_beep.h" @@ -943,6 +943,7 @@ static const struct snd_pci_quirk cxt5066_fixups[] = { SND_PCI_QUIRK(0x17aa, 0x21da, "Lenovo X220", CXT_PINCFG_LENOVO_TP410), SND_PCI_QUIRK(0x17aa, 0x21db, "Lenovo X220-tablet", CXT_PINCFG_LENOVO_TP410), SND_PCI_QUIRK(0x17aa, 0x38af, "Lenovo IdeaPad Z560", CXT_FIXUP_MUTE_LED_EAPD), + SND_PCI_QUIRK(0x17aa, 0x3905, "Lenovo G50-30", CXT_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x17aa, 0x390b, "Lenovo G50-80", CXT_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x17aa, 0x3975, "Lenovo U300s", CXT_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x17aa, 0x3977, "Lenovo IdeaPad U310", CXT_FIXUP_STEREO_DMIC), diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index cb587dce67a9..67099cbb6be2 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -41,7 +41,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" #include "hda_jack.h" diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 1d117f00d04d..fa61674a5605 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -32,7 +32,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" #include "hda_auto_parser.h" #include "hda_jack.h" @@ -6409,6 +6409,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0706, "Dell Inspiron 7559", ALC256_FIXUP_DELL_INSPIRON_7559_SUBWOOFER), SND_PCI_QUIRK(0x1028, 0x0725, "Dell Inspiron 3162", ALC255_FIXUP_DELL_SPK_NOISE), SND_PCI_QUIRK(0x1028, 0x075b, "Dell XPS 13 9360", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE), + SND_PCI_QUIRK(0x1028, 0x075c, "Dell XPS 27 7760", ALC298_FIXUP_SPK_VOLUME), SND_PCI_QUIRK(0x1028, 0x075d, "Dell AIO", ALC298_FIXUP_SPK_VOLUME), SND_PCI_QUIRK(0x1028, 0x07b0, "Dell Precision 7520", ALC295_FIXUP_DISABLE_DAC3), SND_PCI_QUIRK(0x1028, 0x0798, "Dell Inspiron 17 7000 Gaming", ALC256_FIXUP_DELL_INSPIRON_7559_SUBWOOFER), @@ -6840,6 +6841,12 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x1a, 0x02a11040}, {0x1b, 0x01014020}, {0x21, 0x0221101f}), + SND_HDA_PIN_QUIRK(0x10ec0235, 0x17aa, "Lenovo", ALC294_FIXUP_LENOVO_MIC_LOCATION, + {0x14, 0x90170110}, + {0x19, 0x02a11030}, + {0x1a, 0x02a11040}, + {0x1b, 0x01011020}, + {0x21, 0x0221101f}), SND_HDA_PIN_QUIRK(0x10ec0235, 0x17aa, "Lenovo", ALC294_FIXUP_LENOVO_MIC_LOCATION, {0x14, 0x90170110}, {0x19, 0x02a11020}, @@ -7737,6 +7744,8 @@ enum { ALC662_FIXUP_ASUS_Nx50, ALC668_FIXUP_ASUS_Nx51_HEADSET_MODE, ALC668_FIXUP_ASUS_Nx51, + ALC668_FIXUP_MIC_COEF, + ALC668_FIXUP_ASUS_G751, ALC891_FIXUP_HEADSET_MODE, ALC891_FIXUP_DELL_MIC_NO_PRESENCE, ALC662_FIXUP_ACER_VERITON, @@ -8006,6 +8015,23 @@ static const struct hda_fixup alc662_fixups[] = { .chained = true, .chain_id = ALC668_FIXUP_ASUS_Nx51_HEADSET_MODE, }, + [ALC668_FIXUP_MIC_COEF] = { + .type = HDA_FIXUP_VERBS, + .v.verbs = (const struct hda_verb[]) { + { 0x20, AC_VERB_SET_COEF_INDEX, 0xc3 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x4000 }, + {} + }, + }, + [ALC668_FIXUP_ASUS_G751] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x16, 0x0421101f }, /* HP */ + {} + }, + .chained = true, + .chain_id = ALC668_FIXUP_MIC_COEF + }, [ALC891_FIXUP_HEADSET_MODE] = { .type = HDA_FIXUP_FUNC, .v.func = alc_fixup_headset_mode, @@ -8079,6 +8105,7 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x11cd, "Asus N550", ALC662_FIXUP_ASUS_Nx50), SND_PCI_QUIRK(0x1043, 0x13df, "Asus N550JX", ALC662_FIXUP_BASS_1A), SND_PCI_QUIRK(0x1043, 0x129d, "Asus N750", ALC662_FIXUP_ASUS_Nx50), + SND_PCI_QUIRK(0x1043, 0x12ff, "ASUS G751", ALC668_FIXUP_ASUS_G751), SND_PCI_QUIRK(0x1043, 0x1477, "ASUS N56VZ", ALC662_FIXUP_BASS_MODE4_CHMAP), SND_PCI_QUIRK(0x1043, 0x15a7, "ASUS UX51VZH", ALC662_FIXUP_BASS_16), SND_PCI_QUIRK(0x1043, 0x177d, "ASUS N551", ALC668_FIXUP_ASUS_Nx51), @@ -8183,6 +8210,7 @@ static const struct hda_model_fixup alc662_fixup_models[] = { {.id = ALC668_FIXUP_DELL_XPS13, .name = "dell-xps13"}, {.id = ALC662_FIXUP_ASUS_Nx50, .name = "asus-nx50"}, {.id = ALC668_FIXUP_ASUS_Nx51, .name = "asus-nx51"}, + {.id = ALC668_FIXUP_ASUS_G751, .name = "asus-g751"}, {.id = ALC891_FIXUP_HEADSET_MODE, .name = "alc891-headset"}, {.id = ALC891_FIXUP_DELL_MIC_NO_PRESENCE, .name = "alc891-headset-multi"}, {.id = ALC662_FIXUP_ACER_VERITON, .name = "acer-veriton"}, diff --git a/sound/pci/hda/patch_si3054.c b/sound/pci/hda/patch_si3054.c index f63acb1b965c..c49d25bcd7f2 100644 --- a/sound/pci/hda/patch_si3054.c +++ b/sound/pci/hda/patch_si3054.c @@ -27,7 +27,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" /* si3054 verbs */ diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c index 046705b4691a..1b6ecfb01759 100644 --- a/sound/pci/hda/patch_sigmatel.c +++ b/sound/pci/hda/patch_sigmatel.c @@ -32,7 +32,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" #include "hda_auto_parser.h" #include "hda_beep.h" @@ -77,6 +77,7 @@ enum { STAC_DELL_M6_BOTH, STAC_DELL_EQ, STAC_ALIENWARE_M17X, + STAC_ELO_VUPOINT_15MX, STAC_92HD89XX_HP_FRONT_JACK, STAC_92HD89XX_HP_Z1_G2_RIGHT_MIC_JACK, STAC_92HD73XX_ASUS_MOBO, @@ -1879,6 +1880,18 @@ static void stac92hd73xx_fixup_no_jd(struct hda_codec *codec, codec->no_jack_detect = 1; } + +static void stac92hd73xx_disable_automute(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + struct sigmatel_spec *spec = codec->spec; + + if (action != HDA_FIXUP_ACT_PRE_PROBE) + return; + + spec->gen.suppress_auto_mute = 1; +} + static const struct hda_fixup stac92hd73xx_fixups[] = { [STAC_92HD73XX_REF] = { .type = HDA_FIXUP_FUNC, @@ -1904,6 +1917,10 @@ static const struct hda_fixup stac92hd73xx_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = stac92hd73xx_fixup_alienware_m17x, }, + [STAC_ELO_VUPOINT_15MX] = { + .type = HDA_FIXUP_FUNC, + .v.func = stac92hd73xx_disable_automute, + }, [STAC_92HD73XX_INTEL] = { .type = HDA_FIXUP_PINS, .v.pins = intel_dg45id_pin_configs, @@ -1942,6 +1959,7 @@ static const struct hda_model_fixup stac92hd73xx_models[] = { { .id = STAC_DELL_M6_BOTH, .name = "dell-m6" }, { .id = STAC_DELL_EQ, .name = "dell-eq" }, { .id = STAC_ALIENWARE_M17X, .name = "alienware" }, + { .id = STAC_ELO_VUPOINT_15MX, .name = "elo-vupoint-15mx" }, { .id = STAC_92HD73XX_ASUS_MOBO, .name = "asus-mobo" }, {} }; @@ -1991,6 +2009,8 @@ static const struct snd_pci_quirk stac92hd73xx_fixup_tbl[] = { "Alienware M17x", STAC_ALIENWARE_M17X), SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x0490, "Alienware M17x R3", STAC_DELL_EQ), + SND_PCI_QUIRK(0x1059, 0x1011, + "ELO VuPoint 15MX", STAC_ELO_VUPOINT_15MX), SND_PCI_QUIRK(PCI_VENDOR_ID_HP, 0x1927, "HP Z1 G2", STAC_92HD89XX_HP_Z1_G2_RIGHT_MIC_JACK), SND_PCI_QUIRK(PCI_VENDOR_ID_HP, 0x2b17, diff --git a/sound/pci/hda/patch_via.c b/sound/pci/hda/patch_via.c index 6b9617aee0e6..9f6f13e25145 100644 --- a/sound/pci/hda/patch_via.c +++ b/sound/pci/hda/patch_via.c @@ -52,7 +52,7 @@ #include #include #include -#include "hda_codec.h" +#include #include "hda_local.h" #include "hda_auto_parser.h" #include "hda_jack.h" diff --git a/sound/pci/hda/thinkpad_helper.c b/sound/pci/hda/thinkpad_helper.c index 97f49b751e6e..568575b72f2f 100644 --- a/sound/pci/hda/thinkpad_helper.c +++ b/sound/pci/hda/thinkpad_helper.c @@ -58,8 +58,8 @@ static void hda_fixup_thinkpad_acpi(struct hda_codec *codec, removefunc = false; } if (led_set_func(TPACPI_LED_MICMUTE, false) >= 0 && - snd_hda_gen_add_micmute_led(codec, - update_tpacpi_micmute) > 0) + !snd_hda_gen_add_micmute_led(codec, + update_tpacpi_micmute)) removefunc = false; } diff --git a/sound/pci/intel8x0.c b/sound/pci/intel8x0.c index 5ee468d1aefe..ffddcdfe0c66 100644 --- a/sound/pci/intel8x0.c +++ b/sound/pci/intel8x0.c @@ -38,11 +38,6 @@ #include #include #include -/* for 440MX workaround */ -#include -#ifdef CONFIG_X86 -#include -#endif MODULE_AUTHOR("Jaroslav Kysela "); MODULE_DESCRIPTION("Intel 82801AA,82901AB,i810,i820,i830,i840,i845,MX440; SiS 7012; Ali 5455"); @@ -374,7 +369,6 @@ struct ichdev { unsigned int ali_slot; /* ALI DMA slot */ struct ac97_pcm *pcm; int pcm_open_flag; - unsigned int page_attr_changed: 1; unsigned int suspended: 1; }; @@ -724,25 +718,6 @@ static void snd_intel8x0_setup_periods(struct intel8x0 *chip, struct ichdev *ich iputbyte(chip, port + ichdev->roff_sr, ICH_FIFOE | ICH_BCIS | ICH_LVBCI); } -#ifdef __i386__ -/* - * Intel 82443MX running a 100MHz processor system bus has a hardware bug, - * which aborts PCI busmaster for audio transfer. A workaround is to set - * the pages as non-cached. For details, see the errata in - * http://download.intel.com/design/chipsets/specupdt/24505108.pdf - */ -static void fill_nocache(void *buf, int size, int nocache) -{ - size = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nocache) - set_pages_uc(virt_to_page(buf), size); - else - set_pages_wb(virt_to_page(buf), size); -} -#else -#define fill_nocache(buf, size, nocache) do { ; } while (0) -#endif - /* * Interrupt handler */ @@ -850,7 +825,7 @@ static int snd_intel8x0_pcm_trigger(struct snd_pcm_substream *substream, int cmd switch (cmd) { case SNDRV_PCM_TRIGGER_RESUME: ichdev->suspended = 0; - /* fallthru */ + /* fall through */ case SNDRV_PCM_TRIGGER_START: case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: val = ICH_IOCE | ICH_STARTBM; @@ -858,7 +833,7 @@ static int snd_intel8x0_pcm_trigger(struct snd_pcm_substream *substream, int cmd break; case SNDRV_PCM_TRIGGER_SUSPEND: ichdev->suspended = 1; - /* fallthru */ + /* fall through */ case SNDRV_PCM_TRIGGER_STOP: val = 0; break; @@ -892,7 +867,7 @@ static int snd_intel8x0_ali_trigger(struct snd_pcm_substream *substream, int cmd switch (cmd) { case SNDRV_PCM_TRIGGER_RESUME: ichdev->suspended = 0; - /* fallthru */ + /* fall through */ case SNDRV_PCM_TRIGGER_START: case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { @@ -909,7 +884,7 @@ static int snd_intel8x0_ali_trigger(struct snd_pcm_substream *substream, int cmd break; case SNDRV_PCM_TRIGGER_SUSPEND: ichdev->suspended = 1; - /* fallthru */ + /* fall through */ case SNDRV_PCM_TRIGGER_STOP: case SNDRV_PCM_TRIGGER_PAUSE_PUSH: /* pause */ @@ -938,23 +913,12 @@ static int snd_intel8x0_hw_params(struct snd_pcm_substream *substream, { struct intel8x0 *chip = snd_pcm_substream_chip(substream); struct ichdev *ichdev = get_ichdev(substream); - struct snd_pcm_runtime *runtime = substream->runtime; int dbl = params_rate(hw_params) > 48000; int err; - if (chip->fix_nocache && ichdev->page_attr_changed) { - fill_nocache(runtime->dma_area, runtime->dma_bytes, 0); /* clear */ - ichdev->page_attr_changed = 0; - } err = snd_pcm_lib_malloc_pages(substream, params_buffer_bytes(hw_params)); if (err < 0) return err; - if (chip->fix_nocache) { - if (runtime->dma_area && ! ichdev->page_attr_changed) { - fill_nocache(runtime->dma_area, runtime->dma_bytes, 1); - ichdev->page_attr_changed = 1; - } - } if (ichdev->pcm_open_flag) { snd_ac97_pcm_close(ichdev->pcm); ichdev->pcm_open_flag = 0; @@ -974,17 +938,12 @@ static int snd_intel8x0_hw_params(struct snd_pcm_substream *substream, static int snd_intel8x0_hw_free(struct snd_pcm_substream *substream) { - struct intel8x0 *chip = snd_pcm_substream_chip(substream); struct ichdev *ichdev = get_ichdev(substream); if (ichdev->pcm_open_flag) { snd_ac97_pcm_close(ichdev->pcm); ichdev->pcm_open_flag = 0; } - if (chip->fix_nocache && ichdev->page_attr_changed) { - fill_nocache(substream->runtime->dma_area, substream->runtime->dma_bytes, 0); - ichdev->page_attr_changed = 0; - } return snd_pcm_lib_free_pages(substream); } @@ -1510,6 +1469,9 @@ struct ich_pcm_table { int ac97_idx; }; +#define intel8x0_dma_type(chip) \ + ((chip)->fix_nocache ? SNDRV_DMA_TYPE_DEV_UC : SNDRV_DMA_TYPE_DEV) + static int snd_intel8x0_pcm1(struct intel8x0 *chip, int device, struct ich_pcm_table *rec) { @@ -1540,7 +1502,7 @@ static int snd_intel8x0_pcm1(struct intel8x0 *chip, int device, strcpy(pcm->name, chip->card->shortname); chip->pcm[device] = pcm; - snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_DEV, + snd_pcm_lib_preallocate_pages_for_all(pcm, intel8x0_dma_type(chip), snd_dma_pci_data(chip->pci), rec->prealloc_size, rec->prealloc_max_size); @@ -2629,11 +2591,8 @@ static int snd_intel8x0_free(struct intel8x0 *chip) __hw_end: if (chip->irq >= 0) free_irq(chip->irq, chip); - if (chip->bdbars.area) { - if (chip->fix_nocache) - fill_nocache(chip->bdbars.area, chip->bdbars.bytes, 0); + if (chip->bdbars.area) snd_dma_free_pages(&chip->bdbars); - } if (chip->addr) pci_iounmap(chip->pci, chip->addr); if (chip->bmaddr) @@ -2657,17 +2616,6 @@ static int intel8x0_suspend(struct device *dev) snd_power_change_state(card, SNDRV_CTL_POWER_D3hot); for (i = 0; i < chip->pcm_devs; i++) snd_pcm_suspend_all(chip->pcm[i]); - /* clear nocache */ - if (chip->fix_nocache) { - for (i = 0; i < chip->bdbars_count; i++) { - struct ichdev *ichdev = &chip->ichd[i]; - if (ichdev->substream && ichdev->page_attr_changed) { - struct snd_pcm_runtime *runtime = ichdev->substream->runtime; - if (runtime->dma_area) - fill_nocache(runtime->dma_area, runtime->dma_bytes, 0); - } - } - } for (i = 0; i < chip->ncodecs; i++) snd_ac97_suspend(chip->ac97[i]); if (chip->device_type == DEVICE_INTEL_ICH4) @@ -2708,25 +2656,9 @@ static int intel8x0_resume(struct device *dev) ICH_PCM_SPDIF_1011); } - /* refill nocache */ - if (chip->fix_nocache) - fill_nocache(chip->bdbars.area, chip->bdbars.bytes, 1); - for (i = 0; i < chip->ncodecs; i++) snd_ac97_resume(chip->ac97[i]); - /* refill nocache */ - if (chip->fix_nocache) { - for (i = 0; i < chip->bdbars_count; i++) { - struct ichdev *ichdev = &chip->ichd[i]; - if (ichdev->substream && ichdev->page_attr_changed) { - struct snd_pcm_runtime *runtime = ichdev->substream->runtime; - if (runtime->dma_area) - fill_nocache(runtime->dma_area, runtime->dma_bytes, 1); - } - } - } - /* resume status */ for (i = 0; i < chip->bdbars_count; i++) { struct ichdev *ichdev = &chip->ichd[i]; @@ -3057,6 +2989,12 @@ static int snd_intel8x0_create(struct snd_card *card, chip->inside_vm = snd_intel8x0_inside_vm(pci); + /* + * Intel 82443MX running a 100MHz processor system bus has a hardware + * bug, which aborts PCI busmaster for audio transfer. A workaround + * is to set the pages as non-cached. For details, see the errata in + * http://download.intel.com/design/chipsets/specupdt/24505108.pdf + */ if (pci->vendor == PCI_VENDOR_ID_INTEL && pci->device == PCI_DEVICE_ID_INTEL_440MX) chip->fix_nocache = 1; /* enable workaround */ @@ -3128,7 +3066,7 @@ static int snd_intel8x0_create(struct snd_card *card, /* allocate buffer descriptor lists */ /* the start of each lists must be aligned to 8 bytes */ - if (snd_dma_alloc_pages(SNDRV_DMA_TYPE_DEV, snd_dma_pci_data(pci), + if (snd_dma_alloc_pages(intel8x0_dma_type(chip), snd_dma_pci_data(pci), chip->bdbars_count * sizeof(u32) * ICH_MAX_FRAGS * 2, &chip->bdbars) < 0) { snd_intel8x0_free(chip); @@ -3137,9 +3075,6 @@ static int snd_intel8x0_create(struct snd_card *card, } /* tables must be aligned to 8 bytes here, but the kernel pages are much bigger, so we don't care (on i386) */ - /* workaround for 440MX */ - if (chip->fix_nocache) - fill_nocache(chip->bdbars.area, chip->bdbars.bytes, 1); int_sta_masks = 0; for (i = 0; i < chip->bdbars_count; i++) { ichdev = &chip->ichd[i]; diff --git a/sound/pci/intel8x0m.c b/sound/pci/intel8x0m.c index 943a726b1c1b..c84629190cba 100644 --- a/sound/pci/intel8x0m.c +++ b/sound/pci/intel8x0m.c @@ -1171,16 +1171,6 @@ static int snd_intel8x0m_create(struct snd_card *card, } port_inited: - if (request_irq(pci->irq, snd_intel8x0m_interrupt, IRQF_SHARED, - KBUILD_MODNAME, chip)) { - dev_err(card->dev, "unable to grab IRQ %d\n", pci->irq); - snd_intel8x0m_free(chip); - return -EBUSY; - } - chip->irq = pci->irq; - pci_set_master(pci); - synchronize_irq(chip->irq); - /* initialize offsets */ chip->bdbars_count = 2; tbl = intel_regs; @@ -1224,11 +1214,21 @@ static int snd_intel8x0m_create(struct snd_card *card, chip->int_sta_reg = ICH_REG_GLOB_STA; chip->int_sta_mask = int_sta_masks; + pci_set_master(pci); + if ((err = snd_intel8x0m_chip_init(chip, 1)) < 0) { snd_intel8x0m_free(chip); return err; } + if (request_irq(pci->irq, snd_intel8x0m_interrupt, IRQF_SHARED, + KBUILD_MODNAME, chip)) { + dev_err(card->dev, "unable to grab IRQ %d\n", pci->irq); + snd_intel8x0m_free(chip); + return -EBUSY; + } + chip->irq = pci->irq; + if ((err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops)) < 0) { snd_intel8x0m_free(chip); return err; diff --git a/sound/pci/rme32.c b/sound/pci/rme32.c index f0906ba416d4..3ac8c71d567c 100644 --- a/sound/pci/rme32.c +++ b/sound/pci/rme32.c @@ -319,7 +319,8 @@ static const struct snd_pcm_hardware snd_rme32_spdif_info = { SNDRV_PCM_INFO_MMAP_VALID | SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_PAUSE | - SNDRV_PCM_INFO_SYNC_START), + SNDRV_PCM_INFO_SYNC_START | + SNDRV_PCM_INFO_SYNC_APPLPTR), .formats = (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE), .rates = (SNDRV_PCM_RATE_32000 | @@ -346,7 +347,8 @@ static const struct snd_pcm_hardware snd_rme32_adat_info = SNDRV_PCM_INFO_MMAP_VALID | SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_PAUSE | - SNDRV_PCM_INFO_SYNC_START), + SNDRV_PCM_INFO_SYNC_START | + SNDRV_PCM_INFO_SYNC_APPLPTR), .formats= SNDRV_PCM_FMTBIT_S16_LE, .rates = (SNDRV_PCM_RATE_44100 | SNDRV_PCM_RATE_48000), @@ -370,7 +372,8 @@ static const struct snd_pcm_hardware snd_rme32_spdif_fd_info = { SNDRV_PCM_INFO_MMAP_VALID | SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_PAUSE | - SNDRV_PCM_INFO_SYNC_START), + SNDRV_PCM_INFO_SYNC_START | + SNDRV_PCM_INFO_SYNC_APPLPTR), .formats = (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE), .rates = (SNDRV_PCM_RATE_32000 | @@ -397,7 +400,8 @@ static const struct snd_pcm_hardware snd_rme32_adat_fd_info = SNDRV_PCM_INFO_MMAP_VALID | SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_PAUSE | - SNDRV_PCM_INFO_SYNC_START), + SNDRV_PCM_INFO_SYNC_START | + SNDRV_PCM_INFO_SYNC_APPLPTR), .formats= SNDRV_PCM_FMTBIT_S16_LE, .rates = (SNDRV_PCM_RATE_44100 | SNDRV_PCM_RATE_48000), @@ -1104,16 +1108,6 @@ snd_rme32_pcm_trigger(struct snd_pcm_substream *substream, int cmd) snd_pcm_trigger_done(s, substream); } - /* prefill playback buffer */ - if (cmd == SNDRV_PCM_TRIGGER_START && rme32->fullduplex_mode) { - snd_pcm_group_for_each_entry(s, substream) { - if (s == rme32->playback_substream) { - s->ops->ack(s); - break; - } - } - } - switch (cmd) { case SNDRV_PCM_TRIGGER_START: if (rme32->running && ! RME32_ISWORKING(rme32)) diff --git a/sound/pci/rme9652/hdspm.c b/sound/pci/rme9652/hdspm.c index 11b5b5e0e058..679ad0415e3b 100644 --- a/sound/pci/rme9652/hdspm.c +++ b/sound/pci/rme9652/hdspm.c @@ -6534,7 +6534,7 @@ static int snd_hdspm_create_alsa_devices(struct snd_card *card, dev_dbg(card->dev, "Update mixer controls...\n"); hdspm_update_simple_mixer_controls(hdspm); - dev_dbg(card->dev, "Initializeing complete ???\n"); + dev_dbg(card->dev, "Initializing complete?\n"); err = snd_card_register(card); if (err < 0) { diff --git a/sound/soc/amd/acp-da7219-max98357a.c b/sound/soc/amd/acp-da7219-max98357a.c index 8e3275a96a82..3f813ea5210a 100644 --- a/sound/soc/amd/acp-da7219-max98357a.c +++ b/sound/soc/amd/acp-da7219-max98357a.c @@ -42,7 +42,7 @@ #include "../codecs/da7219.h" #include "../codecs/da7219-aad.h" -#define CZ_PLAT_CLK 25000000 +#define CZ_PLAT_CLK 48000000 #define DUAL_CHANNEL 2 static struct snd_soc_jack cz_jack; @@ -75,7 +75,7 @@ static int cz_da7219_init(struct snd_soc_pcm_runtime *rtd) da7219_dai_clk = clk_get(component->dev, "da7219-dai-clks"); ret = snd_soc_card_jack_new(card, "Headset Jack", - SND_JACK_HEADPHONE | SND_JACK_MICROPHONE | + SND_JACK_HEADSET | SND_JACK_LINEOUT | SND_JACK_BTN_0 | SND_JACK_BTN_1 | SND_JACK_BTN_2 | SND_JACK_BTN_3, &cz_jack, NULL, 0); @@ -133,7 +133,7 @@ static const struct snd_pcm_hw_constraint_list constraints_channels = { .mask = 0, }; -static int cz_da7219_startup(struct snd_pcm_substream *substream) +static int cz_da7219_play_startup(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct snd_soc_pcm_runtime *rtd = substream->private_data; @@ -150,7 +150,28 @@ static int cz_da7219_startup(struct snd_pcm_substream *substream) snd_pcm_hw_constraint_list(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, &constraints_rates); - machine->i2s_instance = I2S_SP_INSTANCE; + machine->play_i2s_instance = I2S_SP_INSTANCE; + return da7219_clk_enable(substream); +} + +static int cz_da7219_cap_startup(struct snd_pcm_substream *substream) +{ + struct snd_pcm_runtime *runtime = substream->runtime; + struct snd_soc_pcm_runtime *rtd = substream->private_data; + struct snd_soc_card *card = rtd->card; + struct acp_platform_info *machine = snd_soc_card_get_drvdata(card); + + /* + * On this platform for PCM device we support stereo + */ + + runtime->hw.channels_max = DUAL_CHANNEL; + snd_pcm_hw_constraint_list(runtime, 0, SNDRV_PCM_HW_PARAM_CHANNELS, + &constraints_channels); + snd_pcm_hw_constraint_list(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, + &constraints_rates); + + machine->cap_i2s_instance = I2S_SP_INSTANCE; machine->capture_channel = CAP_CHANNEL1; return da7219_clk_enable(substream); } @@ -162,11 +183,22 @@ static void cz_da7219_shutdown(struct snd_pcm_substream *substream) static int cz_max_startup(struct snd_pcm_substream *substream) { + struct snd_pcm_runtime *runtime = substream->runtime; struct snd_soc_pcm_runtime *rtd = substream->private_data; struct snd_soc_card *card = rtd->card; struct acp_platform_info *machine = snd_soc_card_get_drvdata(card); - machine->i2s_instance = I2S_BT_INSTANCE; + /* + * On this platform for PCM device we support stereo + */ + + runtime->hw.channels_max = DUAL_CHANNEL; + snd_pcm_hw_constraint_list(runtime, 0, SNDRV_PCM_HW_PARAM_CHANNELS, + &constraints_channels); + snd_pcm_hw_constraint_list(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, + &constraints_rates); + + machine->play_i2s_instance = I2S_BT_INSTANCE; return da7219_clk_enable(substream); } @@ -177,21 +209,43 @@ static void cz_max_shutdown(struct snd_pcm_substream *substream) static int cz_dmic0_startup(struct snd_pcm_substream *substream) { + struct snd_pcm_runtime *runtime = substream->runtime; struct snd_soc_pcm_runtime *rtd = substream->private_data; struct snd_soc_card *card = rtd->card; struct acp_platform_info *machine = snd_soc_card_get_drvdata(card); - machine->i2s_instance = I2S_BT_INSTANCE; + /* + * On this platform for PCM device we support stereo + */ + + runtime->hw.channels_max = DUAL_CHANNEL; + snd_pcm_hw_constraint_list(runtime, 0, SNDRV_PCM_HW_PARAM_CHANNELS, + &constraints_channels); + snd_pcm_hw_constraint_list(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, + &constraints_rates); + + machine->cap_i2s_instance = I2S_BT_INSTANCE; return da7219_clk_enable(substream); } static int cz_dmic1_startup(struct snd_pcm_substream *substream) { + struct snd_pcm_runtime *runtime = substream->runtime; struct snd_soc_pcm_runtime *rtd = substream->private_data; struct snd_soc_card *card = rtd->card; struct acp_platform_info *machine = snd_soc_card_get_drvdata(card); - machine->i2s_instance = I2S_SP_INSTANCE; + /* + * On this platform for PCM device we support stereo + */ + + runtime->hw.channels_max = DUAL_CHANNEL; + snd_pcm_hw_constraint_list(runtime, 0, SNDRV_PCM_HW_PARAM_CHANNELS, + &constraints_channels); + snd_pcm_hw_constraint_list(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, + &constraints_rates); + + machine->cap_i2s_instance = I2S_SP_INSTANCE; machine->capture_channel = CAP_CHANNEL0; return da7219_clk_enable(substream); } @@ -201,8 +255,13 @@ static void cz_dmic_shutdown(struct snd_pcm_substream *substream) da7219_clk_disable(); } +static const struct snd_soc_ops cz_da7219_play_ops = { + .startup = cz_da7219_play_startup, + .shutdown = cz_da7219_shutdown, +}; + static const struct snd_soc_ops cz_da7219_cap_ops = { - .startup = cz_da7219_startup, + .startup = cz_da7219_cap_startup, .shutdown = cz_da7219_shutdown, }; @@ -233,7 +292,7 @@ static struct snd_soc_dai_link cz_dai_7219_98357[] = { | SND_SOC_DAIFMT_CBM_CFM, .init = cz_da7219_init, .dpcm_playback = 1, - .ops = &cz_da7219_cap_ops, + .ops = &cz_da7219_play_ops, }, { .name = "amd-da7219-cap", diff --git a/sound/soc/amd/acp-pcm-dma.c b/sound/soc/amd/acp-pcm-dma.c index 77b265bd0505..cdebab2f8ce5 100644 --- a/sound/soc/amd/acp-pcm-dma.c +++ b/sound/soc/amd/acp-pcm-dma.c @@ -867,8 +867,12 @@ static int acp_dma_hw_params(struct snd_pcm_substream *substream, return -EINVAL; if (pinfo) { - rtd->i2s_instance = pinfo->i2s_instance; - rtd->capture_channel = pinfo->capture_channel; + if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { + rtd->i2s_instance = pinfo->play_i2s_instance; + } else { + rtd->i2s_instance = pinfo->cap_i2s_instance; + rtd->capture_channel = pinfo->capture_channel; + } } if (adata->asic_type == CHIP_STONEY) { val = acp_reg_read(adata->acp_mmio, @@ -1036,16 +1040,22 @@ static snd_pcm_uframes_t acp_dma_pointer(struct snd_pcm_substream *substream) if (substream->stream == SNDRV_PCM_STREAM_CAPTURE) { period_bytes = frames_to_bytes(runtime, runtime->period_size); - dscr = acp_reg_read(rtd->acp_mmio, rtd->dma_curr_dscr); - if (dscr == rtd->dma_dscr_idx_1) - pos = period_bytes; - else - pos = 0; bytescount = acp_get_byte_count(rtd); - if (bytescount > rtd->bytescount) + if (bytescount >= rtd->bytescount) bytescount -= rtd->bytescount; - delay = do_div(bytescount, period_bytes); - runtime->delay = bytes_to_frames(runtime, delay); + if (bytescount < period_bytes) { + pos = 0; + } else { + dscr = acp_reg_read(rtd->acp_mmio, rtd->dma_curr_dscr); + if (dscr == rtd->dma_dscr_idx_1) + pos = period_bytes; + else + pos = 0; + } + if (bytescount > 0) { + delay = do_div(bytescount, period_bytes); + runtime->delay = bytes_to_frames(runtime, delay); + } } else { buffersize = frames_to_bytes(runtime, runtime->buffer_size); bytescount = acp_get_byte_count(rtd); diff --git a/sound/soc/amd/acp.h b/sound/soc/amd/acp.h index be3963e8f4fa..dbbb1a85638d 100644 --- a/sound/soc/amd/acp.h +++ b/sound/soc/amd/acp.h @@ -158,7 +158,8 @@ struct audio_drv_data { * and dma driver */ struct acp_platform_info { - u16 i2s_instance; + u16 play_i2s_instance; + u16 cap_i2s_instance; u16 capture_channel; }; diff --git a/sound/soc/atmel/Kconfig b/sound/soc/atmel/Kconfig index 64b784e96f84..64f86f0b87e5 100644 --- a/sound/soc/atmel/Kconfig +++ b/sound/soc/atmel/Kconfig @@ -97,4 +97,16 @@ config SND_ATMEL_SOC_I2S help Say Y or M if you want to add support for Atmel ASoc driver for boards using I2S. + +config SND_SOC_MIKROE_PROTO + tristate "Support for Mikroe-PROTO board" + depends on OF + depends on SND_SOC_I2C_AND_SPI + select SND_SOC_WM8731 + help + Say Y or M if you want to add support for MikroElektronika PROTO Audio + Board. This board contains the WM8731 codec, which can be configured + using I2C over SDA (MPU Data Input) and SCL (MPU Clock Input) pins. + Both playback and capture are supported. + endif diff --git a/sound/soc/atmel/Makefile b/sound/soc/atmel/Makefile index cd87cb4bcff5..9f41bfa0fea3 100644 --- a/sound/soc/atmel/Makefile +++ b/sound/soc/atmel/Makefile @@ -17,6 +17,7 @@ snd-soc-sam9x5-wm8731-objs := sam9x5_wm8731.o snd-atmel-soc-classd-objs := atmel-classd.o snd-atmel-soc-pdmic-objs := atmel-pdmic.o snd-atmel-soc-tse850-pcm5142-objs := tse850-pcm5142.o +snd-soc-mikroe-proto-objs := mikroe-proto.o obj-$(CONFIG_SND_AT91_SOC_SAM9G20_WM8731) += snd-soc-sam9g20-wm8731.o obj-$(CONFIG_SND_ATMEL_SOC_WM8904) += snd-atmel-soc-wm8904.o @@ -24,3 +25,4 @@ obj-$(CONFIG_SND_AT91_SOC_SAM9X5_WM8731) += snd-soc-sam9x5-wm8731.o obj-$(CONFIG_SND_ATMEL_SOC_CLASSD) += snd-atmel-soc-classd.o obj-$(CONFIG_SND_ATMEL_SOC_PDMIC) += snd-atmel-soc-pdmic.o obj-$(CONFIG_SND_ATMEL_SOC_TSE850_PCM5142) += snd-atmel-soc-tse850-pcm5142.o +obj-$(CONFIG_SND_SOC_MIKROE_PROTO) += snd-soc-mikroe-proto.o diff --git a/sound/soc/atmel/atmel_ssc_dai.c b/sound/soc/atmel/atmel_ssc_dai.c index d3b69682d9c2..6291ec7f9dd6 100644 --- a/sound/soc/atmel/atmel_ssc_dai.c +++ b/sound/soc/atmel/atmel_ssc_dai.c @@ -1005,11 +1005,11 @@ static int asoc_ssc_init(struct device *dev) struct ssc_device *ssc = dev_get_drvdata(dev); int ret; - ret = snd_soc_register_component(dev, &atmel_ssc_component, + ret = devm_snd_soc_register_component(dev, &atmel_ssc_component, &atmel_ssc_dai, 1); if (ret) { dev_err(dev, "Could not register DAI: %d\n", ret); - goto err; + return ret; } if (ssc->pdata->use_dma) @@ -1019,15 +1019,10 @@ static int asoc_ssc_init(struct device *dev) if (ret) { dev_err(dev, "Could not register PCM: %d\n", ret); - goto err_unregister_dai; + return ret; } return 0; - -err_unregister_dai: - snd_soc_unregister_component(dev); -err: - return ret; } static void asoc_ssc_exit(struct device *dev) @@ -1038,8 +1033,6 @@ static void asoc_ssc_exit(struct device *dev) atmel_pcm_dma_platform_unregister(dev); else atmel_pcm_pdc_platform_unregister(dev); - - snd_soc_unregister_component(dev); } /** diff --git a/sound/soc/atmel/mikroe-proto.c b/sound/soc/atmel/mikroe-proto.c new file mode 100644 index 000000000000..d47aaa5bf75a --- /dev/null +++ b/sound/soc/atmel/mikroe-proto.c @@ -0,0 +1,165 @@ +/* + * ASoC driver for PROTO AudioCODEC (with a WM8731) + * + * Author: Florian Meier, + * Copyright 2013 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + +#include +#include +#include +#include + +#include "../codecs/wm8731.h" + +#define XTAL_RATE 12288000 /* This is fixed on this board */ + +static int snd_proto_init(struct snd_soc_pcm_runtime *rtd) +{ + struct snd_soc_card *card = rtd->card; + struct snd_soc_dai *codec_dai = rtd->codec_dai; + + /* Set proto sysclk */ + int ret = snd_soc_dai_set_sysclk(codec_dai, WM8731_SYSCLK_XTAL, + XTAL_RATE, SND_SOC_CLOCK_IN); + if (ret < 0) { + dev_err(card->dev, "Failed to set WM8731 SYSCLK: %d\n", + ret); + return ret; + } + + return 0; +} + +static const struct snd_soc_dapm_widget snd_proto_widget[] = { + SND_SOC_DAPM_MIC("Microphone Jack", NULL), + SND_SOC_DAPM_HP("Headphone Jack", NULL), +}; + +static const struct snd_soc_dapm_route snd_proto_route[] = { + /* speaker connected to LHPOUT/RHPOUT */ + {"Headphone Jack", NULL, "LHPOUT"}, + {"Headphone Jack", NULL, "RHPOUT"}, + + /* mic is connected to Mic Jack, with WM8731 Mic Bias */ + {"MICIN", NULL, "Mic Bias"}, + {"Mic Bias", NULL, "Microphone Jack"}, +}; + +/* audio machine driver */ +static struct snd_soc_card snd_proto = { + .name = "snd_mikroe_proto", + .owner = THIS_MODULE, + .dapm_widgets = snd_proto_widget, + .num_dapm_widgets = ARRAY_SIZE(snd_proto_widget), + .dapm_routes = snd_proto_route, + .num_dapm_routes = ARRAY_SIZE(snd_proto_route), +}; + +static int snd_proto_probe(struct platform_device *pdev) +{ + struct snd_soc_dai_link *dai; + struct device_node *np = pdev->dev.of_node; + struct device_node *codec_np, *cpu_np; + struct device_node *bitclkmaster = NULL; + struct device_node *framemaster = NULL; + unsigned int dai_fmt; + int ret = 0; + + if (!np) { + dev_err(&pdev->dev, "No device node supplied\n"); + return -EINVAL; + } + + snd_proto.dev = &pdev->dev; + ret = snd_soc_of_parse_card_name(&snd_proto, "model"); + if (ret) + return ret; + + dai = devm_kzalloc(&pdev->dev, sizeof(*dai), GFP_KERNEL); + if (!dai) + return -ENOMEM; + + snd_proto.dai_link = dai; + snd_proto.num_links = 1; + + dai->name = "WM8731"; + dai->stream_name = "WM8731 HiFi"; + dai->codec_dai_name = "wm8731-hifi"; + dai->init = &snd_proto_init; + + codec_np = of_parse_phandle(np, "audio-codec", 0); + if (!codec_np) { + dev_err(&pdev->dev, "audio-codec node missing\n"); + return -EINVAL; + } + dai->codec_of_node = codec_np; + + cpu_np = of_parse_phandle(np, "i2s-controller", 0); + if (!cpu_np) { + dev_err(&pdev->dev, "i2s-controller missing\n"); + return -EINVAL; + } + dai->cpu_of_node = cpu_np; + dai->platform_of_node = cpu_np; + + dai_fmt = snd_soc_of_parse_daifmt(np, NULL, + &bitclkmaster, &framemaster); + if (bitclkmaster != framemaster) { + dev_err(&pdev->dev, "Must be the same bitclock and frame master\n"); + return -EINVAL; + } + if (bitclkmaster) { + dai_fmt &= ~SND_SOC_DAIFMT_MASTER_MASK; + if (codec_np == bitclkmaster) + dai_fmt |= SND_SOC_DAIFMT_CBM_CFM; + else + dai_fmt |= SND_SOC_DAIFMT_CBS_CFS; + } + of_node_put(bitclkmaster); + of_node_put(framemaster); + dai->dai_fmt = dai_fmt; + + of_node_put(codec_np); + of_node_put(cpu_np); + + ret = snd_soc_register_card(&snd_proto); + if (ret && ret != -EPROBE_DEFER) + dev_err(&pdev->dev, + "snd_soc_register_card() failed: %d\n", ret); + + return ret; +} + +static int snd_proto_remove(struct platform_device *pdev) +{ + return snd_soc_unregister_card(&snd_proto); +} + +static const struct of_device_id snd_proto_of_match[] = { + { .compatible = "mikroe,mikroe-proto", }, + {}, +}; +MODULE_DEVICE_TABLE(of, snd_proto_of_match); + +static struct platform_driver snd_proto_driver = { + .driver = { + .name = "snd-mikroe-proto", + .of_match_table = snd_proto_of_match, + }, + .probe = snd_proto_probe, + .remove = snd_proto_remove, +}; + +module_platform_driver(snd_proto_driver); + +MODULE_AUTHOR("Florian Meier"); +MODULE_DESCRIPTION("ASoC Driver for PROTO board (WM8731)"); +MODULE_LICENSE("GPL"); diff --git a/sound/soc/atmel/tse850-pcm5142.c b/sound/soc/atmel/tse850-pcm5142.c index 3a1393283156..214adcad5419 100644 --- a/sound/soc/atmel/tse850-pcm5142.c +++ b/sound/soc/atmel/tse850-pcm5142.c @@ -1,44 +1,38 @@ -/* - * TSE-850 audio - ASoC driver for the Axentia TSE-850 with a PCM5142 codec - * - * Copyright (C) 2016 Axentia Technologies AB - * - * Author: Peter Rosin - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -/* - * loop1 relays - * IN1 +---o +------------+ o---+ OUT1 - * \ / - * + + - * | / | - * +--o +--. | - * | add | | - * | V | - * | .---. | - * DAC +----------->|Sum|---+ - * | '---' | - * | | - * + + - * - * IN2 +---o--+------------+--o---+ OUT2 - * loop2 relays - * - * The 'loop1' gpio pin controlls two relays, which are either in loop - * position, meaning that input and output are directly connected, or - * they are in mixer position, meaning that the signal is passed through - * the 'Sum' mixer. Similarly for 'loop2'. - * - * In the above, the 'loop1' relays are inactive, thus feeding IN1 to the - * mixer (if 'add' is active) and feeding the mixer output to OUT1. The - * 'loop2' relays are active, short-cutting the TSE-850 from channel 2. - * IN1, IN2, OUT1 and OUT2 are TSE-850 connectors and DAC is the PCB name - * of the (filtered) output from the PCM5142 codec. - */ +// SPDX-License-Identifier: GPL-2.0 +// +// TSE-850 audio - ASoC driver for the Axentia TSE-850 with a PCM5142 codec +// +// Copyright (C) 2016 Axentia Technologies AB +// +// Author: Peter Rosin +// +// loop1 relays +// IN1 +---o +------------+ o---+ OUT1 +// \ / +// + + +// | / | +// +--o +--. | +// | add | | +// | V | +// | .---. | +// DAC +----------->|Sum|---+ +// | '---' | +// | | +// + + +// +// IN2 +---o--+------------+--o---+ OUT2 +// loop2 relays +// +// The 'loop1' gpio pin controlls two relays, which are either in loop +// position, meaning that input and output are directly connected, or +// they are in mixer position, meaning that the signal is passed through +// the 'Sum' mixer. Similarly for 'loop2'. +// +// In the above, the 'loop1' relays are inactive, thus feeding IN1 to the +// mixer (if 'add' is active) and feeding the mixer output to OUT1. The +// 'loop2' relays are active, short-cutting the TSE-850 from channel 2. +// IN1, IN2, OUT1 and OUT2 are TSE-850 connectors and DAC is the PCB name +// of the (filtered) output from the PCM5142 codec. #include #include @@ -452,4 +446,4 @@ module_platform_driver(tse850_driver); /* Module information */ MODULE_AUTHOR("Peter Rosin "); MODULE_DESCRIPTION("ALSA SoC driver for TSE-850 with PCM5142 codec"); -MODULE_LICENSE("GPL"); +MODULE_LICENSE("GPL v2"); diff --git a/sound/soc/bcm/cygnus-ssp.c b/sound/soc/bcm/cygnus-ssp.c index b733f1446353..b7c358b48d8d 100644 --- a/sound/soc/bcm/cygnus-ssp.c +++ b/sound/soc/bcm/cygnus-ssp.c @@ -1334,7 +1334,7 @@ static int cygnus_ssp_probe(struct platform_device *pdev) cygaud->active_ports = 0; dev_dbg(dev, "Registering %d DAIs\n", active_port_count); - err = snd_soc_register_component(dev, &cygnus_ssp_component, + err = devm_snd_soc_register_component(dev, &cygnus_ssp_component, cygnus_ssp_dai, active_port_count); if (err) { dev_err(dev, "snd_soc_register_dai failed\n"); @@ -1345,32 +1345,27 @@ static int cygnus_ssp_probe(struct platform_device *pdev) if (cygaud->irq_num <= 0) { dev_err(dev, "platform_get_irq failed\n"); err = cygaud->irq_num; - goto err_irq; + return err; } err = audio_clk_init(pdev, cygaud); if (err) { dev_err(dev, "audio clock initialization failed\n"); - goto err_irq; + return err; } err = cygnus_soc_platform_register(dev, cygaud); if (err) { dev_err(dev, "platform reg error %d\n", err); - goto err_irq; + return err; } return 0; - -err_irq: - snd_soc_unregister_component(dev); - return err; } static int cygnus_ssp_remove(struct platform_device *pdev) { cygnus_soc_platform_unregister(&pdev->dev); - snd_soc_unregister_component(&pdev->dev); return 0; } diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index efb095dbcd71..9cc4f1848c9b 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -82,6 +82,7 @@ config SND_SOC_ALL_CODECS select SND_SOC_ES7241 select SND_SOC_GTM601 select SND_SOC_HDAC_HDMI + select SND_SOC_HDAC_HDA select SND_SOC_ICS43432 select SND_SOC_INNO_RK3036 select SND_SOC_ISABELLE if I2C @@ -109,6 +110,7 @@ config SND_SOC_ALL_CODECS select SND_SOC_MT6351 if MTK_PMIC_WRAP select SND_SOC_NAU8540 if I2C select SND_SOC_NAU8810 if I2C + select SND_SOC_NAU8822 if I2C select SND_SOC_NAU8824 if I2C select SND_SOC_NAU8825 if I2C select SND_SOC_HDMI_CODEC @@ -119,6 +121,8 @@ config SND_SOC_ALL_CODECS select SND_SOC_PCM186X_I2C if I2C select SND_SOC_PCM186X_SPI if SPI_MASTER select SND_SOC_PCM3008 + select SND_SOC_PCM3060_I2C if I2C + select SND_SOC_PCM3060_SPI if SPI_MASTER select SND_SOC_PCM3168A_I2C if I2C select SND_SOC_PCM3168A_SPI if SPI_MASTER select SND_SOC_PCM5102A @@ -575,7 +579,11 @@ config SND_SOC_DA9055 tristate config SND_SOC_DMIC - tristate + tristate "Generic Digital Microphone CODEC" + depends on GPIOLIB + help + Enable support for the Generic Digital Microphone CODEC. + Select this if your sound card has DMICs. config SND_SOC_HDMI_CODEC tristate @@ -615,6 +623,10 @@ config SND_SOC_HDAC_HDMI select SND_PCM_ELD select HDMI +config SND_SOC_HDAC_HDA + tristate + select SND_HDA + config SND_SOC_ICS43432 tristate @@ -629,7 +641,8 @@ config SND_SOC_LM49453 tristate config SND_SOC_MAX98088 - tristate + tristate "Maxim MAX98088/9 Low-Power, Stereo Audio Codec" + depends on I2C config SND_SOC_MAX98090 tristate @@ -732,6 +745,21 @@ config SND_SOC_PCM186X_SPI config SND_SOC_PCM3008 tristate +config SND_SOC_PCM3060 + tristate + +config SND_SOC_PCM3060_I2C + tristate "Texas Instruments PCM3060 CODEC - I2C" + depends on I2C + select SND_SOC_PCM3060 + select REGMAP_I2C + +config SND_SOC_PCM3060_SPI + tristate "Texas Instruments PCM3060 CODEC - SPI" + depends on SPI_MASTER + select SND_SOC_PCM3060 + select REGMAP_SPI + config SND_SOC_PCM3168A tristate @@ -1299,6 +1327,10 @@ config SND_SOC_NAU8810 tristate "Nuvoton Technology Corporation NAU88C10 CODEC" depends on I2C +config SND_SOC_NAU8822 + tristate "Nuvoton Technology Corporation NAU88C22 CODEC" + depends on I2C + config SND_SOC_NAU8824 tristate "Nuvoton Technology Corporation NAU88L24 CODEC" depends on I2C diff --git a/sound/soc/codecs/Makefile b/sound/soc/codecs/Makefile index 7ae7c85e8219..8ffab8c8dbfa 100644 --- a/sound/soc/codecs/Makefile +++ b/sound/soc/codecs/Makefile @@ -78,6 +78,7 @@ snd-soc-es8328-i2c-objs := es8328-i2c.o snd-soc-es8328-spi-objs := es8328-spi.o snd-soc-gtm601-objs := gtm601.o snd-soc-hdac-hdmi-objs := hdac_hdmi.o +snd-soc-hdac-hda-objs := hdac_hda.o snd-soc-ics43432-objs := ics43432.o snd-soc-inno-rk3036-objs := inno_rk3036.o snd-soc-isabelle-objs := isabelle.o @@ -106,6 +107,7 @@ snd-soc-msm8916-digital-objs := msm8916-wcd-digital.o snd-soc-mt6351-objs := mt6351.o snd-soc-nau8540-objs := nau8540.o snd-soc-nau8810-objs := nau8810.o +snd-soc-nau8822-objs := nau8822.o snd-soc-nau8824-objs := nau8824.o snd-soc-nau8825-objs := nau8825.o snd-soc-hdmi-codec-objs := hdmi-codec.o @@ -119,6 +121,9 @@ snd-soc-pcm186x-objs := pcm186x.o snd-soc-pcm186x-i2c-objs := pcm186x-i2c.o snd-soc-pcm186x-spi-objs := pcm186x-spi.o snd-soc-pcm3008-objs := pcm3008.o +snd-soc-pcm3060-objs := pcm3060.o +snd-soc-pcm3060-i2c-objs := pcm3060-i2c.o +snd-soc-pcm3060-spi-objs := pcm3060-spi.o snd-soc-pcm3168a-objs := pcm3168a.o snd-soc-pcm3168a-i2c-objs := pcm3168a-i2c.o snd-soc-pcm3168a-spi-objs := pcm3168a-spi.o @@ -338,6 +343,7 @@ obj-$(CONFIG_SND_SOC_ES8328_I2C)+= snd-soc-es8328-i2c.o obj-$(CONFIG_SND_SOC_ES8328_SPI)+= snd-soc-es8328-spi.o obj-$(CONFIG_SND_SOC_GTM601) += snd-soc-gtm601.o obj-$(CONFIG_SND_SOC_HDAC_HDMI) += snd-soc-hdac-hdmi.o +obj-$(CONFIG_SND_SOC_HDAC_HDA) += snd-soc-hdac-hda.o obj-$(CONFIG_SND_SOC_ICS43432) += snd-soc-ics43432.o obj-$(CONFIG_SND_SOC_INNO_RK3036) += snd-soc-inno-rk3036.o obj-$(CONFIG_SND_SOC_ISABELLE) += snd-soc-isabelle.o @@ -366,6 +372,7 @@ obj-$(CONFIG_SND_SOC_MSM8916_WCD_DIGITAL) +=snd-soc-msm8916-digital.o obj-$(CONFIG_SND_SOC_MT6351) += snd-soc-mt6351.o obj-$(CONFIG_SND_SOC_NAU8540) += snd-soc-nau8540.o obj-$(CONFIG_SND_SOC_NAU8810) += snd-soc-nau8810.o +obj-$(CONFIG_SND_SOC_NAU8822) += snd-soc-nau8822.o obj-$(CONFIG_SND_SOC_NAU8824) += snd-soc-nau8824.o obj-$(CONFIG_SND_SOC_NAU8825) += snd-soc-nau8825.o obj-$(CONFIG_SND_SOC_HDMI_CODEC) += snd-soc-hdmi-codec.o @@ -379,6 +386,9 @@ obj-$(CONFIG_SND_SOC_PCM186X) += snd-soc-pcm186x.o obj-$(CONFIG_SND_SOC_PCM186X_I2C) += snd-soc-pcm186x-i2c.o obj-$(CONFIG_SND_SOC_PCM186X_SPI) += snd-soc-pcm186x-spi.o obj-$(CONFIG_SND_SOC_PCM3008) += snd-soc-pcm3008.o +obj-$(CONFIG_SND_SOC_PCM3060) += snd-soc-pcm3060.o +obj-$(CONFIG_SND_SOC_PCM3060_I2C) += snd-soc-pcm3060-i2c.o +obj-$(CONFIG_SND_SOC_PCM3060_SPI) += snd-soc-pcm3060-spi.o obj-$(CONFIG_SND_SOC_PCM3168A) += snd-soc-pcm3168a.o obj-$(CONFIG_SND_SOC_PCM3168A_I2C) += snd-soc-pcm3168a-i2c.o obj-$(CONFIG_SND_SOC_PCM3168A_SPI) += snd-soc-pcm3168a-spi.o diff --git a/sound/soc/codecs/adau1761.c b/sound/soc/codecs/adau1761.c index be136e981653..bef3e9e74c26 100644 --- a/sound/soc/codecs/adau1761.c +++ b/sound/soc/codecs/adau1761.c @@ -518,7 +518,8 @@ static int adau1761_setup_digmic_jackdetect(struct snd_soc_component *component) ARRAY_SIZE(adau1761_jack_detect_controls)); if (ret) return ret; - case ADAU1761_DIGMIC_JACKDET_PIN_MODE_NONE: /* fallthrough */ + /* fall through */ + case ADAU1761_DIGMIC_JACKDET_PIN_MODE_NONE: ret = snd_soc_dapm_add_routes(dapm, adau1761_no_dmic_routes, ARRAY_SIZE(adau1761_no_dmic_routes)); if (ret) diff --git a/sound/soc/codecs/adau17x1.c b/sound/soc/codecs/adau17x1.c index 57169b8ff14e..3959e6ad113d 100644 --- a/sound/soc/codecs/adau17x1.c +++ b/sound/soc/codecs/adau17x1.c @@ -21,11 +21,18 @@ #include #include #include +#include #include "sigmadsp.h" #include "adau17x1.h" #include "adau-utils.h" +#define ADAU17X1_SAFELOAD_TARGET_ADDRESS 0x0006 +#define ADAU17X1_SAFELOAD_TRIGGER 0x0007 +#define ADAU17X1_SAFELOAD_DATA 0x0001 +#define ADAU17X1_SAFELOAD_DATA_SIZE 20 +#define ADAU17X1_WORD_SIZE 4 + static const char * const adau17x1_capture_mixer_boost_text[] = { "Normal operation", "Boost Level 1", "Boost Level 2", "Boost Level 3", }; @@ -60,6 +67,9 @@ static const struct snd_kcontrol_new adau17x1_controls[] = { SOC_ENUM("Mic Bias Mode", adau17x1_mic_bias_mode_enum), }; +static int adau17x1_setup_firmware(struct snd_soc_component *component, + unsigned int rate); + static int adau17x1_pll_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event) { @@ -313,7 +323,7 @@ static const struct snd_soc_dapm_route adau17x1_no_dsp_dapm_routes[] = { { "Capture", NULL, "Right Decimator" }, }; -bool adau17x1_has_dsp(struct adau *adau) +static bool adau17x1_has_dsp(struct adau *adau) { switch (adau->type) { case ADAU1761: @@ -324,7 +334,17 @@ bool adau17x1_has_dsp(struct adau *adau) return false; } } -EXPORT_SYMBOL_GPL(adau17x1_has_dsp); + +static bool adau17x1_has_safeload(struct adau *adau) +{ + switch (adau->type) { + case ADAU1761: + case ADAU1781: + return true; + default: + return false; + } +} static int adau17x1_set_dai_pll(struct snd_soc_dai *dai, int pll_id, int source, unsigned int freq_in, unsigned int freq_out) @@ -836,7 +856,7 @@ bool adau17x1_volatile_register(struct device *dev, unsigned int reg) } EXPORT_SYMBOL_GPL(adau17x1_volatile_register); -int adau17x1_setup_firmware(struct snd_soc_component *component, +static int adau17x1_setup_firmware(struct snd_soc_component *component, unsigned int rate) { int ret; @@ -880,7 +900,6 @@ err: return ret; } -EXPORT_SYMBOL_GPL(adau17x1_setup_firmware); int adau17x1_add_widgets(struct snd_soc_component *component) { @@ -957,6 +976,56 @@ int adau17x1_resume(struct snd_soc_component *component) } EXPORT_SYMBOL_GPL(adau17x1_resume); +static int adau17x1_safeload(struct sigmadsp *sigmadsp, unsigned int addr, + const uint8_t bytes[], size_t len) +{ + uint8_t buf[ADAU17X1_WORD_SIZE]; + uint8_t data[ADAU17X1_SAFELOAD_DATA_SIZE]; + unsigned int addr_offset; + unsigned int nbr_words; + int ret; + + /* write data to safeload addresses. Check if len is not a multiple of + * 4 bytes, if so we need to zero pad. + */ + nbr_words = len / ADAU17X1_WORD_SIZE; + if ((len - nbr_words * ADAU17X1_WORD_SIZE) == 0) { + ret = regmap_raw_write(sigmadsp->control_data, + ADAU17X1_SAFELOAD_DATA, bytes, len); + } else { + nbr_words++; + memset(data, 0, ADAU17X1_SAFELOAD_DATA_SIZE); + memcpy(data, bytes, len); + ret = regmap_raw_write(sigmadsp->control_data, + ADAU17X1_SAFELOAD_DATA, data, + nbr_words * ADAU17X1_WORD_SIZE); + } + + if (ret < 0) + return ret; + + /* Write target address, target address is offset by 1 */ + addr_offset = addr - 1; + put_unaligned_be32(addr_offset, buf); + ret = regmap_raw_write(sigmadsp->control_data, + ADAU17X1_SAFELOAD_TARGET_ADDRESS, buf, ADAU17X1_WORD_SIZE); + if (ret < 0) + return ret; + + /* write nbr of words to trigger address */ + put_unaligned_be32(nbr_words, buf); + ret = regmap_raw_write(sigmadsp->control_data, + ADAU17X1_SAFELOAD_TRIGGER, buf, ADAU17X1_WORD_SIZE); + if (ret < 0) + return ret; + + return 0; +} + +static const struct sigmadsp_ops adau17x1_sigmadsp_ops = { + .safeload = adau17x1_safeload, +}; + int adau17x1_probe(struct device *dev, struct regmap *regmap, enum adau17x1_type type, void (*switch_mode)(struct device *dev), const char *firmware_name) @@ -1002,8 +1071,13 @@ int adau17x1_probe(struct device *dev, struct regmap *regmap, dev_set_drvdata(dev, adau); if (firmware_name) { - adau->sigmadsp = devm_sigmadsp_init_regmap(dev, regmap, NULL, - firmware_name); + if (adau17x1_has_safeload(adau)) { + adau->sigmadsp = devm_sigmadsp_init_regmap(dev, regmap, + &adau17x1_sigmadsp_ops, firmware_name); + } else { + adau->sigmadsp = devm_sigmadsp_init_regmap(dev, regmap, + NULL, firmware_name); + } if (IS_ERR(adau->sigmadsp)) { dev_warn(dev, "Could not find firmware file: %ld\n", PTR_ERR(adau->sigmadsp)); diff --git a/sound/soc/codecs/adau17x1.h b/sound/soc/codecs/adau17x1.h index e6fe87beec07..98a3b6f5bc96 100644 --- a/sound/soc/codecs/adau17x1.h +++ b/sound/soc/codecs/adau17x1.h @@ -68,10 +68,6 @@ int adau17x1_resume(struct snd_soc_component *component); extern const struct snd_soc_dai_ops adau17x1_dai_ops; -int adau17x1_setup_firmware(struct snd_soc_component *component, - unsigned int rate); -bool adau17x1_has_dsp(struct adau *adau); - #define ADAU17X1_CLOCK_CONTROL 0x4000 #define ADAU17X1_PLL_CONTROL 0x4002 #define ADAU17X1_REC_POWER_MGMT 0x4009 diff --git a/sound/soc/codecs/cs35l33.c b/sound/soc/codecs/cs35l33.c index 668cd3754209..e9b7f72d880b 100644 --- a/sound/soc/codecs/cs35l33.c +++ b/sound/soc/codecs/cs35l33.c @@ -857,7 +857,8 @@ static const struct regmap_config cs35l33_regmap = { .readable_reg = cs35l33_readable_register, .writeable_reg = cs35l33_writeable_register, .cache_type = REGCACHE_RBTREE, - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, }; static int __maybe_unused cs35l33_runtime_resume(struct device *dev) diff --git a/sound/soc/codecs/cs35l35.c b/sound/soc/codecs/cs35l35.c index bd6226bde45f..9f4a59871cee 100644 --- a/sound/soc/codecs/cs35l35.c +++ b/sound/soc/codecs/cs35l35.c @@ -1105,7 +1105,8 @@ static struct regmap_config cs35l35_regmap = { .readable_reg = cs35l35_readable_register, .precious_reg = cs35l35_precious_register, .cache_type = REGCACHE_RBTREE, - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, }; static irqreturn_t cs35l35_irq(int irq, void *data) diff --git a/sound/soc/codecs/cs4265.c b/sound/soc/codecs/cs4265.c index 407554175282..ab27d2b94d02 100644 --- a/sound/soc/codecs/cs4265.c +++ b/sound/soc/codecs/cs4265.c @@ -154,11 +154,11 @@ static const struct snd_kcontrol_new cs4265_snd_controls[] = { SOC_SINGLE("E to F Buffer Disable Switch", CS4265_SPDIF_CTL1, 6, 1, 0), SOC_ENUM("C Data Access", cam_mode_enum), + SOC_SINGLE("SPDIF Switch", CS4265_SPDIF_CTL2, 5, 1, 1), SOC_SINGLE("Validity Bit Control Switch", CS4265_SPDIF_CTL2, 3, 1, 0), SOC_ENUM("SPDIF Mono/Stereo", spdif_mono_stereo_enum), - SOC_SINGLE("MMTLR Data Switch", CS4265_SPDIF_CTL2, - 0, 1, 0), + SOC_SINGLE("MMTLR Data Switch", CS4265_SPDIF_CTL2, 0, 1, 0), SOC_ENUM("Mono Channel Select", spdif_mono_select_enum), SND_SOC_BYTES("C Data Buffer", CS4265_C_DATA_BUFF, 24), }; @@ -221,10 +221,11 @@ static const struct snd_soc_dapm_route cs4265_audio_map[] = { {"LINEOUTR", NULL, "DAC"}, {"SPDIFOUT", NULL, "SPDIF"}, + {"Pre-amp MIC", NULL, "MICL"}, + {"Pre-amp MIC", NULL, "MICR"}, + {"ADC Mux", "MIC", "Pre-amp MIC"}, {"ADC Mux", "LINEIN", "LINEINL"}, {"ADC Mux", "LINEIN", "LINEINR"}, - {"ADC Mux", "MIC", "MICL"}, - {"ADC Mux", "MIC", "MICR"}, {"ADC", NULL, "ADC Mux"}, {"DOUT", NULL, "ADC"}, {"DAI1 Capture", NULL, "DOUT"}, @@ -496,7 +497,8 @@ static int cs4265_set_bias_level(struct snd_soc_component *component, SNDRV_PCM_RATE_176400 | SNDRV_PCM_RATE_192000) #define CS4265_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_U16_LE | \ - SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_U24_LE) + SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_U24_LE | \ + SNDRV_PCM_FMTBIT_S32_LE | SNDRV_PCM_FMTBIT_U32_LE) static const struct snd_soc_dai_ops cs4265_ops = { .hw_params = cs4265_pcm_hw_params, diff --git a/sound/soc/codecs/cs42l51.c b/sound/soc/codecs/cs42l51.c index 5080d7a3c279..fd2bd74024c1 100644 --- a/sound/soc/codecs/cs42l51.c +++ b/sound/soc/codecs/cs42l51.c @@ -21,6 +21,7 @@ * - master mode *NOT* supported */ +#include #include #include #include @@ -41,6 +42,7 @@ enum master_slave_mode { struct cs42l51_private { unsigned int mclk; + struct clk *mclk_handle; unsigned int audio_mode; /* The mode (I2S or left-justified) */ enum master_slave_mode func; }; @@ -237,6 +239,10 @@ static const struct snd_soc_dapm_widget cs42l51_dapm_widgets[] = { &cs42l51_adcr_mux_controls), }; +static const struct snd_soc_dapm_widget cs42l51_dapm_mclk_widgets[] = { + SND_SOC_DAPM_CLOCK_SUPPLY("MCLK") +}; + static const struct snd_soc_dapm_route cs42l51_routes[] = { {"HPL", NULL, "Left DAC"}, {"HPR", NULL, "Right DAC"}, @@ -487,6 +493,14 @@ static struct snd_soc_dai_driver cs42l51_dai = { static int cs42l51_component_probe(struct snd_soc_component *component) { int ret, reg; + struct snd_soc_dapm_context *dapm; + struct cs42l51_private *cs42l51; + + cs42l51 = snd_soc_component_get_drvdata(component); + dapm = snd_soc_component_get_dapm(component); + + if (cs42l51->mclk_handle) + snd_soc_dapm_new_controls(dapm, cs42l51_dapm_mclk_widgets, 1); /* * DAC configuration @@ -540,6 +554,13 @@ int cs42l51_probe(struct device *dev, struct regmap *regmap) dev_set_drvdata(dev, cs42l51); + cs42l51->mclk_handle = devm_clk_get(dev, "MCLK"); + if (IS_ERR(cs42l51->mclk_handle)) { + if (PTR_ERR(cs42l51->mclk_handle) != -ENOENT) + return PTR_ERR(cs42l51->mclk_handle); + cs42l51->mclk_handle = NULL; + } + /* Verify that we have a CS42L51 */ ret = regmap_read(regmap, CS42L51_CHIP_REV_ID, &val); if (ret < 0) { diff --git a/sound/soc/codecs/cs43130.c b/sound/soc/codecs/cs43130.c index 80dc42197154..3f7b255587e6 100644 --- a/sound/soc/codecs/cs43130.c +++ b/sound/soc/codecs/cs43130.c @@ -2362,7 +2362,9 @@ static const struct regmap_config cs43130_regmap = { .precious_reg = cs43130_precious_register, .volatile_reg = cs43130_volatile_register, .cache_type = REGCACHE_RBTREE, - .use_single_rw = true, /* needed for regcache_sync */ + /* needed for regcache_sync */ + .use_single_read = true, + .use_single_write = true, }; static u16 const cs43130_dc_threshold[CS43130_DC_THRESHOLD] = { diff --git a/sound/soc/codecs/dmic.c b/sound/soc/codecs/dmic.c index 8c4926df9286..71322e0410ee 100644 --- a/sound/soc/codecs/dmic.c +++ b/sound/soc/codecs/dmic.c @@ -148,6 +148,7 @@ static const struct of_device_id dmic_dev_match[] = { {.compatible = "dmic-codec"}, {} }; +MODULE_DEVICE_TABLE(of, dmic_dev_match); static struct platform_driver dmic_driver = { .driver = { diff --git a/sound/soc/codecs/es8328.c b/sound/soc/codecs/es8328.c index e9fc2fd97d2f..04a3aa770722 100644 --- a/sound/soc/codecs/es8328.c +++ b/sound/soc/codecs/es8328.c @@ -566,14 +566,14 @@ static int es8328_set_sysclk(struct snd_soc_dai *codec_dai, break; case 22579200: mclkdiv2 = 1; - /* fallthru */ + /* fall through */ case 11289600: es8328->sysclk_constraints = &constraints_11289; es8328->mclk_ratios = ratios_11289; break; case 24576000: mclkdiv2 = 1; - /* fallthru */ + /* fall through */ case 12288000: es8328->sysclk_constraints = &constraints_12288; es8328->mclk_ratios = ratios_12288; @@ -824,7 +824,8 @@ const struct regmap_config es8328_regmap_config = { .val_bits = 8, .max_register = ES8328_REG_MAX, .cache_type = REGCACHE_RBTREE, - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, }; EXPORT_SYMBOL_GPL(es8328_regmap_config); diff --git a/sound/soc/codecs/hdac_hda.c b/sound/soc/codecs/hdac_hda.c new file mode 100644 index 000000000000..2aaa83028e55 --- /dev/null +++ b/sound/soc/codecs/hdac_hda.c @@ -0,0 +1,483 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2015-18 Intel Corporation. + +/* + * hdac_hda.c - ASoC extensions to reuse the legacy HDA codec drivers + * with ASoC platform drivers. These APIs are called by the legacy HDA + * codec drivers using hdac_ext_bus_ops ops. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hdac_hda.h" + +#define HDAC_ANALOG_DAI_ID 0 +#define HDAC_DIGITAL_DAI_ID 1 +#define HDAC_ALT_ANALOG_DAI_ID 2 + +#define STUB_FORMATS (SNDRV_PCM_FMTBIT_S8 | \ + SNDRV_PCM_FMTBIT_U8 | \ + SNDRV_PCM_FMTBIT_S16_LE | \ + SNDRV_PCM_FMTBIT_U16_LE | \ + SNDRV_PCM_FMTBIT_S24_LE | \ + SNDRV_PCM_FMTBIT_U24_LE | \ + SNDRV_PCM_FMTBIT_S32_LE | \ + SNDRV_PCM_FMTBIT_U32_LE | \ + SNDRV_PCM_FMTBIT_IEC958_SUBFRAME_LE) + +static int hdac_hda_dai_open(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai); +static void hdac_hda_dai_close(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai); +static int hdac_hda_dai_prepare(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai); +static int hdac_hda_dai_hw_free(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai); +static int hdac_hda_dai_set_tdm_slot(struct snd_soc_dai *dai, + unsigned int tx_mask, unsigned int rx_mask, + int slots, int slot_width); +static struct hda_pcm *snd_soc_find_pcm_from_dai(struct hdac_hda_priv *hda_pvt, + struct snd_soc_dai *dai); + +static struct snd_soc_dai_ops hdac_hda_dai_ops = { + .startup = hdac_hda_dai_open, + .shutdown = hdac_hda_dai_close, + .prepare = hdac_hda_dai_prepare, + .hw_free = hdac_hda_dai_hw_free, + .set_tdm_slot = hdac_hda_dai_set_tdm_slot, +}; + +static struct snd_soc_dai_driver hdac_hda_dais[] = { +{ + .id = HDAC_ANALOG_DAI_ID, + .name = "Analog Codec DAI", + .ops = &hdac_hda_dai_ops, + .playback = { + .stream_name = "Analog Codec Playback", + .channels_min = 1, + .channels_max = 16, + .rates = SNDRV_PCM_RATE_8000_192000, + .formats = STUB_FORMATS, + .sig_bits = 24, + }, + .capture = { + .stream_name = "Analog Codec Capture", + .channels_min = 1, + .channels_max = 16, + .rates = SNDRV_PCM_RATE_8000_192000, + .formats = STUB_FORMATS, + .sig_bits = 24, + }, +}, +{ + .id = HDAC_DIGITAL_DAI_ID, + .name = "Digital Codec DAI", + .ops = &hdac_hda_dai_ops, + .playback = { + .stream_name = "Digital Codec Playback", + .channels_min = 1, + .channels_max = 16, + .rates = SNDRV_PCM_RATE_8000_192000, + .formats = STUB_FORMATS, + .sig_bits = 24, + }, + .capture = { + .stream_name = "Digital Codec Capture", + .channels_min = 1, + .channels_max = 16, + .rates = SNDRV_PCM_RATE_8000_192000, + .formats = STUB_FORMATS, + .sig_bits = 24, + }, +}, +{ + .id = HDAC_ALT_ANALOG_DAI_ID, + .name = "Alt Analog Codec DAI", + .ops = &hdac_hda_dai_ops, + .playback = { + .stream_name = "Alt Analog Codec Playback", + .channels_min = 1, + .channels_max = 16, + .rates = SNDRV_PCM_RATE_8000_192000, + .formats = STUB_FORMATS, + .sig_bits = 24, + }, + .capture = { + .stream_name = "Alt Analog Codec Capture", + .channels_min = 1, + .channels_max = 16, + .rates = SNDRV_PCM_RATE_8000_192000, + .formats = STUB_FORMATS, + .sig_bits = 24, + }, +} + +}; + +static int hdac_hda_dai_set_tdm_slot(struct snd_soc_dai *dai, + unsigned int tx_mask, unsigned int rx_mask, + int slots, int slot_width) +{ + struct snd_soc_component *component = dai->component; + struct hdac_hda_priv *hda_pvt; + struct hdac_hda_pcm *pcm; + + hda_pvt = snd_soc_component_get_drvdata(component); + pcm = &hda_pvt->pcm[dai->id]; + if (tx_mask) + pcm[dai->id].stream_tag[SNDRV_PCM_STREAM_PLAYBACK] = tx_mask; + else + pcm[dai->id].stream_tag[SNDRV_PCM_STREAM_CAPTURE] = rx_mask; + + return 0; +} + +static int hdac_hda_dai_hw_free(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai) +{ + struct snd_soc_component *component = dai->component; + struct hdac_hda_priv *hda_pvt; + struct hda_pcm_stream *hda_stream; + struct hda_pcm *pcm; + + hda_pvt = snd_soc_component_get_drvdata(component); + pcm = snd_soc_find_pcm_from_dai(hda_pvt, dai); + if (!pcm) + return -EINVAL; + + hda_stream = &pcm->stream[substream->stream]; + snd_hda_codec_cleanup(&hda_pvt->codec, hda_stream, substream); + + return 0; +} + +static int hdac_hda_dai_prepare(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai) +{ + struct snd_soc_component *component = dai->component; + struct hdac_hda_priv *hda_pvt; + struct snd_pcm_runtime *runtime = substream->runtime; + struct hdac_device *hdev; + struct hda_pcm_stream *hda_stream; + unsigned int format_val; + struct hda_pcm *pcm; + unsigned int stream; + int ret = 0; + + hda_pvt = snd_soc_component_get_drvdata(component); + hdev = &hda_pvt->codec.core; + pcm = snd_soc_find_pcm_from_dai(hda_pvt, dai); + if (!pcm) + return -EINVAL; + + hda_stream = &pcm->stream[substream->stream]; + + format_val = snd_hdac_calc_stream_format(runtime->rate, + runtime->channels, + runtime->format, + hda_stream->maxbps, + 0); + if (!format_val) { + dev_err(&hdev->dev, + "invalid format_val, rate=%d, ch=%d, format=%d\n", + runtime->rate, runtime->channels, runtime->format); + return -EINVAL; + } + + stream = hda_pvt->pcm[dai->id].stream_tag[substream->stream]; + + ret = snd_hda_codec_prepare(&hda_pvt->codec, hda_stream, + stream, format_val, substream); + if (ret < 0) + dev_err(&hdev->dev, "codec prepare failed %d\n", ret); + + return ret; +} + +static int hdac_hda_dai_open(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai) +{ + struct snd_soc_component *component = dai->component; + struct hdac_hda_priv *hda_pvt; + struct hda_pcm_stream *hda_stream; + struct hda_pcm *pcm; + int ret; + + hda_pvt = snd_soc_component_get_drvdata(component); + pcm = snd_soc_find_pcm_from_dai(hda_pvt, dai); + if (!pcm) + return -EINVAL; + + snd_hda_codec_pcm_get(pcm); + + hda_stream = &pcm->stream[substream->stream]; + + ret = hda_stream->ops.open(hda_stream, &hda_pvt->codec, substream); + if (ret < 0) + snd_hda_codec_pcm_put(pcm); + + return ret; +} + +static void hdac_hda_dai_close(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai) +{ + struct snd_soc_component *component = dai->component; + struct hdac_hda_priv *hda_pvt; + struct hda_pcm_stream *hda_stream; + struct hda_pcm *pcm; + + hda_pvt = snd_soc_component_get_drvdata(component); + pcm = snd_soc_find_pcm_from_dai(hda_pvt, dai); + if (!pcm) + return; + + hda_stream = &pcm->stream[substream->stream]; + + hda_stream->ops.close(hda_stream, &hda_pvt->codec, substream); + + snd_hda_codec_pcm_put(pcm); +} + +static struct hda_pcm *snd_soc_find_pcm_from_dai(struct hdac_hda_priv *hda_pvt, + struct snd_soc_dai *dai) +{ + struct hda_codec *hcodec = &hda_pvt->codec; + struct hda_pcm *cpcm; + const char *pcm_name; + + switch (dai->id) { + case HDAC_ANALOG_DAI_ID: + pcm_name = "Analog"; + break; + case HDAC_DIGITAL_DAI_ID: + pcm_name = "Digital"; + break; + case HDAC_ALT_ANALOG_DAI_ID: + pcm_name = "Alt Analog"; + break; + default: + dev_err(&hcodec->core.dev, "invalid dai id %d\n", dai->id); + return NULL; + } + + list_for_each_entry(cpcm, &hcodec->pcm_list_head, list) { + if (strpbrk(cpcm->name, pcm_name)) + return cpcm; + } + + dev_err(&hcodec->core.dev, "didn't find PCM for DAI %s\n", dai->name); + return NULL; +} + +static int hdac_hda_codec_probe(struct snd_soc_component *component) +{ + struct hdac_hda_priv *hda_pvt = + snd_soc_component_get_drvdata(component); + struct snd_soc_dapm_context *dapm = + snd_soc_component_get_dapm(component); + struct hdac_device *hdev = &hda_pvt->codec.core; + struct hda_codec *hcodec = &hda_pvt->codec; + struct hdac_ext_link *hlink; + hda_codec_patch_t patch; + int ret; + + hlink = snd_hdac_ext_bus_get_link(hdev->bus, dev_name(&hdev->dev)); + if (!hlink) { + dev_err(&hdev->dev, "hdac link not found\n"); + return -EIO; + } + + snd_hdac_ext_bus_link_get(hdev->bus, hlink); + + ret = snd_hda_codec_device_new(hcodec->bus, component->card->snd_card, + hdev->addr, hcodec); + if (ret < 0) { + dev_err(&hdev->dev, "failed to create hda codec %d\n", ret); + goto error_no_pm; + } + + /* + * snd_hda_codec_device_new decrements the usage count so call get pm + * else the device will be powered off + */ + pm_runtime_get_noresume(&hdev->dev); + + hcodec->bus->card = dapm->card->snd_card; + + ret = snd_hda_codec_set_name(hcodec, hcodec->preset->name); + if (ret < 0) { + dev_err(&hdev->dev, "name failed %s\n", hcodec->preset->name); + goto error; + } + + ret = snd_hdac_regmap_init(&hcodec->core); + if (ret < 0) { + dev_err(&hdev->dev, "regmap init failed\n"); + goto error; + } + + patch = (hda_codec_patch_t)hcodec->preset->driver_data; + if (patch) { + ret = patch(hcodec); + if (ret < 0) { + dev_err(&hdev->dev, "patch failed %d\n", ret); + goto error; + } + } else { + dev_dbg(&hdev->dev, "no patch file found\n"); + } + + ret = snd_hda_codec_parse_pcms(hcodec); + if (ret < 0) { + dev_err(&hdev->dev, "unable to map pcms to dai %d\n", ret); + goto error; + } + + ret = snd_hda_codec_build_controls(hcodec); + if (ret < 0) { + dev_err(&hdev->dev, "unable to create controls %d\n", ret); + goto error; + } + + hcodec->core.lazy_cache = true; + + /* + * hdac_device core already sets the state to active and calls + * get_noresume. So enable runtime and set the device to suspend. + * pm_runtime_enable is also called during codec registeration + */ + pm_runtime_put(&hdev->dev); + pm_runtime_suspend(&hdev->dev); + + return 0; + +error: + pm_runtime_put(&hdev->dev); +error_no_pm: + snd_hdac_ext_bus_link_put(hdev->bus, hlink); + return ret; +} + +static void hdac_hda_codec_remove(struct snd_soc_component *component) +{ + struct hdac_hda_priv *hda_pvt = + snd_soc_component_get_drvdata(component); + struct hdac_device *hdev = &hda_pvt->codec.core; + struct hdac_ext_link *hlink = NULL; + + hlink = snd_hdac_ext_bus_get_link(hdev->bus, dev_name(&hdev->dev)); + if (!hlink) { + dev_err(&hdev->dev, "hdac link not found\n"); + return; + } + + snd_hdac_ext_bus_link_put(hdev->bus, hlink); + pm_runtime_disable(&hdev->dev); +} + +static const struct snd_soc_dapm_route hdac_hda_dapm_routes[] = { + {"AIF1TX", NULL, "Codec Input Pin1"}, + {"AIF2TX", NULL, "Codec Input Pin2"}, + {"AIF3TX", NULL, "Codec Input Pin3"}, + + {"Codec Output Pin1", NULL, "AIF1RX"}, + {"Codec Output Pin2", NULL, "AIF2RX"}, + {"Codec Output Pin3", NULL, "AIF3RX"}, +}; + +static const struct snd_soc_dapm_widget hdac_hda_dapm_widgets[] = { + /* Audio Interface */ + SND_SOC_DAPM_AIF_IN("AIF1RX", "Analog Codec Playback", 0, + SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_AIF_IN("AIF2RX", "Digital Codec Playback", 0, + SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_AIF_IN("AIF3RX", "Alt Analog Codec Playback", 0, + SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_AIF_OUT("AIF1TX", "Analog Codec Capture", 0, + SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_AIF_OUT("AIF2TX", "Digital Codec Capture", 0, + SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_AIF_OUT("AIF3TX", "Alt Analog Codec Capture", 0, + SND_SOC_NOPM, 0, 0), + + /* Input Pins */ + SND_SOC_DAPM_INPUT("Codec Input Pin1"), + SND_SOC_DAPM_INPUT("Codec Input Pin2"), + SND_SOC_DAPM_INPUT("Codec Input Pin3"), + + /* Output Pins */ + SND_SOC_DAPM_OUTPUT("Codec Output Pin1"), + SND_SOC_DAPM_OUTPUT("Codec Output Pin2"), + SND_SOC_DAPM_OUTPUT("Codec Output Pin3"), +}; + +static const struct snd_soc_component_driver hdac_hda_codec = { + .probe = hdac_hda_codec_probe, + .remove = hdac_hda_codec_remove, + .idle_bias_on = false, + .dapm_widgets = hdac_hda_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(hdac_hda_dapm_widgets), + .dapm_routes = hdac_hda_dapm_routes, + .num_dapm_routes = ARRAY_SIZE(hdac_hda_dapm_routes), +}; + +static int hdac_hda_dev_probe(struct hdac_device *hdev) +{ + struct hdac_ext_link *hlink; + struct hdac_hda_priv *hda_pvt; + int ret; + + /* hold the ref while we probe */ + hlink = snd_hdac_ext_bus_get_link(hdev->bus, dev_name(&hdev->dev)); + if (!hlink) { + dev_err(&hdev->dev, "hdac link not found\n"); + return -EIO; + } + snd_hdac_ext_bus_link_get(hdev->bus, hlink); + + hda_pvt = hdac_to_hda_priv(hdev); + if (!hda_pvt) + return -ENOMEM; + + /* ASoC specific initialization */ + ret = devm_snd_soc_register_component(&hdev->dev, + &hdac_hda_codec, hdac_hda_dais, + ARRAY_SIZE(hdac_hda_dais)); + if (ret < 0) { + dev_err(&hdev->dev, "failed to register HDA codec %d\n", ret); + return ret; + } + + dev_set_drvdata(&hdev->dev, hda_pvt); + snd_hdac_ext_bus_link_put(hdev->bus, hlink); + + return ret; +} + +static int hdac_hda_dev_remove(struct hdac_device *hdev) +{ + return 0; +} + +static struct hdac_ext_bus_ops hdac_ops = { + .hdev_attach = hdac_hda_dev_probe, + .hdev_detach = hdac_hda_dev_remove, +}; + +struct hdac_ext_bus_ops *snd_soc_hdac_hda_get_ops(void) +{ + return &hdac_ops; +} +EXPORT_SYMBOL_GPL(snd_soc_hdac_hda_get_ops); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("ASoC Extensions for legacy HDA Drivers"); +MODULE_AUTHOR("Rakesh Ughreja"); diff --git a/sound/soc/codecs/hdac_hda.h b/sound/soc/codecs/hdac_hda.h new file mode 100644 index 000000000000..e444ef593360 --- /dev/null +++ b/sound/soc/codecs/hdac_hda.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright(c) 2015-18 Intel Corporation. + */ + +#ifndef __HDAC_HDA_H__ +#define __HDAC_HDA_H__ + +struct hdac_hda_pcm { + int stream_tag[2]; +}; + +struct hdac_hda_priv { + struct hda_codec codec; + struct hdac_hda_pcm pcm[2]; +}; + +#define hdac_to_hda_priv(_hdac) \ + container_of(_hdac, struct hdac_hda_priv, codec.core) +#define hdac_to_hda_codec(_hdac) container_of(_hdac, struct hda_codec, core) + +struct hdac_ext_bus_ops *snd_soc_hdac_hda_get_ops(void); + +#endif /* __HDAC_HDA_H__ */ diff --git a/sound/soc/codecs/hdac_hdmi.c b/sound/soc/codecs/hdac_hdmi.c index 7b8533abf637..4e9854889a95 100644 --- a/sound/soc/codecs/hdac_hdmi.c +++ b/sound/soc/codecs/hdac_hdmi.c @@ -1410,6 +1410,12 @@ static int hdac_hdmi_create_dais(struct hdac_device *hdev, if (ret) return ret; + /* Filter out 44.1, 88.2 and 176.4Khz */ + rates &= ~(SNDRV_PCM_RATE_44100 | SNDRV_PCM_RATE_88200 | + SNDRV_PCM_RATE_176400); + if (!rates) + return -EINVAL; + sprintf(dai_name, "intel-hdmi-hifi%d", i+1); hdmi_dais[i].name = devm_kstrdup(&hdev->dev, dai_name, GFP_KERNEL); @@ -1598,7 +1604,7 @@ static struct snd_pcm *hdac_hdmi_get_pcm_from_id(struct snd_soc_card *card, { struct snd_soc_pcm_runtime *rtd; - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { if (rtd->pcm && (rtd->pcm->device == device)) return rtd->pcm; } @@ -1961,9 +1967,6 @@ static int hdac_hdmi_get_spk_alloc(struct hdac_device *hdev, int pcm_idx) port = list_first_entry(&pcm->port_list, struct hdac_hdmi_port, head); - if (!port) - return 0; - if (!port || !port->eld.eld_valid) return 0; diff --git a/sound/soc/codecs/max98088.c b/sound/soc/codecs/max98088.c index fb515aaa54fc..ca172a4b6849 100644 --- a/sound/soc/codecs/max98088.c +++ b/sound/soc/codecs/max98088.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,7 @@ struct max98088_priv { struct regmap *regmap; enum max98088_type devtype; struct max98088_pdata *pdata; + struct clk *mclk; unsigned int sysclk; struct max98088_cdata dai[2]; int eq_textcnt; @@ -1103,6 +1105,11 @@ static int max98088_dai_set_sysclk(struct snd_soc_dai *dai, if (freq == max98088->sysclk) return 0; + if (!IS_ERR(max98088->mclk)) { + freq = clk_round_rate(max98088->mclk, freq); + clk_set_rate(max98088->mclk, freq); + } + /* Setup clocks for slave mode, and using the PLL * PSCLK = 0x01 (when master clk is 10MHz to 20MHz) * 0x02 (when master clk is 20MHz to 30MHz).. @@ -1310,6 +1317,20 @@ static int max98088_set_bias_level(struct snd_soc_component *component, break; case SND_SOC_BIAS_PREPARE: + /* + * SND_SOC_BIAS_PREPARE is called while preparing for a + * transition to ON or away from ON. If current bias_level + * is SND_SOC_BIAS_ON, then it is preparing for a transition + * away from ON. Disable the clock in that case, otherwise + * enable it. + */ + if (!IS_ERR(max98088->mclk)) { + if (snd_soc_component_get_bias_level(component) == + SND_SOC_BIAS_ON) + clk_disable_unprepare(max98088->mclk); + else + clk_prepare_enable(max98088->mclk); + } break; case SND_SOC_BIAS_STANDBY: @@ -1725,6 +1746,11 @@ static int max98088_i2c_probe(struct i2c_client *i2c, if (IS_ERR(max98088->regmap)) return PTR_ERR(max98088->regmap); + max98088->mclk = devm_clk_get(&i2c->dev, "mclk"); + if (IS_ERR(max98088->mclk)) + if (PTR_ERR(max98088->mclk) == -EPROBE_DEFER) + return PTR_ERR(max98088->mclk); + max98088->devtype = id->driver_data; i2c_set_clientdata(i2c, max98088); @@ -1742,9 +1768,19 @@ static const struct i2c_device_id max98088_i2c_id[] = { }; MODULE_DEVICE_TABLE(i2c, max98088_i2c_id); +#if defined(CONFIG_OF) +static const struct of_device_id max98088_of_match[] = { + { .compatible = "maxim,max98088" }, + { .compatible = "maxim,max98089" }, + { } +}; +MODULE_DEVICE_TABLE(of, max98088_of_match); +#endif + static struct i2c_driver max98088_i2c_driver = { .driver = { .name = "max98088", + .of_match_table = of_match_ptr(max98088_of_match), }, .probe = max98088_i2c_probe, .id_table = max98088_i2c_id, diff --git a/sound/soc/codecs/max98373.c b/sound/soc/codecs/max98373.c index 1093f766d0d2..a09d01318f79 100644 --- a/sound/soc/codecs/max98373.c +++ b/sound/soc/codecs/max98373.c @@ -2,6 +2,7 @@ // Copyright (c) 2017, Maxim Integrated #include +#include #include #include #include @@ -454,7 +455,7 @@ SND_SOC_DAPM_SIGGEN("IMON"), SND_SOC_DAPM_SIGGEN("FBMON"), }; -static DECLARE_TLV_DB_SCALE(max98373_digital_tlv, 0, -50, 0); +static DECLARE_TLV_DB_SCALE(max98373_digital_tlv, -6350, 50, 1); static const DECLARE_TLV_DB_RANGE(max98373_spk_tlv, 0, 8, TLV_DB_SCALE_ITEM(0, 50, 0), 9, 10, TLV_DB_SCALE_ITEM(500, 100, 0), @@ -470,19 +471,19 @@ static const DECLARE_TLV_DB_RANGE(max98373_dht_spkgain_min_tlv, 0, 9, TLV_DB_SCALE_ITEM(800, 100, 0), ); static const DECLARE_TLV_DB_RANGE(max98373_dht_rotation_point_tlv, - 0, 1, TLV_DB_SCALE_ITEM(-50, -50, 0), - 2, 7, TLV_DB_SCALE_ITEM(-200, -100, 0), - 8, 9, TLV_DB_SCALE_ITEM(-1000, -200, 0), - 10, 11, TLV_DB_SCALE_ITEM(-1500, -300, 0), - 12, 13, TLV_DB_SCALE_ITEM(-2000, -200, 0), - 14, 15, TLV_DB_SCALE_ITEM(-2500, -500, 0), + 0, 1, TLV_DB_SCALE_ITEM(-3000, 500, 0), + 2, 4, TLV_DB_SCALE_ITEM(-2200, 200, 0), + 5, 6, TLV_DB_SCALE_ITEM(-1500, 300, 0), + 7, 9, TLV_DB_SCALE_ITEM(-1000, 200, 0), + 10, 13, TLV_DB_SCALE_ITEM(-500, 100, 0), + 14, 15, TLV_DB_SCALE_ITEM(-100, 50, 0), ); static const DECLARE_TLV_DB_RANGE(max98373_limiter_thresh_tlv, - 0, 15, TLV_DB_SCALE_ITEM(0, -100, 0), + 0, 15, TLV_DB_SCALE_ITEM(-1500, 100, 0), ); static const DECLARE_TLV_DB_RANGE(max98373_bde_gain_tlv, - 0, 60, TLV_DB_SCALE_ITEM(0, -25, 0), + 0, 60, TLV_DB_SCALE_ITEM(-1500, 25, 0), ); static bool max98373_readable_register(struct device *dev, unsigned int reg) @@ -604,7 +605,7 @@ SOC_SINGLE("Dither Switch", MAX98373_R203F_AMP_DSP_CFG, SOC_SINGLE("DC Blocker Switch", MAX98373_R203F_AMP_DSP_CFG, MAX98373_AMP_DSP_CFG_DCBLK_SHIFT, 1, 0), SOC_SINGLE_TLV("Digital Volume", MAX98373_R203D_AMP_DIG_VOL_CTRL, - 0, 0x7F, 0, max98373_digital_tlv), + 0, 0x7F, 1, max98373_digital_tlv), SOC_SINGLE_TLV("Speaker Volume", MAX98373_R203E_AMP_PATH_GAIN, MAX98373_SPK_DIGI_GAIN_SHIFT, 10, 0, max98373_spk_tlv), SOC_SINGLE_TLV("FS Max Volume", MAX98373_R203E_AMP_PATH_GAIN, @@ -616,7 +617,7 @@ SOC_SINGLE("DHT Switch", MAX98373_R20D4_DHT_EN, SOC_SINGLE_TLV("DHT Min Volume", MAX98373_R20D1_DHT_CFG, MAX98373_DHT_SPK_GAIN_MIN_SHIFT, 9, 0, max98373_dht_spkgain_min_tlv), SOC_SINGLE_TLV("DHT Rot Pnt Volume", MAX98373_R20D1_DHT_CFG, - MAX98373_DHT_ROT_PNT_SHIFT, 15, 0, max98373_dht_rotation_point_tlv), + MAX98373_DHT_ROT_PNT_SHIFT, 15, 1, max98373_dht_rotation_point_tlv), SOC_SINGLE_TLV("DHT Attack Step Volume", MAX98373_R20D2_DHT_ATTACK_CFG, MAX98373_DHT_ATTACK_STEP_SHIFT, 4, 0, max98373_dht_step_size_tlv), SOC_SINGLE_TLV("DHT Release Step Volume", MAX98373_R20D3_DHT_RELEASE_CFG, @@ -653,29 +654,29 @@ SOC_SINGLE("BDE Hold Time", MAX98373_R2090_BDE_LVL_HOLD, 0, 0xFF, 0), SOC_SINGLE("BDE Attack Rate", MAX98373_R2091_BDE_GAIN_ATK_REL_RATE, 4, 0xF, 0), SOC_SINGLE("BDE Release Rate", MAX98373_R2091_BDE_GAIN_ATK_REL_RATE, 0, 0xF, 0), SOC_SINGLE_TLV("BDE LVL1 Clip Thresh Volume", MAX98373_R20A9_BDE_L1_CFG_2, - 0, 0x3C, 0, max98373_bde_gain_tlv), + 0, 0x3C, 1, max98373_bde_gain_tlv), SOC_SINGLE_TLV("BDE LVL2 Clip Thresh Volume", MAX98373_R20AC_BDE_L2_CFG_2, - 0, 0x3C, 0, max98373_bde_gain_tlv), + 0, 0x3C, 1, max98373_bde_gain_tlv), SOC_SINGLE_TLV("BDE LVL3 Clip Thresh Volume", MAX98373_R20AF_BDE_L3_CFG_2, - 0, 0x3C, 0, max98373_bde_gain_tlv), + 0, 0x3C, 1, max98373_bde_gain_tlv), SOC_SINGLE_TLV("BDE LVL4 Clip Thresh Volume", MAX98373_R20B2_BDE_L4_CFG_2, - 0, 0x3C, 0, max98373_bde_gain_tlv), + 0, 0x3C, 1, max98373_bde_gain_tlv), SOC_SINGLE_TLV("BDE LVL1 Clip Reduction Volume", MAX98373_R20AA_BDE_L1_CFG_3, - 0, 0x3C, 0, max98373_bde_gain_tlv), + 0, 0x3C, 1, max98373_bde_gain_tlv), SOC_SINGLE_TLV("BDE LVL2 Clip Reduction Volume", MAX98373_R20AD_BDE_L2_CFG_3, - 0, 0x3C, 0, max98373_bde_gain_tlv), + 0, 0x3C, 1, max98373_bde_gain_tlv), SOC_SINGLE_TLV("BDE LVL3 Clip Reduction Volume", MAX98373_R20B0_BDE_L3_CFG_3, - 0, 0x3C, 0, max98373_bde_gain_tlv), + 0, 0x3C, 1, max98373_bde_gain_tlv), SOC_SINGLE_TLV("BDE LVL4 Clip Reduction Volume", MAX98373_R20B3_BDE_L4_CFG_3, - 0, 0x3C, 0, max98373_bde_gain_tlv), + 0, 0x3C, 1, max98373_bde_gain_tlv), SOC_SINGLE_TLV("BDE LVL1 Limiter Thresh Volume", MAX98373_R20A8_BDE_L1_CFG_1, - 0, 0xF, 0, max98373_limiter_thresh_tlv), + 0, 0xF, 1, max98373_limiter_thresh_tlv), SOC_SINGLE_TLV("BDE LVL2 Limiter Thresh Volume", MAX98373_R20AB_BDE_L2_CFG_1, - 0, 0xF, 0, max98373_limiter_thresh_tlv), + 0, 0xF, 1, max98373_limiter_thresh_tlv), SOC_SINGLE_TLV("BDE LVL3 Limiter Thresh Volume", MAX98373_R20AE_BDE_L3_CFG_1, - 0, 0xF, 0, max98373_limiter_thresh_tlv), + 0, 0xF, 1, max98373_limiter_thresh_tlv), SOC_SINGLE_TLV("BDE LVL4 Limiter Thresh Volume", MAX98373_R20B1_BDE_L4_CFG_1, - 0, 0xF, 0, max98373_limiter_thresh_tlv), + 0, 0xF, 1, max98373_limiter_thresh_tlv), /* Limiter */ SOC_SINGLE("Limiter Switch", MAX98373_R20E2_LIMITER_EN, MAX98373_LIMITER_EN_SHIFT, 1, 0), diff --git a/sound/soc/codecs/nau8822.c b/sound/soc/codecs/nau8822.c new file mode 100644 index 000000000000..622ce947f134 --- /dev/null +++ b/sound/soc/codecs/nau8822.c @@ -0,0 +1,1136 @@ +/* + * nau8822.c -- NAU8822 ALSA Soc Audio Codec driver + * + * Copyright 2017 Nuvoton Technology Corp. + * + * Author: David Lin + * Co-author: John Hsu + * Co-author: Seven Li + * + * Based on WM8974.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nau8822.h" + +#define NAU_PLL_FREQ_MAX 100000000 +#define NAU_PLL_FREQ_MIN 90000000 +#define NAU_PLL_REF_MAX 33000000 +#define NAU_PLL_REF_MIN 8000000 +#define NAU_PLL_OPTOP_MIN 6 + +static const int nau8822_mclk_scaler[] = { 10, 15, 20, 30, 40, 60, 80, 120 }; + +static const struct reg_default nau8822_reg_defaults[] = { + { NAU8822_REG_POWER_MANAGEMENT_1, 0x0000 }, + { NAU8822_REG_POWER_MANAGEMENT_2, 0x0000 }, + { NAU8822_REG_POWER_MANAGEMENT_3, 0x0000 }, + { NAU8822_REG_AUDIO_INTERFACE, 0x0050 }, + { NAU8822_REG_COMPANDING_CONTROL, 0x0000 }, + { NAU8822_REG_CLOCKING, 0x0140 }, + { NAU8822_REG_ADDITIONAL_CONTROL, 0x0000 }, + { NAU8822_REG_GPIO_CONTROL, 0x0000 }, + { NAU8822_REG_JACK_DETECT_CONTROL_1, 0x0000 }, + { NAU8822_REG_DAC_CONTROL, 0x0000 }, + { NAU8822_REG_LEFT_DAC_DIGITAL_VOLUME, 0x00ff }, + { NAU8822_REG_RIGHT_DAC_DIGITAL_VOLUME, 0x00ff }, + { NAU8822_REG_JACK_DETECT_CONTROL_2, 0x0000 }, + { NAU8822_REG_ADC_CONTROL, 0x0100 }, + { NAU8822_REG_LEFT_ADC_DIGITAL_VOLUME, 0x00ff }, + { NAU8822_REG_RIGHT_ADC_DIGITAL_VOLUME, 0x00ff }, + { NAU8822_REG_EQ1, 0x012c }, + { NAU8822_REG_EQ2, 0x002c }, + { NAU8822_REG_EQ3, 0x002c }, + { NAU8822_REG_EQ4, 0x002c }, + { NAU8822_REG_EQ5, 0x002c }, + { NAU8822_REG_DAC_LIMITER_1, 0x0032 }, + { NAU8822_REG_DAC_LIMITER_2, 0x0000 }, + { NAU8822_REG_NOTCH_FILTER_1, 0x0000 }, + { NAU8822_REG_NOTCH_FILTER_2, 0x0000 }, + { NAU8822_REG_NOTCH_FILTER_3, 0x0000 }, + { NAU8822_REG_NOTCH_FILTER_4, 0x0000 }, + { NAU8822_REG_ALC_CONTROL_1, 0x0038 }, + { NAU8822_REG_ALC_CONTROL_2, 0x000b }, + { NAU8822_REG_ALC_CONTROL_3, 0x0032 }, + { NAU8822_REG_NOISE_GATE, 0x0010 }, + { NAU8822_REG_PLL_N, 0x0008 }, + { NAU8822_REG_PLL_K1, 0x000c }, + { NAU8822_REG_PLL_K2, 0x0093 }, + { NAU8822_REG_PLL_K3, 0x00e9 }, + { NAU8822_REG_3D_CONTROL, 0x0000 }, + { NAU8822_REG_RIGHT_SPEAKER_CONTROL, 0x0000 }, + { NAU8822_REG_INPUT_CONTROL, 0x0033 }, + { NAU8822_REG_LEFT_INP_PGA_CONTROL, 0x0010 }, + { NAU8822_REG_RIGHT_INP_PGA_CONTROL, 0x0010 }, + { NAU8822_REG_LEFT_ADC_BOOST_CONTROL, 0x0100 }, + { NAU8822_REG_RIGHT_ADC_BOOST_CONTROL, 0x0100 }, + { NAU8822_REG_OUTPUT_CONTROL, 0x0002 }, + { NAU8822_REG_LEFT_MIXER_CONTROL, 0x0001 }, + { NAU8822_REG_RIGHT_MIXER_CONTROL, 0x0001 }, + { NAU8822_REG_LHP_VOLUME, 0x0039 }, + { NAU8822_REG_RHP_VOLUME, 0x0039 }, + { NAU8822_REG_LSPKOUT_VOLUME, 0x0039 }, + { NAU8822_REG_RSPKOUT_VOLUME, 0x0039 }, + { NAU8822_REG_AUX2_MIXER, 0x0001 }, + { NAU8822_REG_AUX1_MIXER, 0x0001 }, + { NAU8822_REG_POWER_MANAGEMENT_4, 0x0000 }, + { NAU8822_REG_LEFT_TIME_SLOT, 0x0000 }, + { NAU8822_REG_MISC, 0x0020 }, + { NAU8822_REG_RIGHT_TIME_SLOT, 0x0000 }, + { NAU8822_REG_DEVICE_REVISION, 0x007f }, + { NAU8822_REG_DEVICE_ID, 0x001a }, + { NAU8822_REG_DAC_DITHER, 0x0114 }, + { NAU8822_REG_ALC_ENHANCE_1, 0x0000 }, + { NAU8822_REG_ALC_ENHANCE_2, 0x0000 }, + { NAU8822_REG_192KHZ_SAMPLING, 0x0008 }, + { NAU8822_REG_MISC_CONTROL, 0x0000 }, + { NAU8822_REG_INPUT_TIEOFF, 0x0000 }, + { NAU8822_REG_POWER_REDUCTION, 0x0000 }, + { NAU8822_REG_AGC_PEAK2PEAK, 0x0000 }, + { NAU8822_REG_AGC_PEAK_DETECT, 0x0000 }, + { NAU8822_REG_AUTOMUTE_CONTROL, 0x0000 }, + { NAU8822_REG_OUTPUT_TIEOFF, 0x0000 }, +}; + +static bool nau8822_readable_reg(struct device *dev, unsigned int reg) +{ + switch (reg) { + case NAU8822_REG_RESET ... NAU8822_REG_JACK_DETECT_CONTROL_1: + case NAU8822_REG_DAC_CONTROL ... NAU8822_REG_LEFT_ADC_DIGITAL_VOLUME: + case NAU8822_REG_RIGHT_ADC_DIGITAL_VOLUME: + case NAU8822_REG_EQ1 ... NAU8822_REG_EQ5: + case NAU8822_REG_DAC_LIMITER_1 ... NAU8822_REG_DAC_LIMITER_2: + case NAU8822_REG_NOTCH_FILTER_1 ... NAU8822_REG_NOTCH_FILTER_4: + case NAU8822_REG_ALC_CONTROL_1 ...NAU8822_REG_PLL_K3: + case NAU8822_REG_3D_CONTROL: + case NAU8822_REG_RIGHT_SPEAKER_CONTROL: + case NAU8822_REG_INPUT_CONTROL ... NAU8822_REG_LEFT_ADC_BOOST_CONTROL: + case NAU8822_REG_RIGHT_ADC_BOOST_CONTROL ... NAU8822_REG_AUX1_MIXER: + case NAU8822_REG_POWER_MANAGEMENT_4 ... NAU8822_REG_DEVICE_ID: + case NAU8822_REG_DAC_DITHER: + case NAU8822_REG_ALC_ENHANCE_1 ... NAU8822_REG_MISC_CONTROL: + case NAU8822_REG_INPUT_TIEOFF ... NAU8822_REG_OUTPUT_TIEOFF: + return true; + default: + return false; + } +} + +static bool nau8822_writeable_reg(struct device *dev, unsigned int reg) +{ + switch (reg) { + case NAU8822_REG_RESET ... NAU8822_REG_JACK_DETECT_CONTROL_1: + case NAU8822_REG_DAC_CONTROL ... NAU8822_REG_LEFT_ADC_DIGITAL_VOLUME: + case NAU8822_REG_RIGHT_ADC_DIGITAL_VOLUME: + case NAU8822_REG_EQ1 ... NAU8822_REG_EQ5: + case NAU8822_REG_DAC_LIMITER_1 ... NAU8822_REG_DAC_LIMITER_2: + case NAU8822_REG_NOTCH_FILTER_1 ... NAU8822_REG_NOTCH_FILTER_4: + case NAU8822_REG_ALC_CONTROL_1 ...NAU8822_REG_PLL_K3: + case NAU8822_REG_3D_CONTROL: + case NAU8822_REG_RIGHT_SPEAKER_CONTROL: + case NAU8822_REG_INPUT_CONTROL ... NAU8822_REG_LEFT_ADC_BOOST_CONTROL: + case NAU8822_REG_RIGHT_ADC_BOOST_CONTROL ... NAU8822_REG_AUX1_MIXER: + case NAU8822_REG_POWER_MANAGEMENT_4 ... NAU8822_REG_DEVICE_ID: + case NAU8822_REG_DAC_DITHER: + case NAU8822_REG_ALC_ENHANCE_1 ... NAU8822_REG_MISC_CONTROL: + case NAU8822_REG_INPUT_TIEOFF ... NAU8822_REG_OUTPUT_TIEOFF: + return true; + default: + return false; + } +} + +static bool nau8822_volatile(struct device *dev, unsigned int reg) +{ + switch (reg) { + case NAU8822_REG_RESET: + case NAU8822_REG_DEVICE_REVISION: + case NAU8822_REG_DEVICE_ID: + case NAU8822_REG_AGC_PEAK2PEAK: + case NAU8822_REG_AGC_PEAK_DETECT: + case NAU8822_REG_AUTOMUTE_CONTROL: + return true; + default: + return false; + } +} + +/* The EQ parameters get function is to get the 5 band equalizer control. + * The regmap raw read can't work here because regmap doesn't provide + * value format for value width of 9 bits. Therefore, the driver reads data + * from cache and makes value format according to the endianness of + * bytes type control element. + */ +static int nau8822_eq_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *component = + snd_soc_kcontrol_component(kcontrol); + struct soc_bytes_ext *params = (void *)kcontrol->private_value; + int i, reg; + u16 reg_val, *val; + + val = (u16 *)ucontrol->value.bytes.data; + reg = NAU8822_REG_EQ1; + for (i = 0; i < params->max / sizeof(u16); i++) { + reg_val = snd_soc_component_read32(component, reg + i); + /* conversion of 16-bit integers between native CPU format + * and big endian format + */ + reg_val = cpu_to_be16(reg_val); + memcpy(val + i, ®_val, sizeof(reg_val)); + } + + return 0; +} + +/* The EQ parameters put function is to make configuration of 5 band equalizer + * control. These configuration includes central frequency, equalizer gain, + * cut-off frequency, bandwidth control, and equalizer path. + * The regmap raw write can't work here because regmap doesn't provide + * register and value format for register with address 7 bits and value 9 bits. + * Therefore, the driver makes value format according to the endianness of + * bytes type control element and writes data to codec. + */ +static int nau8822_eq_put(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *component = + snd_soc_kcontrol_component(kcontrol); + struct soc_bytes_ext *params = (void *)kcontrol->private_value; + void *data; + u16 *val, value; + int i, reg, ret; + + data = kmemdup(ucontrol->value.bytes.data, + params->max, GFP_KERNEL | GFP_DMA); + if (!data) + return -ENOMEM; + + val = (u16 *)data; + reg = NAU8822_REG_EQ1; + for (i = 0; i < params->max / sizeof(u16); i++) { + /* conversion of 16-bit integers between native CPU format + * and big endian format + */ + value = be16_to_cpu(*(val + i)); + ret = snd_soc_component_write(component, reg + i, value); + if (ret) { + dev_err(component->dev, + "EQ configuration fail, register: %x ret: %d\n", + reg + i, ret); + kfree(data); + return ret; + } + } + kfree(data); + + return 0; +} + +static const char * const nau8822_companding[] = { + "Off", "NC", "u-law", "A-law"}; + +static const struct soc_enum nau8822_companding_adc_enum = + SOC_ENUM_SINGLE(NAU8822_REG_COMPANDING_CONTROL, NAU8822_ADCCM_SFT, + ARRAY_SIZE(nau8822_companding), nau8822_companding); + +static const struct soc_enum nau8822_companding_dac_enum = + SOC_ENUM_SINGLE(NAU8822_REG_COMPANDING_CONTROL, NAU8822_DACCM_SFT, + ARRAY_SIZE(nau8822_companding), nau8822_companding); + +static const char * const nau8822_eqmode[] = {"Capture", "Playback"}; + +static const struct soc_enum nau8822_eqmode_enum = + SOC_ENUM_SINGLE(NAU8822_REG_EQ1, NAU8822_EQM_SFT, + ARRAY_SIZE(nau8822_eqmode), nau8822_eqmode); + +static const char * const nau8822_alc1[] = {"Off", "Right", "Left", "Both"}; +static const char * const nau8822_alc3[] = {"Normal", "Limiter"}; + +static const struct soc_enum nau8822_alc_enable_enum = + SOC_ENUM_SINGLE(NAU8822_REG_ALC_CONTROL_1, NAU8822_ALCEN_SFT, + ARRAY_SIZE(nau8822_alc1), nau8822_alc1); + +static const struct soc_enum nau8822_alc_mode_enum = + SOC_ENUM_SINGLE(NAU8822_REG_ALC_CONTROL_3, NAU8822_ALCM_SFT, + ARRAY_SIZE(nau8822_alc3), nau8822_alc3); + +static const DECLARE_TLV_DB_SCALE(digital_tlv, -12750, 50, 1); +static const DECLARE_TLV_DB_SCALE(inpga_tlv, -1200, 75, 0); +static const DECLARE_TLV_DB_SCALE(spk_tlv, -5700, 100, 0); +static const DECLARE_TLV_DB_SCALE(pga_boost_tlv, 0, 2000, 0); +static const DECLARE_TLV_DB_SCALE(boost_tlv, -1500, 300, 1); +static const DECLARE_TLV_DB_SCALE(limiter_tlv, 0, 100, 0); + +static const struct snd_kcontrol_new nau8822_snd_controls[] = { + SOC_ENUM("ADC Companding", nau8822_companding_adc_enum), + SOC_ENUM("DAC Companding", nau8822_companding_dac_enum), + + SOC_ENUM("EQ Function", nau8822_eqmode_enum), + SND_SOC_BYTES_EXT("EQ Parameters", 10, + nau8822_eq_get, nau8822_eq_put), + + SOC_DOUBLE("DAC Inversion Switch", + NAU8822_REG_DAC_CONTROL, 0, 1, 1, 0), + SOC_DOUBLE_R_TLV("PCM Volume", + NAU8822_REG_LEFT_DAC_DIGITAL_VOLUME, + NAU8822_REG_RIGHT_DAC_DIGITAL_VOLUME, 0, 255, 0, digital_tlv), + + SOC_SINGLE("High Pass Filter Switch", + NAU8822_REG_ADC_CONTROL, 8, 1, 0), + SOC_SINGLE("High Pass Cut Off", + NAU8822_REG_ADC_CONTROL, 4, 7, 0), + + SOC_DOUBLE("ADC Inversion Switch", + NAU8822_REG_ADC_CONTROL, 0, 1, 1, 0), + SOC_DOUBLE_R_TLV("ADC Volume", + NAU8822_REG_LEFT_ADC_DIGITAL_VOLUME, + NAU8822_REG_RIGHT_ADC_DIGITAL_VOLUME, 0, 255, 0, digital_tlv), + + SOC_SINGLE("DAC Limiter Switch", + NAU8822_REG_DAC_LIMITER_1, 8, 1, 0), + SOC_SINGLE("DAC Limiter Decay", + NAU8822_REG_DAC_LIMITER_1, 4, 15, 0), + SOC_SINGLE("DAC Limiter Attack", + NAU8822_REG_DAC_LIMITER_1, 0, 15, 0), + SOC_SINGLE("DAC Limiter Threshold", + NAU8822_REG_DAC_LIMITER_2, 4, 7, 0), + SOC_SINGLE_TLV("DAC Limiter Volume", + NAU8822_REG_DAC_LIMITER_2, 0, 12, 0, limiter_tlv), + + SOC_ENUM("ALC Mode", nau8822_alc_mode_enum), + SOC_ENUM("ALC Enable Switch", nau8822_alc_enable_enum), + SOC_SINGLE("ALC Min Gain", + NAU8822_REG_ALC_CONTROL_1, 0, 7, 0), + SOC_SINGLE("ALC Max Gain", + NAU8822_REG_ALC_CONTROL_1, 3, 7, 0), + SOC_SINGLE("ALC Hold", + NAU8822_REG_ALC_CONTROL_2, 4, 10, 0), + SOC_SINGLE("ALC Target", + NAU8822_REG_ALC_CONTROL_2, 0, 15, 0), + SOC_SINGLE("ALC Decay", + NAU8822_REG_ALC_CONTROL_3, 4, 10, 0), + SOC_SINGLE("ALC Attack", + NAU8822_REG_ALC_CONTROL_3, 0, 10, 0), + SOC_SINGLE("ALC Noise Gate Switch", + NAU8822_REG_NOISE_GATE, 3, 1, 0), + SOC_SINGLE("ALC Noise Gate Threshold", + NAU8822_REG_NOISE_GATE, 0, 7, 0), + + SOC_DOUBLE_R("PGA ZC Switch", + NAU8822_REG_LEFT_INP_PGA_CONTROL, + NAU8822_REG_RIGHT_INP_PGA_CONTROL, + 7, 1, 0), + SOC_DOUBLE_R_TLV("PGA Volume", + NAU8822_REG_LEFT_INP_PGA_CONTROL, + NAU8822_REG_RIGHT_INP_PGA_CONTROL, 0, 63, 0, inpga_tlv), + + SOC_DOUBLE_R("Headphone ZC Switch", + NAU8822_REG_LHP_VOLUME, + NAU8822_REG_RHP_VOLUME, 7, 1, 0), + SOC_DOUBLE_R("Headphone Playback Switch", + NAU8822_REG_LHP_VOLUME, + NAU8822_REG_RHP_VOLUME, 6, 1, 1), + SOC_DOUBLE_R_TLV("Headphone Volume", + NAU8822_REG_LHP_VOLUME, + NAU8822_REG_RHP_VOLUME, 0, 63, 0, spk_tlv), + + SOC_DOUBLE_R("Speaker ZC Switch", + NAU8822_REG_LSPKOUT_VOLUME, + NAU8822_REG_RSPKOUT_VOLUME, 7, 1, 0), + SOC_DOUBLE_R("Speaker Playback Switch", + NAU8822_REG_LSPKOUT_VOLUME, + NAU8822_REG_RSPKOUT_VOLUME, 6, 1, 1), + SOC_DOUBLE_R_TLV("Speaker Volume", + NAU8822_REG_LSPKOUT_VOLUME, + NAU8822_REG_RSPKOUT_VOLUME, 0, 63, 0, spk_tlv), + + SOC_DOUBLE_R("AUXOUT Playback Switch", + NAU8822_REG_AUX2_MIXER, + NAU8822_REG_AUX1_MIXER, 6, 1, 1), + + SOC_DOUBLE_R_TLV("PGA Boost Volume", + NAU8822_REG_LEFT_ADC_BOOST_CONTROL, + NAU8822_REG_RIGHT_ADC_BOOST_CONTROL, 8, 1, 0, pga_boost_tlv), + SOC_DOUBLE_R_TLV("L2/R2 Boost Volume", + NAU8822_REG_LEFT_ADC_BOOST_CONTROL, + NAU8822_REG_RIGHT_ADC_BOOST_CONTROL, 4, 7, 0, boost_tlv), + SOC_DOUBLE_R_TLV("Aux Boost Volume", + NAU8822_REG_LEFT_ADC_BOOST_CONTROL, + NAU8822_REG_RIGHT_ADC_BOOST_CONTROL, 0, 7, 0, boost_tlv), + + SOC_SINGLE("DAC 128x Oversampling Switch", + NAU8822_REG_DAC_CONTROL, 5, 1, 0), + SOC_SINGLE("ADC 128x Oversampling Switch", + NAU8822_REG_ADC_CONTROL, 5, 1, 0), +}; + +/* LMAIN and RMAIN Mixer */ +static const struct snd_kcontrol_new nau8822_left_out_mixer[] = { + SOC_DAPM_SINGLE("LINMIX Switch", + NAU8822_REG_LEFT_MIXER_CONTROL, 1, 1, 0), + SOC_DAPM_SINGLE("LAUX Switch", + NAU8822_REG_LEFT_MIXER_CONTROL, 5, 1, 0), + SOC_DAPM_SINGLE("LDAC Switch", + NAU8822_REG_LEFT_MIXER_CONTROL, 0, 1, 0), + SOC_DAPM_SINGLE("RDAC Switch", + NAU8822_REG_OUTPUT_CONTROL, 5, 1, 0), +}; + +static const struct snd_kcontrol_new nau8822_right_out_mixer[] = { + SOC_DAPM_SINGLE("RINMIX Switch", + NAU8822_REG_RIGHT_MIXER_CONTROL, 1, 1, 0), + SOC_DAPM_SINGLE("RAUX Switch", + NAU8822_REG_RIGHT_MIXER_CONTROL, 5, 1, 0), + SOC_DAPM_SINGLE("RDAC Switch", + NAU8822_REG_RIGHT_MIXER_CONTROL, 0, 1, 0), + SOC_DAPM_SINGLE("LDAC Switch", + NAU8822_REG_OUTPUT_CONTROL, 6, 1, 0), +}; + +/* AUX1 and AUX2 Mixer */ +static const struct snd_kcontrol_new nau8822_auxout1_mixer[] = { + SOC_DAPM_SINGLE("RDAC Switch", NAU8822_REG_AUX1_MIXER, 0, 1, 0), + SOC_DAPM_SINGLE("RMIX Switch", NAU8822_REG_AUX1_MIXER, 1, 1, 0), + SOC_DAPM_SINGLE("RINMIX Switch", NAU8822_REG_AUX1_MIXER, 2, 1, 0), + SOC_DAPM_SINGLE("LDAC Switch", NAU8822_REG_AUX1_MIXER, 3, 1, 0), + SOC_DAPM_SINGLE("LMIX Switch", NAU8822_REG_AUX1_MIXER, 4, 1, 0), +}; + +static const struct snd_kcontrol_new nau8822_auxout2_mixer[] = { + SOC_DAPM_SINGLE("LDAC Switch", NAU8822_REG_AUX2_MIXER, 0, 1, 0), + SOC_DAPM_SINGLE("LMIX Switch", NAU8822_REG_AUX2_MIXER, 1, 1, 0), + SOC_DAPM_SINGLE("LINMIX Switch", NAU8822_REG_AUX2_MIXER, 2, 1, 0), + SOC_DAPM_SINGLE("AUX1MIX Output Switch", + NAU8822_REG_AUX2_MIXER, 3, 1, 0), +}; + +/* Input PGA */ +static const struct snd_kcontrol_new nau8822_left_input_mixer[] = { + SOC_DAPM_SINGLE("L2 Switch", NAU8822_REG_INPUT_CONTROL, 2, 1, 0), + SOC_DAPM_SINGLE("MicN Switch", NAU8822_REG_INPUT_CONTROL, 1, 1, 0), + SOC_DAPM_SINGLE("MicP Switch", NAU8822_REG_INPUT_CONTROL, 0, 1, 0), +}; +static const struct snd_kcontrol_new nau8822_right_input_mixer[] = { + SOC_DAPM_SINGLE("R2 Switch", NAU8822_REG_INPUT_CONTROL, 6, 1, 0), + SOC_DAPM_SINGLE("MicN Switch", NAU8822_REG_INPUT_CONTROL, 5, 1, 0), + SOC_DAPM_SINGLE("MicP Switch", NAU8822_REG_INPUT_CONTROL, 4, 1, 0), +}; + +/* Loopback Switch */ +static const struct snd_kcontrol_new nau8822_loopback = + SOC_DAPM_SINGLE("Switch", NAU8822_REG_COMPANDING_CONTROL, + NAU8822_ADDAP_SFT, 1, 0); + +static int check_mclk_select_pll(struct snd_soc_dapm_widget *source, + struct snd_soc_dapm_widget *sink) +{ + struct snd_soc_component *component = + snd_soc_dapm_to_component(source->dapm); + unsigned int value; + + value = snd_soc_component_read32(component, NAU8822_REG_CLOCKING); + + return (value & NAU8822_CLKM_MASK); +} + +static const struct snd_soc_dapm_widget nau8822_dapm_widgets[] = { + SND_SOC_DAPM_DAC("Left DAC", "Left HiFi Playback", + NAU8822_REG_POWER_MANAGEMENT_3, 0, 0), + SND_SOC_DAPM_DAC("Right DAC", "Right HiFi Playback", + NAU8822_REG_POWER_MANAGEMENT_3, 1, 0), + SND_SOC_DAPM_ADC("Left ADC", "Left HiFi Capture", + NAU8822_REG_POWER_MANAGEMENT_2, 0, 0), + SND_SOC_DAPM_ADC("Right ADC", "Right HiFi Capture", + NAU8822_REG_POWER_MANAGEMENT_2, 1, 0), + + SOC_MIXER_ARRAY("Left Output Mixer", + NAU8822_REG_POWER_MANAGEMENT_3, 2, 0, nau8822_left_out_mixer), + SOC_MIXER_ARRAY("Right Output Mixer", + NAU8822_REG_POWER_MANAGEMENT_3, 3, 0, nau8822_right_out_mixer), + SOC_MIXER_ARRAY("AUX1 Output Mixer", + NAU8822_REG_POWER_MANAGEMENT_1, 7, 0, nau8822_auxout1_mixer), + SOC_MIXER_ARRAY("AUX2 Output Mixer", + NAU8822_REG_POWER_MANAGEMENT_1, 6, 0, nau8822_auxout2_mixer), + + SOC_MIXER_ARRAY("Left Input Mixer", + NAU8822_REG_POWER_MANAGEMENT_2, + 2, 0, nau8822_left_input_mixer), + SOC_MIXER_ARRAY("Right Input Mixer", + NAU8822_REG_POWER_MANAGEMENT_2, + 3, 0, nau8822_right_input_mixer), + + SND_SOC_DAPM_PGA("Left Boost Mixer", + NAU8822_REG_POWER_MANAGEMENT_2, 4, 0, NULL, 0), + SND_SOC_DAPM_PGA("Right Boost Mixer", + NAU8822_REG_POWER_MANAGEMENT_2, 5, 0, NULL, 0), + + SND_SOC_DAPM_PGA("Left Capture PGA", + NAU8822_REG_LEFT_INP_PGA_CONTROL, 6, 1, NULL, 0), + SND_SOC_DAPM_PGA("Right Capture PGA", + NAU8822_REG_RIGHT_INP_PGA_CONTROL, 6, 1, NULL, 0), + + SND_SOC_DAPM_PGA("Left Headphone Out", + NAU8822_REG_POWER_MANAGEMENT_2, 7, 0, NULL, 0), + SND_SOC_DAPM_PGA("Right Headphone Out", + NAU8822_REG_POWER_MANAGEMENT_2, 8, 0, NULL, 0), + + SND_SOC_DAPM_PGA("Left Speaker Out", + NAU8822_REG_POWER_MANAGEMENT_3, 6, 0, NULL, 0), + SND_SOC_DAPM_PGA("Right Speaker Out", + NAU8822_REG_POWER_MANAGEMENT_3, 5, 0, NULL, 0), + + SND_SOC_DAPM_PGA("AUX1 Out", + NAU8822_REG_POWER_MANAGEMENT_3, 8, 0, NULL, 0), + SND_SOC_DAPM_PGA("AUX2 Out", + NAU8822_REG_POWER_MANAGEMENT_3, 7, 0, NULL, 0), + + SND_SOC_DAPM_SUPPLY("Mic Bias", + NAU8822_REG_POWER_MANAGEMENT_1, 4, 0, NULL, 0), + SND_SOC_DAPM_SUPPLY("PLL", + NAU8822_REG_POWER_MANAGEMENT_1, 5, 0, NULL, 0), + + SND_SOC_DAPM_SWITCH("Digital Loopback", SND_SOC_NOPM, 0, 0, + &nau8822_loopback), + + SND_SOC_DAPM_INPUT("LMICN"), + SND_SOC_DAPM_INPUT("LMICP"), + SND_SOC_DAPM_INPUT("RMICN"), + SND_SOC_DAPM_INPUT("RMICP"), + SND_SOC_DAPM_INPUT("LAUX"), + SND_SOC_DAPM_INPUT("RAUX"), + SND_SOC_DAPM_INPUT("L2"), + SND_SOC_DAPM_INPUT("R2"), + SND_SOC_DAPM_OUTPUT("LHP"), + SND_SOC_DAPM_OUTPUT("RHP"), + SND_SOC_DAPM_OUTPUT("LSPK"), + SND_SOC_DAPM_OUTPUT("RSPK"), + SND_SOC_DAPM_OUTPUT("AUXOUT1"), + SND_SOC_DAPM_OUTPUT("AUXOUT2"), +}; + +static const struct snd_soc_dapm_route nau8822_dapm_routes[] = { + {"Right DAC", NULL, "PLL", check_mclk_select_pll}, + {"Left DAC", NULL, "PLL", check_mclk_select_pll}, + + /* LMAIN and RMAIN Mixer */ + {"Right Output Mixer", "LDAC Switch", "Left DAC"}, + {"Right Output Mixer", "RDAC Switch", "Right DAC"}, + {"Right Output Mixer", "RAUX Switch", "RAUX"}, + {"Right Output Mixer", "RINMIX Switch", "Right Boost Mixer"}, + + {"Left Output Mixer", "LDAC Switch", "Left DAC"}, + {"Left Output Mixer", "RDAC Switch", "Right DAC"}, + {"Left Output Mixer", "LAUX Switch", "LAUX"}, + {"Left Output Mixer", "LINMIX Switch", "Left Boost Mixer"}, + + /* AUX1 and AUX2 Mixer */ + {"AUX1 Output Mixer", "RDAC Switch", "Right DAC"}, + {"AUX1 Output Mixer", "RMIX Switch", "Right Output Mixer"}, + {"AUX1 Output Mixer", "RINMIX Switch", "Right Boost Mixer"}, + {"AUX1 Output Mixer", "LDAC Switch", "Left DAC"}, + {"AUX1 Output Mixer", "LMIX Switch", "Left Output Mixer"}, + + {"AUX2 Output Mixer", "LDAC Switch", "Left DAC"}, + {"AUX2 Output Mixer", "LMIX Switch", "Left Output Mixer"}, + {"AUX2 Output Mixer", "LINMIX Switch", "Left Boost Mixer"}, + {"AUX2 Output Mixer", "AUX1MIX Output Switch", "AUX1 Output Mixer"}, + + /* Outputs */ + {"Right Headphone Out", NULL, "Right Output Mixer"}, + {"RHP", NULL, "Right Headphone Out"}, + + {"Left Headphone Out", NULL, "Left Output Mixer"}, + {"LHP", NULL, "Left Headphone Out"}, + + {"Right Speaker Out", NULL, "Right Output Mixer"}, + {"RSPK", NULL, "Right Speaker Out"}, + + {"Left Speaker Out", NULL, "Left Output Mixer"}, + {"LSPK", NULL, "Left Speaker Out"}, + + {"AUX1 Out", NULL, "AUX1 Output Mixer"}, + {"AUX2 Out", NULL, "AUX2 Output Mixer"}, + {"AUXOUT1", NULL, "AUX1 Out"}, + {"AUXOUT2", NULL, "AUX2 Out"}, + + /* Boost Mixer */ + {"Right ADC", NULL, "PLL", check_mclk_select_pll}, + {"Left ADC", NULL, "PLL", check_mclk_select_pll}, + + {"Right ADC", NULL, "Right Boost Mixer"}, + + {"Right Boost Mixer", NULL, "RAUX"}, + {"Right Boost Mixer", NULL, "Right Capture PGA"}, + {"Right Boost Mixer", NULL, "R2"}, + + {"Left ADC", NULL, "Left Boost Mixer"}, + + {"Left Boost Mixer", NULL, "LAUX"}, + {"Left Boost Mixer", NULL, "Left Capture PGA"}, + {"Left Boost Mixer", NULL, "L2"}, + + /* Input PGA */ + {"Right Capture PGA", NULL, "Right Input Mixer"}, + {"Left Capture PGA", NULL, "Left Input Mixer"}, + + /* Enable Microphone Power */ + {"Right Capture PGA", NULL, "Mic Bias"}, + {"Left Capture PGA", NULL, "Mic Bias"}, + + {"Right Input Mixer", "R2 Switch", "R2"}, + {"Right Input Mixer", "MicN Switch", "RMICN"}, + {"Right Input Mixer", "MicP Switch", "RMICP"}, + + {"Left Input Mixer", "L2 Switch", "L2"}, + {"Left Input Mixer", "MicN Switch", "LMICN"}, + {"Left Input Mixer", "MicP Switch", "LMICP"}, + + /* Digital Loopback */ + {"Digital Loopback", "Switch", "Left ADC"}, + {"Digital Loopback", "Switch", "Right ADC"}, + {"Left DAC", NULL, "Digital Loopback"}, + {"Right DAC", NULL, "Digital Loopback"}, +}; + +static int nau8822_set_dai_sysclk(struct snd_soc_dai *dai, int clk_id, + unsigned int freq, int dir) +{ + struct snd_soc_component *component = dai->component; + struct nau8822 *nau8822 = snd_soc_component_get_drvdata(component); + + nau8822->div_id = clk_id; + nau8822->sysclk = freq; + dev_dbg(component->dev, "master sysclk %dHz, source %s\n", freq, + clk_id == NAU8822_CLK_PLL ? "PLL" : "MCLK"); + + return 0; +} + +static int nau8822_calc_pll(unsigned int pll_in, unsigned int fs, + struct nau8822_pll *pll_param) +{ + u64 f2, f2_max, pll_ratio; + int i, scal_sel; + + if (pll_in > NAU_PLL_REF_MAX || pll_in < NAU_PLL_REF_MIN) + return -EINVAL; + f2_max = 0; + scal_sel = ARRAY_SIZE(nau8822_mclk_scaler); + + for (i = 0; i < scal_sel; i++) { + f2 = 256 * fs * 4 * nau8822_mclk_scaler[i] / 10; + if (f2 > NAU_PLL_FREQ_MIN && f2 < NAU_PLL_FREQ_MAX && + f2_max < f2) { + f2_max = f2; + scal_sel = i; + } + } + + if (ARRAY_SIZE(nau8822_mclk_scaler) == scal_sel) + return -EINVAL; + pll_param->mclk_scaler = scal_sel; + f2 = f2_max; + + /* Calculate the PLL 4-bit integer input and the PLL 24-bit fractional + * input; round up the 24+4bit. + */ + pll_ratio = div_u64(f2 << 28, pll_in); + pll_param->pre_factor = 0; + if (((pll_ratio >> 28) & 0xF) < NAU_PLL_OPTOP_MIN) { + pll_ratio <<= 1; + pll_param->pre_factor = 1; + } + pll_param->pll_int = (pll_ratio >> 28) & 0xF; + pll_param->pll_frac = ((pll_ratio & 0xFFFFFFF) >> 4); + + return 0; +} + +static int nau8822_config_clkdiv(struct snd_soc_dai *dai, int div, int rate) +{ + struct snd_soc_component *component = dai->component; + struct nau8822 *nau8822 = snd_soc_component_get_drvdata(component); + struct nau8822_pll *pll = &nau8822->pll; + int i, sclk, imclk; + + switch (nau8822->div_id) { + case NAU8822_CLK_MCLK: + /* Configure the master clock prescaler div to make system + * clock to approximate the internal master clock (IMCLK); + * and large or equal to IMCLK. + */ + div = 0; + imclk = rate * 256; + for (i = 1; i < ARRAY_SIZE(nau8822_mclk_scaler); i++) { + sclk = (nau8822->sysclk * 10) / nau8822_mclk_scaler[i]; + if (sclk < imclk) + break; + div = i; + } + dev_dbg(component->dev, "master clock prescaler %x for fs %d\n", + div, rate); + + /* master clock from MCLK and disable PLL */ + snd_soc_component_update_bits(component, + NAU8822_REG_CLOCKING, NAU8822_MCLKSEL_MASK, + (div << NAU8822_MCLKSEL_SFT)); + snd_soc_component_update_bits(component, + NAU8822_REG_CLOCKING, NAU8822_CLKM_MASK, + NAU8822_CLKM_MCLK); + break; + + case NAU8822_CLK_PLL: + /* master clock from PLL and enable PLL */ + if (pll->mclk_scaler != div) { + dev_err(component->dev, + "master clock prescaler not meet PLL parameters\n"); + return -EINVAL; + } + snd_soc_component_update_bits(component, + NAU8822_REG_CLOCKING, NAU8822_MCLKSEL_MASK, + (div << NAU8822_MCLKSEL_SFT)); + snd_soc_component_update_bits(component, + NAU8822_REG_CLOCKING, NAU8822_CLKM_MASK, + NAU8822_CLKM_PLL); + break; + + default: + return -EINVAL; + } + + return 0; +} + +static int nau8822_set_pll(struct snd_soc_dai *dai, int pll_id, int source, + unsigned int freq_in, unsigned int freq_out) +{ + struct snd_soc_component *component = dai->component; + struct nau8822 *nau8822 = snd_soc_component_get_drvdata(component); + struct nau8822_pll *pll_param = &nau8822->pll; + int ret, fs; + + fs = freq_out / 256; + + ret = nau8822_calc_pll(freq_in, fs, pll_param); + if (ret < 0) { + dev_err(component->dev, "Unsupported input clock %d\n", + freq_in); + return ret; + } + + dev_info(component->dev, + "pll_int=%x pll_frac=%x mclk_scaler=%x pre_factor=%x\n", + pll_param->pll_int, pll_param->pll_frac, + pll_param->mclk_scaler, pll_param->pre_factor); + + snd_soc_component_update_bits(component, + NAU8822_REG_PLL_N, NAU8822_PLLMCLK_DIV2 | NAU8822_PLLN_MASK, + (pll_param->pre_factor ? NAU8822_PLLMCLK_DIV2 : 0) | + pll_param->pll_int); + snd_soc_component_write(component, + NAU8822_REG_PLL_K1, (pll_param->pll_frac >> NAU8822_PLLK1_SFT) & + NAU8822_PLLK1_MASK); + snd_soc_component_write(component, + NAU8822_REG_PLL_K2, (pll_param->pll_frac >> NAU8822_PLLK2_SFT) & + NAU8822_PLLK2_MASK); + snd_soc_component_write(component, + NAU8822_REG_PLL_K3, pll_param->pll_frac & NAU8822_PLLK3_MASK); + snd_soc_component_update_bits(component, + NAU8822_REG_CLOCKING, NAU8822_MCLKSEL_MASK, + pll_param->mclk_scaler << NAU8822_MCLKSEL_SFT); + snd_soc_component_update_bits(component, + NAU8822_REG_CLOCKING, NAU8822_CLKM_MASK, NAU8822_CLKM_PLL); + + return 0; +} + +static int nau8822_set_dai_fmt(struct snd_soc_dai *dai, unsigned int fmt) +{ + struct snd_soc_component *component = dai->component; + u16 ctrl1_val = 0, ctrl2_val = 0; + + dev_dbg(component->dev, "%s\n", __func__); + + switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) { + case SND_SOC_DAIFMT_CBM_CFM: + ctrl2_val |= 1; + break; + case SND_SOC_DAIFMT_CBS_CFS: + ctrl2_val &= ~1; + break; + default: + return -EINVAL; + } + + switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) { + case SND_SOC_DAIFMT_I2S: + ctrl1_val |= 0x10; + break; + case SND_SOC_DAIFMT_RIGHT_J: + break; + case SND_SOC_DAIFMT_LEFT_J: + ctrl1_val |= 0x8; + break; + case SND_SOC_DAIFMT_DSP_A: + ctrl1_val |= 0x18; + break; + default: + return -EINVAL; + } + + switch (fmt & SND_SOC_DAIFMT_INV_MASK) { + case SND_SOC_DAIFMT_NB_NF: + break; + case SND_SOC_DAIFMT_IB_IF: + ctrl1_val |= 0x180; + break; + case SND_SOC_DAIFMT_IB_NF: + ctrl1_val |= 0x100; + break; + case SND_SOC_DAIFMT_NB_IF: + ctrl1_val |= 0x80; + break; + default: + return -EINVAL; + } + + snd_soc_component_update_bits(component, + NAU8822_REG_AUDIO_INTERFACE, + NAU8822_AIFMT_MASK | NAU8822_LRP_MASK | NAU8822_BCLKP_MASK, + ctrl1_val); + snd_soc_component_update_bits(component, + NAU8822_REG_CLOCKING, NAU8822_CLKIOEN_MASK, ctrl2_val); + + return 0; +} + +static int nau8822_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, + struct snd_soc_dai *dai) +{ + struct snd_soc_component *component = dai->component; + struct nau8822 *nau8822 = snd_soc_component_get_drvdata(component); + int val_len = 0, val_rate = 0; + + switch (params_format(params)) { + case SNDRV_PCM_FORMAT_S16_LE: + break; + case SNDRV_PCM_FORMAT_S20_3LE: + val_len |= NAU8822_WLEN_20; + break; + case SNDRV_PCM_FORMAT_S24_LE: + val_len |= NAU8822_WLEN_24; + break; + case SNDRV_PCM_FORMAT_S32_LE: + val_len |= NAU8822_WLEN_32; + break; + default: + return -EINVAL; + } + + switch (params_rate(params)) { + case 8000: + val_rate |= NAU8822_SMPLR_8K; + break; + case 11025: + val_rate |= NAU8822_SMPLR_12K; + break; + case 16000: + val_rate |= NAU8822_SMPLR_16K; + break; + case 22050: + val_rate |= NAU8822_SMPLR_24K; + break; + case 32000: + val_rate |= NAU8822_SMPLR_32K; + break; + case 44100: + case 48000: + break; + default: + return -EINVAL; + } + + snd_soc_component_update_bits(component, + NAU8822_REG_AUDIO_INTERFACE, NAU8822_WLEN_MASK, val_len); + snd_soc_component_update_bits(component, + NAU8822_REG_ADDITIONAL_CONTROL, NAU8822_SMPLR_MASK, val_rate); + + /* If the master clock is from MCLK, provide the runtime FS for driver + * to get the master clock prescaler configuration. + */ + if (nau8822->div_id == NAU8822_CLK_MCLK) + nau8822_config_clkdiv(dai, 0, params_rate(params)); + + return 0; +} + +static int nau8822_mute(struct snd_soc_dai *dai, int mute) +{ + struct snd_soc_component *component = dai->component; + + dev_dbg(component->dev, "%s: %d\n", __func__, mute); + + if (mute) + snd_soc_component_update_bits(component, + NAU8822_REG_DAC_CONTROL, 0x40, 0x40); + else + snd_soc_component_update_bits(component, + NAU8822_REG_DAC_CONTROL, 0x40, 0); + + return 0; +} + +static int nau8822_set_bias_level(struct snd_soc_component *component, + enum snd_soc_bias_level level) +{ + switch (level) { + case SND_SOC_BIAS_ON: + case SND_SOC_BIAS_PREPARE: + snd_soc_component_update_bits(component, + NAU8822_REG_POWER_MANAGEMENT_1, + NAU8822_REFIMP_MASK, NAU8822_REFIMP_80K); + break; + + case SND_SOC_BIAS_STANDBY: + snd_soc_component_update_bits(component, + NAU8822_REG_POWER_MANAGEMENT_1, + NAU8822_IOBUF_EN | NAU8822_ABIAS_EN, + NAU8822_IOBUF_EN | NAU8822_ABIAS_EN); + + if (snd_soc_component_get_bias_level(component) == + SND_SOC_BIAS_OFF) { + snd_soc_component_update_bits(component, + NAU8822_REG_POWER_MANAGEMENT_1, + NAU8822_REFIMP_MASK, NAU8822_REFIMP_3K); + mdelay(100); + } + snd_soc_component_update_bits(component, + NAU8822_REG_POWER_MANAGEMENT_1, + NAU8822_REFIMP_MASK, NAU8822_REFIMP_300K); + break; + + case SND_SOC_BIAS_OFF: + snd_soc_component_write(component, + NAU8822_REG_POWER_MANAGEMENT_1, 0); + snd_soc_component_write(component, + NAU8822_REG_POWER_MANAGEMENT_2, 0); + snd_soc_component_write(component, + NAU8822_REG_POWER_MANAGEMENT_3, 0); + break; + } + + dev_dbg(component->dev, "%s: %d\n", __func__, level); + + return 0; +} + +#define NAU8822_RATES (SNDRV_PCM_RATE_8000_48000) + +#define NAU8822_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S20_3LE | \ + SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_S32_LE) + +static const struct snd_soc_dai_ops nau8822_dai_ops = { + .hw_params = nau8822_hw_params, + .digital_mute = nau8822_mute, + .set_fmt = nau8822_set_dai_fmt, + .set_sysclk = nau8822_set_dai_sysclk, + .set_pll = nau8822_set_pll, +}; + +static struct snd_soc_dai_driver nau8822_dai = { + .name = "nau8822-hifi", + .playback = { + .stream_name = "Playback", + .channels_min = 1, + .channels_max = 2, + .rates = NAU8822_RATES, + .formats = NAU8822_FORMATS, + }, + .capture = { + .stream_name = "Capture", + .channels_min = 1, + .channels_max = 2, + .rates = NAU8822_RATES, + .formats = NAU8822_FORMATS, + }, + .ops = &nau8822_dai_ops, + .symmetric_rates = 1, +}; + +static int nau8822_suspend(struct snd_soc_component *component) +{ + struct nau8822 *nau8822 = snd_soc_component_get_drvdata(component); + + snd_soc_component_force_bias_level(component, SND_SOC_BIAS_OFF); + + regcache_mark_dirty(nau8822->regmap); + + return 0; +} + +static int nau8822_resume(struct snd_soc_component *component) +{ + struct nau8822 *nau8822 = snd_soc_component_get_drvdata(component); + + regcache_sync(nau8822->regmap); + + snd_soc_component_force_bias_level(component, SND_SOC_BIAS_STANDBY); + + return 0; +} + +/* + * These registers contain an "update" bit - bit 8. This means, for example, + * that one can write new DAC digital volume for both channels, but only when + * the update bit is set, will also the volume be updated - simultaneously for + * both channels. + */ +static const int update_reg[] = { + NAU8822_REG_LEFT_DAC_DIGITAL_VOLUME, + NAU8822_REG_RIGHT_DAC_DIGITAL_VOLUME, + NAU8822_REG_LEFT_ADC_DIGITAL_VOLUME, + NAU8822_REG_RIGHT_ADC_DIGITAL_VOLUME, + NAU8822_REG_LEFT_INP_PGA_CONTROL, + NAU8822_REG_RIGHT_INP_PGA_CONTROL, + NAU8822_REG_LHP_VOLUME, + NAU8822_REG_RHP_VOLUME, + NAU8822_REG_LSPKOUT_VOLUME, + NAU8822_REG_RSPKOUT_VOLUME, +}; + +static int nau8822_probe(struct snd_soc_component *component) +{ + int i; + + /* + * Set the update bit in all registers, that have one. This way all + * writes to those registers will also cause the update bit to be + * written. + */ + for (i = 0; i < ARRAY_SIZE(update_reg); i++) + snd_soc_component_update_bits(component, + update_reg[i], 0x100, 0x100); + + return 0; +} + +static const struct snd_soc_component_driver soc_component_dev_nau8822 = { + .probe = nau8822_probe, + .suspend = nau8822_suspend, + .resume = nau8822_resume, + .set_bias_level = nau8822_set_bias_level, + .controls = nau8822_snd_controls, + .num_controls = ARRAY_SIZE(nau8822_snd_controls), + .dapm_widgets = nau8822_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(nau8822_dapm_widgets), + .dapm_routes = nau8822_dapm_routes, + .num_dapm_routes = ARRAY_SIZE(nau8822_dapm_routes), + .idle_bias_on = 1, + .use_pmdown_time = 1, + .endianness = 1, + .non_legacy_dai_naming = 1, +}; + +static const struct regmap_config nau8822_regmap_config = { + .reg_bits = 7, + .val_bits = 9, + + .max_register = NAU8822_REG_MAX_REGISTER, + .volatile_reg = nau8822_volatile, + + .readable_reg = nau8822_readable_reg, + .writeable_reg = nau8822_writeable_reg, + + .cache_type = REGCACHE_RBTREE, + .reg_defaults = nau8822_reg_defaults, + .num_reg_defaults = ARRAY_SIZE(nau8822_reg_defaults), +}; + +static int nau8822_i2c_probe(struct i2c_client *i2c, + const struct i2c_device_id *id) +{ + struct device *dev = &i2c->dev; + struct nau8822 *nau8822 = dev_get_platdata(dev); + int ret; + + if (!nau8822) { + nau8822 = devm_kzalloc(dev, sizeof(*nau8822), GFP_KERNEL); + if (nau8822 == NULL) + return -ENOMEM; + } + i2c_set_clientdata(i2c, nau8822); + + nau8822->regmap = devm_regmap_init_i2c(i2c, &nau8822_regmap_config); + if (IS_ERR(nau8822->regmap)) { + ret = PTR_ERR(nau8822->regmap); + dev_err(&i2c->dev, "Failed to allocate regmap: %d\n", ret); + return ret; + } + nau8822->dev = dev; + + /* Reset the codec */ + ret = regmap_write(nau8822->regmap, NAU8822_REG_RESET, 0x00); + if (ret != 0) { + dev_err(&i2c->dev, "Failed to issue reset: %d\n", ret); + return ret; + } + + ret = devm_snd_soc_register_component(dev, &soc_component_dev_nau8822, + &nau8822_dai, 1); + if (ret != 0) { + dev_err(&i2c->dev, "Failed to register CODEC: %d\n", ret); + return ret; + } + + return 0; +} + +static const struct i2c_device_id nau8822_i2c_id[] = { + { "nau8822", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, nau8822_i2c_id); + +#ifdef CONFIG_OF +static const struct of_device_id nau8822_of_match[] = { + { .compatible = "nuvoton,nau8822", }, + { } +}; +MODULE_DEVICE_TABLE(of, nau8822_of_match); +#endif + +static struct i2c_driver nau8822_i2c_driver = { + .driver = { + .name = "nau8822", + .of_match_table = of_match_ptr(nau8822_of_match), + }, + .probe = nau8822_i2c_probe, + .id_table = nau8822_i2c_id, +}; +module_i2c_driver(nau8822_i2c_driver); + +MODULE_DESCRIPTION("ASoC NAU8822 codec driver"); +MODULE_AUTHOR("David Lin "); +MODULE_LICENSE("GPL v2"); diff --git a/sound/soc/codecs/nau8822.h b/sound/soc/codecs/nau8822.h new file mode 100644 index 000000000000..aa79c969cd44 --- /dev/null +++ b/sound/soc/codecs/nau8822.h @@ -0,0 +1,204 @@ +/* + * nau8822.h -- NAU8822 Soc Audio Codec driver + * + * Author: David Lin + * Co-author: John Hsu + * Co-author: Seven Li + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __NAU8822_H__ +#define __NAU8822_H__ + +#define NAU8822_REG_RESET 0x00 +#define NAU8822_REG_POWER_MANAGEMENT_1 0x01 +#define NAU8822_REG_POWER_MANAGEMENT_2 0x02 +#define NAU8822_REG_POWER_MANAGEMENT_3 0x03 +#define NAU8822_REG_AUDIO_INTERFACE 0x04 +#define NAU8822_REG_COMPANDING_CONTROL 0x05 +#define NAU8822_REG_CLOCKING 0x06 +#define NAU8822_REG_ADDITIONAL_CONTROL 0x07 +#define NAU8822_REG_GPIO_CONTROL 0x08 +#define NAU8822_REG_JACK_DETECT_CONTROL_1 0x09 +#define NAU8822_REG_DAC_CONTROL 0x0A +#define NAU8822_REG_LEFT_DAC_DIGITAL_VOLUME 0x0B +#define NAU8822_REG_RIGHT_DAC_DIGITAL_VOLUME 0x0C +#define NAU8822_REG_JACK_DETECT_CONTROL_2 0x0D +#define NAU8822_REG_ADC_CONTROL 0x0E +#define NAU8822_REG_LEFT_ADC_DIGITAL_VOLUME 0x0F +#define NAU8822_REG_RIGHT_ADC_DIGITAL_VOLUME 0x10 +#define NAU8822_REG_EQ1 0x12 +#define NAU8822_REG_EQ2 0x13 +#define NAU8822_REG_EQ3 0x14 +#define NAU8822_REG_EQ4 0x15 +#define NAU8822_REG_EQ5 0x16 +#define NAU8822_REG_DAC_LIMITER_1 0x18 +#define NAU8822_REG_DAC_LIMITER_2 0x19 +#define NAU8822_REG_NOTCH_FILTER_1 0x1B +#define NAU8822_REG_NOTCH_FILTER_2 0x1C +#define NAU8822_REG_NOTCH_FILTER_3 0x1D +#define NAU8822_REG_NOTCH_FILTER_4 0x1E +#define NAU8822_REG_ALC_CONTROL_1 0x20 +#define NAU8822_REG_ALC_CONTROL_2 0x21 +#define NAU8822_REG_ALC_CONTROL_3 0x22 +#define NAU8822_REG_NOISE_GATE 0x23 +#define NAU8822_REG_PLL_N 0x24 +#define NAU8822_REG_PLL_K1 0x25 +#define NAU8822_REG_PLL_K2 0x26 +#define NAU8822_REG_PLL_K3 0x27 +#define NAU8822_REG_3D_CONTROL 0x29 +#define NAU8822_REG_RIGHT_SPEAKER_CONTROL 0x2B +#define NAU8822_REG_INPUT_CONTROL 0x2C +#define NAU8822_REG_LEFT_INP_PGA_CONTROL 0x2D +#define NAU8822_REG_RIGHT_INP_PGA_CONTROL 0x2E +#define NAU8822_REG_LEFT_ADC_BOOST_CONTROL 0x2F +#define NAU8822_REG_RIGHT_ADC_BOOST_CONTROL 0x30 +#define NAU8822_REG_OUTPUT_CONTROL 0x31 +#define NAU8822_REG_LEFT_MIXER_CONTROL 0x32 +#define NAU8822_REG_RIGHT_MIXER_CONTROL 0x33 +#define NAU8822_REG_LHP_VOLUME 0x34 +#define NAU8822_REG_RHP_VOLUME 0x35 +#define NAU8822_REG_LSPKOUT_VOLUME 0x36 +#define NAU8822_REG_RSPKOUT_VOLUME 0x37 +#define NAU8822_REG_AUX2_MIXER 0x38 +#define NAU8822_REG_AUX1_MIXER 0x39 +#define NAU8822_REG_POWER_MANAGEMENT_4 0x3A +#define NAU8822_REG_LEFT_TIME_SLOT 0x3B +#define NAU8822_REG_MISC 0x3C +#define NAU8822_REG_RIGHT_TIME_SLOT 0x3D +#define NAU8822_REG_DEVICE_REVISION 0x3E +#define NAU8822_REG_DEVICE_ID 0x3F +#define NAU8822_REG_DAC_DITHER 0x41 +#define NAU8822_REG_ALC_ENHANCE_1 0x46 +#define NAU8822_REG_ALC_ENHANCE_2 0x47 +#define NAU8822_REG_192KHZ_SAMPLING 0x48 +#define NAU8822_REG_MISC_CONTROL 0x49 +#define NAU8822_REG_INPUT_TIEOFF 0x4A +#define NAU8822_REG_POWER_REDUCTION 0x4B +#define NAU8822_REG_AGC_PEAK2PEAK 0x4C +#define NAU8822_REG_AGC_PEAK_DETECT 0x4D +#define NAU8822_REG_AUTOMUTE_CONTROL 0x4E +#define NAU8822_REG_OUTPUT_TIEOFF 0x4F +#define NAU8822_REG_MAX_REGISTER NAU8822_REG_OUTPUT_TIEOFF + +/* NAU8822_REG_POWER_MANAGEMENT_1 (0x1) */ +#define NAU8822_REFIMP_MASK 0x3 +#define NAU8822_REFIMP_80K 0x1 +#define NAU8822_REFIMP_300K 0x2 +#define NAU8822_REFIMP_3K 0x3 +#define NAU8822_IOBUF_EN (0x1 << 2) +#define NAU8822_ABIAS_EN (0x1 << 3) + +/* NAU8822_REG_AUDIO_INTERFACE (0x4) */ +#define NAU8822_AIFMT_MASK (0x3 << 3) +#define NAU8822_WLEN_MASK (0x3 << 5) +#define NAU8822_WLEN_20 (0x1 << 5) +#define NAU8822_WLEN_24 (0x2 << 5) +#define NAU8822_WLEN_32 (0x3 << 5) +#define NAU8822_LRP_MASK (0x1 << 7) +#define NAU8822_BCLKP_MASK (0x1 << 8) + +/* NAU8822_REG_COMPANDING_CONTROL (0x5) */ +#define NAU8822_ADDAP_SFT 0 +#define NAU8822_ADCCM_SFT 1 +#define NAU8822_DACCM_SFT 3 + +/* NAU8822_REG_CLOCKING (0x6) */ +#define NAU8822_CLKIOEN_MASK 0x1 +#define NAU8822_MCLKSEL_SFT 5 +#define NAU8822_MCLKSEL_MASK (0x7 << 5) +#define NAU8822_BCLKSEL_SFT 2 +#define NAU8822_BCLKSEL_MASK (0x7 << 2) +#define NAU8822_CLKM_MASK (0x1 << 8) +#define NAU8822_CLKM_MCLK (0x0 << 8) +#define NAU8822_CLKM_PLL (0x1 << 8) + +/* NAU8822_REG_ADDITIONAL_CONTROL (0x08) */ +#define NAU8822_SMPLR_SFT 1 +#define NAU8822_SMPLR_MASK (0x7 << 1) +#define NAU8822_SMPLR_48K (0x0 << 1) +#define NAU8822_SMPLR_32K (0x1 << 1) +#define NAU8822_SMPLR_24K (0x2 << 1) +#define NAU8822_SMPLR_16K (0x3 << 1) +#define NAU8822_SMPLR_12K (0x4 << 1) +#define NAU8822_SMPLR_8K (0x5 << 1) + +/* NAU8822_REG_EQ1 (0x12) */ +#define NAU8822_EQ1GC_SFT 0 +#define NAU8822_EQ1CF_SFT 5 +#define NAU8822_EQM_SFT 8 + +/* NAU8822_REG_EQ2 (0x13) */ +#define NAU8822_EQ2GC_SFT 0 +#define NAU8822_EQ2CF_SFT 5 +#define NAU8822_EQ2BW_SFT 8 + +/* NAU8822_REG_EQ3 (0x14) */ +#define NAU8822_EQ3GC_SFT 0 +#define NAU8822_EQ3CF_SFT 5 +#define NAU8822_EQ3BW_SFT 8 + +/* NAU8822_REG_EQ4 (0x15) */ +#define NAU8822_EQ4GC_SFT 0 +#define NAU8822_EQ4CF_SFT 5 +#define NAU8822_EQ4BW_SFT 8 + +/* NAU8822_REG_EQ5 (0x16) */ +#define NAU8822_EQ5GC_SFT 0 +#define NAU8822_EQ5CF_SFT 5 + +/* NAU8822_REG_ALC_CONTROL_1 (0x20) */ +#define NAU8822_ALCMINGAIN_SFT 0 +#define NAU8822_ALCMXGAIN_SFT 3 +#define NAU8822_ALCEN_SFT 7 + +/* NAU8822_REG_ALC_CONTROL_2 (0x21) */ +#define NAU8822_ALCSL_SFT 0 +#define NAU8822_ALCHT_SFT 4 + +/* NAU8822_REG_ALC_CONTROL_3 (0x22) */ +#define NAU8822_ALCATK_SFT 0 +#define NAU8822_ALCDCY_SFT 4 +#define NAU8822_ALCM_SFT 8 + +/* NAU8822_REG_PLL_N (0x24) */ +#define NAU8822_PLLMCLK_DIV2 (0x1 << 4) +#define NAU8822_PLLN_MASK 0xF + +#define NAU8822_PLLK1_SFT 18 +#define NAU8822_PLLK1_MASK 0x3F + +/* NAU8822_REG_PLL_K2 (0x26) */ +#define NAU8822_PLLK2_SFT 9 +#define NAU8822_PLLK2_MASK 0x1FF + +/* NAU8822_REG_PLL_K3 (0x27) */ +#define NAU8822_PLLK3_MASK 0x1FF + +/* System Clock Source */ +enum { + NAU8822_CLK_MCLK, + NAU8822_CLK_PLL, +}; + +struct nau8822_pll { + int pre_factor; + int mclk_scaler; + int pll_frac; + int pll_int; +}; + +/* Codec Private Data */ +struct nau8822 { + struct device *dev; + struct regmap *regmap; + int mclk_idx; + struct nau8822_pll pll; + int sysclk; + int div_id; +}; + +#endif /* __NAU8822_H__ */ diff --git a/sound/soc/codecs/pcm186x.c b/sound/soc/codecs/pcm186x.c index 690c26e7389e..809b7e9f03ca 100644 --- a/sound/soc/codecs/pcm186x.c +++ b/sound/soc/codecs/pcm186x.c @@ -401,7 +401,8 @@ static int pcm186x_set_fmt(struct snd_soc_dai *dai, unsigned int format) break; case SND_SOC_DAIFMT_DSP_A: priv->tdm_offset += 1; - /* Fall through... DSP_A uses the same basic config as DSP_B + /* fall through */ + /* DSP_A uses the same basic config as DSP_B * except we need to shift the TDM output by one BCK cycle */ case SND_SOC_DAIFMT_DSP_B: diff --git a/sound/soc/codecs/pcm3060-i2c.c b/sound/soc/codecs/pcm3060-i2c.c new file mode 100644 index 000000000000..cdc8314882bc --- /dev/null +++ b/sound/soc/codecs/pcm3060-i2c.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// PCM3060 I2C driver +// +// Copyright (C) 2018 Kirill Marinushkin + +#include +#include +#include + +#include "pcm3060.h" + +static int pcm3060_i2c_probe(struct i2c_client *i2c, + const struct i2c_device_id *id) +{ + struct pcm3060_priv *priv; + + priv = devm_kzalloc(&i2c->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + i2c_set_clientdata(i2c, priv); + + priv->regmap = devm_regmap_init_i2c(i2c, &pcm3060_regmap); + if (IS_ERR(priv->regmap)) + return PTR_ERR(priv->regmap); + + return pcm3060_probe(&i2c->dev); +} + +static const struct i2c_device_id pcm3060_i2c_id[] = { + { .name = "pcm3060" }, + { }, +}; +MODULE_DEVICE_TABLE(i2c, pcm3060_i2c_id); + +#ifdef CONFIG_OF +static const struct of_device_id pcm3060_of_match[] = { + { .compatible = "ti,pcm3060" }, + { }, +}; +MODULE_DEVICE_TABLE(of, pcm3060_of_match); +#endif /* CONFIG_OF */ + +static struct i2c_driver pcm3060_i2c_driver = { + .driver = { + .name = "pcm3060", +#ifdef CONFIG_OF + .of_match_table = pcm3060_of_match, +#endif /* CONFIG_OF */ + }, + .id_table = pcm3060_i2c_id, + .probe = pcm3060_i2c_probe, +}; + +module_i2c_driver(pcm3060_i2c_driver); + +MODULE_DESCRIPTION("PCM3060 I2C driver"); +MODULE_AUTHOR("Kirill Marinushkin "); +MODULE_LICENSE("GPL v2"); diff --git a/sound/soc/codecs/pcm3060-spi.c b/sound/soc/codecs/pcm3060-spi.c new file mode 100644 index 000000000000..f6f19fa80932 --- /dev/null +++ b/sound/soc/codecs/pcm3060-spi.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// PCM3060 SPI driver +// +// Copyright (C) 2018 Kirill Marinushkin + +#include +#include +#include + +#include "pcm3060.h" + +static int pcm3060_spi_probe(struct spi_device *spi) +{ + struct pcm3060_priv *priv; + + priv = devm_kzalloc(&spi->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + spi_set_drvdata(spi, priv); + + priv->regmap = devm_regmap_init_spi(spi, &pcm3060_regmap); + if (IS_ERR(priv->regmap)) + return PTR_ERR(priv->regmap); + + return pcm3060_probe(&spi->dev); +} + +static const struct spi_device_id pcm3060_spi_id[] = { + { .name = "pcm3060" }, + { }, +}; +MODULE_DEVICE_TABLE(spi, pcm3060_spi_id); + +#ifdef CONFIG_OF +static const struct of_device_id pcm3060_of_match[] = { + { .compatible = "ti,pcm3060" }, + { }, +}; +MODULE_DEVICE_TABLE(of, pcm3060_of_match); +#endif /* CONFIG_OF */ + +static struct spi_driver pcm3060_spi_driver = { + .driver = { + .name = "pcm3060", +#ifdef CONFIG_OF + .of_match_table = pcm3060_of_match, +#endif /* CONFIG_OF */ + }, + .id_table = pcm3060_spi_id, + .probe = pcm3060_spi_probe, +}; + +module_spi_driver(pcm3060_spi_driver); + +MODULE_DESCRIPTION("PCM3060 SPI driver"); +MODULE_AUTHOR("Kirill Marinushkin "); +MODULE_LICENSE("GPL v2"); diff --git a/sound/soc/codecs/pcm3060.c b/sound/soc/codecs/pcm3060.c new file mode 100644 index 000000000000..494d9d662be8 --- /dev/null +++ b/sound/soc/codecs/pcm3060.c @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// PCM3060 codec driver +// +// Copyright (C) 2018 Kirill Marinushkin + +#include +#include +#include +#include + +#include "pcm3060.h" + +/* dai */ + +static int pcm3060_set_sysclk(struct snd_soc_dai *dai, int clk_id, + unsigned int freq, int dir) +{ + struct snd_soc_component *comp = dai->component; + struct pcm3060_priv *priv = snd_soc_component_get_drvdata(comp); + + if (dir != SND_SOC_CLOCK_IN) { + dev_err(comp->dev, "unsupported sysclock dir: %d\n", dir); + return -EINVAL; + } + + priv->dai[dai->id].sclk_freq = freq; + + return 0; +} + +static int pcm3060_set_fmt(struct snd_soc_dai *dai, unsigned int fmt) +{ + struct snd_soc_component *comp = dai->component; + struct pcm3060_priv *priv = snd_soc_component_get_drvdata(comp); + unsigned int reg; + unsigned int val; + + if ((fmt & SND_SOC_DAIFMT_INV_MASK) != SND_SOC_DAIFMT_NB_NF) { + dev_err(comp->dev, "unsupported DAI polarity: 0x%x\n", fmt); + return -EINVAL; + } + + switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) { + case SND_SOC_DAIFMT_CBM_CFM: + priv->dai[dai->id].is_master = true; + break; + case SND_SOC_DAIFMT_CBS_CFS: + priv->dai[dai->id].is_master = false; + break; + default: + dev_err(comp->dev, "unsupported DAI master mode: 0x%x\n", fmt); + return -EINVAL; + } + + switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) { + case SND_SOC_DAIFMT_I2S: + val = PCM3060_REG_FMT_I2S; + break; + case SND_SOC_DAIFMT_RIGHT_J: + val = PCM3060_REG_FMT_RJ; + break; + case SND_SOC_DAIFMT_LEFT_J: + val = PCM3060_REG_FMT_LJ; + break; + default: + dev_err(comp->dev, "unsupported DAI format: 0x%x\n", fmt); + return -EINVAL; + } + + if (dai->id == PCM3060_DAI_ID_DAC) + reg = PCM3060_REG67; + else + reg = PCM3060_REG72; + + regmap_update_bits(priv->regmap, reg, PCM3060_REG_MASK_FMT, val); + + return 0; +} + +static int pcm3060_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, + struct snd_soc_dai *dai) +{ + struct snd_soc_component *comp = dai->component; + struct pcm3060_priv *priv = snd_soc_component_get_drvdata(comp); + unsigned int rate; + unsigned int ratio; + unsigned int reg; + unsigned int val; + + if (!priv->dai[dai->id].is_master) { + val = PCM3060_REG_MS_S; + goto val_ready; + } + + rate = params_rate(params); + if (!rate) { + dev_err(comp->dev, "rate is not configured\n"); + return -EINVAL; + } + + ratio = priv->dai[dai->id].sclk_freq / rate; + + switch (ratio) { + case 768: + val = PCM3060_REG_MS_M768; + break; + case 512: + val = PCM3060_REG_MS_M512; + break; + case 384: + val = PCM3060_REG_MS_M384; + break; + case 256: + val = PCM3060_REG_MS_M256; + break; + case 192: + val = PCM3060_REG_MS_M192; + break; + case 128: + val = PCM3060_REG_MS_M128; + break; + default: + dev_err(comp->dev, "unsupported ratio: %d\n", ratio); + return -EINVAL; + } + +val_ready: + if (dai->id == PCM3060_DAI_ID_DAC) + reg = PCM3060_REG67; + else + reg = PCM3060_REG72; + + regmap_update_bits(priv->regmap, reg, PCM3060_REG_MASK_MS, val); + + return 0; +} + +static const struct snd_soc_dai_ops pcm3060_dai_ops = { + .set_sysclk = pcm3060_set_sysclk, + .set_fmt = pcm3060_set_fmt, + .hw_params = pcm3060_hw_params, +}; + +#define PCM3060_DAI_RATES_ADC (SNDRV_PCM_RATE_16000 | SNDRV_PCM_RATE_32000 | \ + SNDRV_PCM_RATE_44100 | SNDRV_PCM_RATE_48000 | \ + SNDRV_PCM_RATE_88200 | SNDRV_PCM_RATE_96000) + +#define PCM3060_DAI_RATES_DAC (PCM3060_DAI_RATES_ADC | \ + SNDRV_PCM_RATE_176400 | SNDRV_PCM_RATE_192000) + +static struct snd_soc_dai_driver pcm3060_dai[] = { + { + .name = "pcm3060-dac", + .id = PCM3060_DAI_ID_DAC, + .playback = { + .stream_name = "Playback", + .channels_min = 2, + .channels_max = 2, + .rates = PCM3060_DAI_RATES_DAC, + .formats = SNDRV_PCM_FMTBIT_S24_LE, + }, + .ops = &pcm3060_dai_ops, + }, + { + .name = "pcm3060-adc", + .id = PCM3060_DAI_ID_ADC, + .capture = { + .stream_name = "Capture", + .channels_min = 2, + .channels_max = 2, + .rates = PCM3060_DAI_RATES_ADC, + .formats = SNDRV_PCM_FMTBIT_S24_LE, + }, + .ops = &pcm3060_dai_ops, + }, +}; + +/* dapm */ + +static DECLARE_TLV_DB_SCALE(pcm3060_dapm_tlv, -10050, 50, 1); + +static const struct snd_kcontrol_new pcm3060_dapm_controls[] = { + SOC_DOUBLE_R_RANGE_TLV("Master Playback Volume", + PCM3060_REG65, PCM3060_REG66, 0, + PCM3060_REG_AT2_MIN, PCM3060_REG_AT2_MAX, + 0, pcm3060_dapm_tlv), + SOC_DOUBLE("Master Playback Switch", PCM3060_REG68, + PCM3060_REG_SHIFT_MUT21, PCM3060_REG_SHIFT_MUT22, 1, 1), + + SOC_DOUBLE_R_RANGE_TLV("Master Capture Volume", + PCM3060_REG70, PCM3060_REG71, 0, + PCM3060_REG_AT1_MIN, PCM3060_REG_AT1_MAX, + 0, pcm3060_dapm_tlv), + SOC_DOUBLE("Master Capture Switch", PCM3060_REG73, + PCM3060_REG_SHIFT_MUT11, PCM3060_REG_SHIFT_MUT12, 1, 1), +}; + +static const struct snd_soc_dapm_widget pcm3060_dapm_widgets[] = { + SND_SOC_DAPM_OUTPUT("OUTL+"), + SND_SOC_DAPM_OUTPUT("OUTR+"), + SND_SOC_DAPM_OUTPUT("OUTL-"), + SND_SOC_DAPM_OUTPUT("OUTR-"), + + SND_SOC_DAPM_INPUT("INL"), + SND_SOC_DAPM_INPUT("INR"), +}; + +static const struct snd_soc_dapm_route pcm3060_dapm_map[] = { + { "OUTL+", NULL, "Playback" }, + { "OUTR+", NULL, "Playback" }, + { "OUTL-", NULL, "Playback" }, + { "OUTR-", NULL, "Playback" }, + + { "Capture", NULL, "INL" }, + { "Capture", NULL, "INR" }, +}; + +/* soc component */ + +static const struct snd_soc_component_driver pcm3060_soc_comp_driver = { + .controls = pcm3060_dapm_controls, + .num_controls = ARRAY_SIZE(pcm3060_dapm_controls), + .dapm_widgets = pcm3060_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(pcm3060_dapm_widgets), + .dapm_routes = pcm3060_dapm_map, + .num_dapm_routes = ARRAY_SIZE(pcm3060_dapm_map), +}; + +/* regmap */ + +static bool pcm3060_reg_writeable(struct device *dev, unsigned int reg) +{ + return (reg >= PCM3060_REG64); +} + +static bool pcm3060_reg_readable(struct device *dev, unsigned int reg) +{ + return (reg >= PCM3060_REG64); +} + +static bool pcm3060_reg_volatile(struct device *dev, unsigned int reg) +{ + /* PCM3060_REG64 is volatile */ + return (reg == PCM3060_REG64); +} + +static const struct reg_default pcm3060_reg_defaults[] = { + { PCM3060_REG64, 0xF0 }, + { PCM3060_REG65, 0xFF }, + { PCM3060_REG66, 0xFF }, + { PCM3060_REG67, 0x00 }, + { PCM3060_REG68, 0x00 }, + { PCM3060_REG69, 0x00 }, + { PCM3060_REG70, 0xD7 }, + { PCM3060_REG71, 0xD7 }, + { PCM3060_REG72, 0x00 }, + { PCM3060_REG73, 0x00 }, +}; + +const struct regmap_config pcm3060_regmap = { + .reg_bits = 8, + .val_bits = 8, + .writeable_reg = pcm3060_reg_writeable, + .readable_reg = pcm3060_reg_readable, + .volatile_reg = pcm3060_reg_volatile, + .max_register = PCM3060_REG73, + .reg_defaults = pcm3060_reg_defaults, + .num_reg_defaults = ARRAY_SIZE(pcm3060_reg_defaults), + .cache_type = REGCACHE_RBTREE, +}; +EXPORT_SYMBOL(pcm3060_regmap); + +/* device */ + +int pcm3060_probe(struct device *dev) +{ + int rc; + + rc = devm_snd_soc_register_component(dev, &pcm3060_soc_comp_driver, + pcm3060_dai, + ARRAY_SIZE(pcm3060_dai)); + if (rc) { + dev_err(dev, "failed to register component, rc=%d\n", rc); + return rc; + } + + return 0; +} +EXPORT_SYMBOL(pcm3060_probe); + +MODULE_DESCRIPTION("PCM3060 codec driver"); +MODULE_AUTHOR("Kirill Marinushkin "); +MODULE_LICENSE("GPL v2"); diff --git a/sound/soc/codecs/pcm3060.h b/sound/soc/codecs/pcm3060.h new file mode 100644 index 000000000000..fd89a68aa8a7 --- /dev/null +++ b/sound/soc/codecs/pcm3060.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * PCM3060 codec driver + * + * Copyright (C) 2018 Kirill Marinushkin + */ + +#ifndef _SND_SOC_PCM3060_H +#define _SND_SOC_PCM3060_H + +#include +#include + +extern const struct regmap_config pcm3060_regmap; + +#define PCM3060_DAI_ID_DAC 0 +#define PCM3060_DAI_ID_ADC 1 +#define PCM3060_DAI_IDS_NUM 2 + +struct pcm3060_priv_dai { + bool is_master; + unsigned int sclk_freq; +}; + +struct pcm3060_priv { + struct regmap *regmap; + struct pcm3060_priv_dai dai[PCM3060_DAI_IDS_NUM]; +}; + +int pcm3060_probe(struct device *dev); +int pcm3060_remove(struct device *dev); + +/* registers */ + +#define PCM3060_REG64 0x40 +#define PCM3060_REG_MRST 0x80 +#define PCM3060_REG_SRST 0x40 +#define PCM3060_REG_ADPSV 0x20 +#define PCM3060_REG_DAPSV 0x10 +#define PCM3060_REG_SE 0x01 + +#define PCM3060_REG65 0x41 +#define PCM3060_REG66 0x42 +#define PCM3060_REG_AT2_MIN 0x36 +#define PCM3060_REG_AT2_MAX 0xFF + +#define PCM3060_REG67 0x43 +#define PCM3060_REG72 0x48 +#define PCM3060_REG_CSEL 0x80 +#define PCM3060_REG_MASK_MS 0x70 +#define PCM3060_REG_MS_S 0x00 +#define PCM3060_REG_MS_M768 (0x01 << 4) +#define PCM3060_REG_MS_M512 (0x02 << 4) +#define PCM3060_REG_MS_M384 (0x03 << 4) +#define PCM3060_REG_MS_M256 (0x04 << 4) +#define PCM3060_REG_MS_M192 (0x05 << 4) +#define PCM3060_REG_MS_M128 (0x06 << 4) +#define PCM3060_REG_MASK_FMT 0x03 +#define PCM3060_REG_FMT_I2S 0x00 +#define PCM3060_REG_FMT_LJ 0x01 +#define PCM3060_REG_FMT_RJ 0x02 + +#define PCM3060_REG68 0x44 +#define PCM3060_REG_OVER 0x40 +#define PCM3060_REG_DREV2 0x04 +#define PCM3060_REG_SHIFT_MUT21 0x00 +#define PCM3060_REG_SHIFT_MUT22 0x01 + +#define PCM3060_REG69 0x45 +#define PCM3060_REG_FLT 0x80 +#define PCM3060_REG_MASK_DMF 0x60 +#define PCM3060_REG_DMC 0x10 +#define PCM3060_REG_ZREV 0x02 +#define PCM3060_REG_AZRO 0x01 + +#define PCM3060_REG70 0x46 +#define PCM3060_REG71 0x47 +#define PCM3060_REG_AT1_MIN 0x0E +#define PCM3060_REG_AT1_MAX 0xFF + +#define PCM3060_REG73 0x49 +#define PCM3060_REG_ZCDD 0x10 +#define PCM3060_REG_BYP 0x08 +#define PCM3060_REG_DREV1 0x04 +#define PCM3060_REG_SHIFT_MUT11 0x00 +#define PCM3060_REG_SHIFT_MUT12 0x01 + +#endif /* _SND_SOC_PCM3060_H */ diff --git a/sound/soc/codecs/pcm3168a.c b/sound/soc/codecs/pcm3168a.c index 3356c91f55b0..52cc950c9fd1 100644 --- a/sound/soc/codecs/pcm3168a.c +++ b/sound/soc/codecs/pcm3168a.c @@ -33,6 +33,8 @@ #define PCM3168A_FMT_RIGHT_J_16 0x3 #define PCM3168A_FMT_DSP_A 0x4 #define PCM3168A_FMT_DSP_B 0x5 +#define PCM3168A_FMT_I2S_TDM 0x6 +#define PCM3168A_FMT_LEFT_J_TDM 0x7 #define PCM3168A_FMT_DSP_MASK 0x4 #define PCM3168A_NUM_SUPPLIES 6 @@ -401,9 +403,11 @@ static int pcm3168a_hw_params(struct snd_pcm_substream *substream, bool tx, master_mode; u32 val, mask, shift, reg; unsigned int rate, fmt, ratio, max_ratio; + unsigned int chan; int i, min_frame_size; rate = params_rate(params); + chan = params_channels(params); ratio = pcm3168a->sysclk / rate; @@ -456,6 +460,21 @@ static int pcm3168a_hw_params(struct snd_pcm_substream *substream, return -EINVAL; } + /* for TDM */ + if (chan > 2) { + switch (fmt) { + case PCM3168A_FMT_I2S: + fmt = PCM3168A_FMT_I2S_TDM; + break; + case PCM3168A_FMT_LEFT_J: + fmt = PCM3168A_FMT_LEFT_J_TDM; + break; + default: + dev_err(component->dev, "TDM is supported under I2S/Left_J only\n"); + return -EINVAL; + } + } + if (master_mode) val = ((i + 1) << shift); else @@ -476,7 +495,69 @@ static int pcm3168a_hw_params(struct snd_pcm_substream *substream, return 0; } +static int pcm3168a_startup(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai) +{ + struct snd_soc_component *component = dai->component; + struct pcm3168a_priv *pcm3168a = snd_soc_component_get_drvdata(component); + bool tx = substream->stream == SNDRV_PCM_STREAM_PLAYBACK; + unsigned int fmt; + unsigned int sample_min; + unsigned int channel_max; + + if (tx) + fmt = pcm3168a->dac_fmt; + else + fmt = pcm3168a->adc_fmt; + + /* + * Available Data Bits + * + * RIGHT_J : 24 / 16 + * LEFT_J : 24 + * I2S : 24 + * + * TDM available + * + * I2S + * LEFT_J + */ + switch (fmt) { + case PCM3168A_FMT_RIGHT_J: + sample_min = 16; + channel_max = 2; + break; + case PCM3168A_FMT_LEFT_J: + sample_min = 24; + if (tx) + channel_max = 8; + else + channel_max = 6; + break; + case PCM3168A_FMT_I2S: + sample_min = 24; + if (tx) + channel_max = 8; + else + channel_max = 6; + break; + default: + sample_min = 24; + channel_max = 2; + } + + snd_pcm_hw_constraint_minmax(substream->runtime, + SNDRV_PCM_HW_PARAM_SAMPLE_BITS, + sample_min, 32); + + snd_pcm_hw_constraint_minmax(substream->runtime, + SNDRV_PCM_HW_PARAM_CHANNELS, + 2, channel_max); + + return 0; +} static const struct snd_soc_dai_ops pcm3168a_dac_dai_ops = { + .startup = pcm3168a_startup, .set_fmt = pcm3168a_set_dai_fmt_dac, .set_sysclk = pcm3168a_set_dai_sysclk, .hw_params = pcm3168a_hw_params, @@ -484,6 +565,7 @@ static const struct snd_soc_dai_ops pcm3168a_dac_dai_ops = { }; static const struct snd_soc_dai_ops pcm3168a_adc_dai_ops = { + .startup = pcm3168a_startup, .set_fmt = pcm3168a_set_dai_fmt_adc, .set_sysclk = pcm3168a_set_dai_sysclk, .hw_params = pcm3168a_hw_params diff --git a/sound/soc/codecs/rt1305.c b/sound/soc/codecs/rt1305.c index c4452efc7970..c2c8a68cec97 100644 --- a/sound/soc/codecs/rt1305.c +++ b/sound/soc/codecs/rt1305.c @@ -963,7 +963,8 @@ static const struct regmap_config rt1305_regmap = { .num_reg_defaults = ARRAY_SIZE(rt1305_reg), .ranges = rt1305_ranges, .num_ranges = ARRAY_SIZE(rt1305_ranges), - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, }; #if defined(CONFIG_OF) diff --git a/sound/soc/codecs/rt274.c b/sound/soc/codecs/rt274.c index d88e67341083..0ef966d56bac 100644 --- a/sound/soc/codecs/rt274.c +++ b/sound/soc/codecs/rt274.c @@ -755,6 +755,7 @@ static int rt274_set_dai_pll(struct snd_soc_dai *dai, int pll_id, int source, break; default: dev_warn(component->dev, "invalid pll source, use BCLK\n"); + /* fall through */ case RT274_PLL2_S_BCLK: snd_soc_component_update_bits(component, RT274_PLL2_CTRL, RT274_PLL2_SRC_MASK, RT274_PLL2_SRC_BCLK); @@ -782,6 +783,7 @@ static int rt274_set_dai_pll(struct snd_soc_dai *dai, int pll_id, int source, break; default: dev_warn(component->dev, "invalid freq_in, assume 4.8M\n"); + /* fall through */ case 100: snd_soc_component_write(component, 0x7a, 0xaab6); snd_soc_component_write(component, 0x7b, 0x0301); diff --git a/sound/soc/codecs/rt5514-spi.c b/sound/soc/codecs/rt5514-spi.c index 6478d10c4f4a..4d46f4567c3a 100644 --- a/sound/soc/codecs/rt5514-spi.c +++ b/sound/soc/codecs/rt5514-spi.c @@ -91,6 +91,14 @@ static void rt5514_spi_copy_work(struct work_struct *work) runtime = rt5514_dsp->substream->runtime; period_bytes = snd_pcm_lib_period_bytes(rt5514_dsp->substream); + if (!period_bytes) { + schedule_delayed_work(&rt5514_dsp->copy_work, 5); + goto done; + } + + if (rt5514_dsp->buf_size % period_bytes) + rt5514_dsp->buf_size = (rt5514_dsp->buf_size / period_bytes) * + period_bytes; if (rt5514_dsp->get_size >= rt5514_dsp->buf_size) { rt5514_spi_burst_read(RT5514_BUFFER_VOICE_WP, (u8 *)&buf, @@ -149,13 +157,11 @@ done: static void rt5514_schedule_copy(struct rt5514_dsp *rt5514_dsp) { - size_t period_bytes; u8 buf[8]; if (!rt5514_dsp->substream) return; - period_bytes = snd_pcm_lib_period_bytes(rt5514_dsp->substream); rt5514_dsp->get_size = 0; /** @@ -183,10 +189,6 @@ static void rt5514_schedule_copy(struct rt5514_dsp *rt5514_dsp) rt5514_dsp->buf_size = rt5514_dsp->buf_limit - rt5514_dsp->buf_base; - if (rt5514_dsp->buf_size % period_bytes) - rt5514_dsp->buf_size = (rt5514_dsp->buf_size / period_bytes) * - period_bytes; - if (rt5514_dsp->buf_base && rt5514_dsp->buf_limit && rt5514_dsp->buf_rp && rt5514_dsp->buf_size) schedule_delayed_work(&rt5514_dsp->copy_work, 0); diff --git a/sound/soc/codecs/rt5514.c b/sound/soc/codecs/rt5514.c index 32fe76c3134a..a67de68b6da6 100644 --- a/sound/soc/codecs/rt5514.c +++ b/sound/soc/codecs/rt5514.c @@ -1201,7 +1201,8 @@ static const struct regmap_config rt5514_regmap = { .cache_type = REGCACHE_RBTREE, .reg_defaults = rt5514_reg, .num_reg_defaults = ARRAY_SIZE(rt5514_reg), - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, }; static const struct i2c_device_id rt5514_i2c_id[] = { diff --git a/sound/soc/codecs/rt5616.c b/sound/soc/codecs/rt5616.c index 3dc795f444ce..36a9f1c56c8d 100644 --- a/sound/soc/codecs/rt5616.c +++ b/sound/soc/codecs/rt5616.c @@ -1313,7 +1313,8 @@ static const struct snd_soc_component_driver soc_component_dev_rt5616 = { static const struct regmap_config rt5616_regmap = { .reg_bits = 8, .val_bits = 16, - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, .max_register = RT5616_DEVICE_ID + 1 + (ARRAY_SIZE(rt5616_ranges) * RT5616_PR_SPACING), .volatile_reg = rt5616_volatile_register, diff --git a/sound/soc/codecs/rt5640.c b/sound/soc/codecs/rt5640.c index 27770143ae8f..fc530481a6e4 100644 --- a/sound/soc/codecs/rt5640.c +++ b/sound/soc/codecs/rt5640.c @@ -2704,7 +2704,8 @@ static const struct snd_soc_component_driver soc_component_dev_rt5640 = { static const struct regmap_config rt5640_regmap = { .reg_bits = 8, .val_bits = 16, - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, .max_register = RT5640_VENDOR_ID2 + 1 + (ARRAY_SIZE(rt5640_ranges) * RT5640_PR_SPACING), diff --git a/sound/soc/codecs/rt5645.c b/sound/soc/codecs/rt5645.c index 1dc70f452c1b..be674688dc40 100644 --- a/sound/soc/codecs/rt5645.c +++ b/sound/soc/codecs/rt5645.c @@ -3559,7 +3559,8 @@ static const struct snd_soc_component_driver soc_component_dev_rt5645 = { static const struct regmap_config rt5645_regmap = { .reg_bits = 8, .val_bits = 16, - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, .max_register = RT5645_VENDOR_ID2 + 1 + (ARRAY_SIZE(rt5645_ranges) * RT5645_PR_SPACING), .volatile_reg = rt5645_volatile_register, @@ -3575,7 +3576,8 @@ static const struct regmap_config rt5645_regmap = { static const struct regmap_config rt5650_regmap = { .reg_bits = 8, .val_bits = 16, - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, .max_register = RT5645_VENDOR_ID2 + 1 + (ARRAY_SIZE(rt5645_ranges) * RT5645_PR_SPACING), .volatile_reg = rt5645_volatile_register, @@ -3592,7 +3594,8 @@ static const struct regmap_config temp_regmap = { .name="nocache", .reg_bits = 8, .val_bits = 16, - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, .max_register = RT5645_VENDOR_ID2 + 1, .cache_type = REGCACHE_NONE, }; diff --git a/sound/soc/codecs/rt5651.c b/sound/soc/codecs/rt5651.c index 985852fd9723..b7ba64350a07 100644 --- a/sound/soc/codecs/rt5651.c +++ b/sound/soc/codecs/rt5651.c @@ -10,7 +10,6 @@ */ #include -#include #include #include #include @@ -2124,7 +2123,8 @@ static const struct regmap_config rt5651_regmap = { .num_reg_defaults = ARRAY_SIZE(rt5651_reg), .ranges = rt5651_ranges, .num_ranges = ARRAY_SIZE(rt5651_ranges), - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, }; #if defined(CONFIG_OF) diff --git a/sound/soc/codecs/rt5660.c b/sound/soc/codecs/rt5660.c index 20a755137e63..27f7445b2432 100644 --- a/sound/soc/codecs/rt5660.c +++ b/sound/soc/codecs/rt5660.c @@ -1217,7 +1217,8 @@ static const struct snd_soc_component_driver soc_component_dev_rt5660 = { static const struct regmap_config rt5660_regmap = { .reg_bits = 8, .val_bits = 16, - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, .max_register = RT5660_VENDOR_ID2 + 1 + (ARRAY_SIZE(rt5660_ranges) * RT5660_PR_SPACING), diff --git a/sound/soc/codecs/rt5663.c b/sound/soc/codecs/rt5663.c index 9bd24ad42240..7eb2cbd39d6e 100644 --- a/sound/soc/codecs/rt5663.c +++ b/sound/soc/codecs/rt5663.c @@ -72,6 +72,7 @@ struct rt5663_priv { static const struct reg_sequence rt5663_patch_list[] = { { 0x002a, 0x8020 }, { 0x0086, 0x0028 }, + { 0x0100, 0xa020 }, { 0x0117, 0x0f28 }, { 0x02fb, 0x8089 }, }; @@ -580,7 +581,7 @@ static const struct reg_default rt5663_reg[] = { { 0x00fd, 0x0001 }, { 0x00fe, 0x10ec }, { 0x00ff, 0x6406 }, - { 0x0100, 0xa0a0 }, + { 0x0100, 0xa020 }, { 0x0108, 0x4444 }, { 0x0109, 0x4444 }, { 0x010a, 0xaaaa }, @@ -2337,6 +2338,8 @@ static int rt5663_hp_event(struct snd_soc_dapm_widget *w, 0x8000); snd_soc_component_update_bits(component, RT5663_DEPOP_1, 0x3000, 0x3000); + snd_soc_component_update_bits(component, + RT5663_DIG_VOL_ZCD, 0x00c0, 0x0080); } break; @@ -2351,6 +2354,8 @@ static int rt5663_hp_event(struct snd_soc_dapm_widget *w, RT5663_OVCD_HP_MASK, RT5663_OVCD_HP_EN); snd_soc_component_update_bits(component, RT5663_DACREF_LDO, 0x3e0e, 0); + snd_soc_component_update_bits(component, + RT5663_DIG_VOL_ZCD, 0x00c0, 0); } break; @@ -3252,7 +3257,8 @@ static const struct snd_soc_component_driver soc_component_dev_rt5663 = { static const struct regmap_config rt5663_v2_regmap = { .reg_bits = 16, .val_bits = 16, - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, .max_register = 0x07fa, .volatile_reg = rt5663_v2_volatile_register, .readable_reg = rt5663_v2_readable_register, @@ -3264,7 +3270,8 @@ static const struct regmap_config rt5663_v2_regmap = { static const struct regmap_config rt5663_regmap = { .reg_bits = 16, .val_bits = 16, - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, .max_register = 0x03f3, .volatile_reg = rt5663_volatile_register, .readable_reg = rt5663_readable_register, @@ -3277,7 +3284,8 @@ static const struct regmap_config temp_regmap = { .name = "nocache", .reg_bits = 16, .val_bits = 16, - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, .max_register = 0x03f3, .cache_type = REGCACHE_NONE, }; diff --git a/sound/soc/codecs/rt5665.c b/sound/soc/codecs/rt5665.c index 6ba99f5ed3f4..f2ad3a4c3b7f 100644 --- a/sound/soc/codecs/rt5665.c +++ b/sound/soc/codecs/rt5665.c @@ -4633,7 +4633,8 @@ static const struct regmap_config rt5665_regmap = { .cache_type = REGCACHE_RBTREE, .reg_defaults = rt5665_reg, .num_reg_defaults = ARRAY_SIZE(rt5665_reg), - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, }; static const struct i2c_device_id rt5665_i2c_id[] = { diff --git a/sound/soc/codecs/rt5668.c b/sound/soc/codecs/rt5668.c index 3c19d03f2446..230a21c93b6b 100644 --- a/sound/soc/codecs/rt5668.c +++ b/sound/soc/codecs/rt5668.c @@ -2375,7 +2375,8 @@ static const struct regmap_config rt5668_regmap = { .cache_type = REGCACHE_RBTREE, .reg_defaults = rt5668_reg, .num_reg_defaults = ARRAY_SIZE(rt5668_reg), - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, }; static const struct i2c_device_id rt5668_i2c_id[] = { @@ -2587,17 +2588,10 @@ static int rt5668_i2c_probe(struct i2c_client *i2c, } - return snd_soc_register_component(&i2c->dev, &soc_component_dev_rt5668, + return devm_snd_soc_register_component(&i2c->dev, &soc_component_dev_rt5668, rt5668_dai, ARRAY_SIZE(rt5668_dai)); } -static int rt5668_i2c_remove(struct i2c_client *i2c) -{ - snd_soc_unregister_component(&i2c->dev); - - return 0; -} - static void rt5668_i2c_shutdown(struct i2c_client *client) { struct rt5668_priv *rt5668 = i2c_get_clientdata(client); @@ -2628,7 +2622,6 @@ static struct i2c_driver rt5668_i2c_driver = { .acpi_match_table = ACPI_PTR(rt5668_acpi_match), }, .probe = rt5668_i2c_probe, - .remove = rt5668_i2c_remove, .shutdown = rt5668_i2c_shutdown, .id_table = rt5668_i2c_id, }; diff --git a/sound/soc/codecs/rt5670.c b/sound/soc/codecs/rt5670.c index 732ef928b25d..453328c988c0 100644 --- a/sound/soc/codecs/rt5670.c +++ b/sound/soc/codecs/rt5670.c @@ -2814,7 +2814,8 @@ static const struct snd_soc_component_driver soc_component_dev_rt5670 = { static const struct regmap_config rt5670_regmap = { .reg_bits = 8, .val_bits = 16, - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, .max_register = RT5670_VENDOR_ID2 + 1 + (ARRAY_SIZE(rt5670_ranges) * RT5670_PR_SPACING), .volatile_reg = rt5670_volatile_register, @@ -2875,6 +2876,18 @@ static const struct dmi_system_id dmi_platform_intel_quirks[] = { RT5670_DEV_GPIO | RT5670_JD_MODE1), }, + { + .callback = rt5670_quirk_cb, + .ident = "Lenovo Thinkpad Tablet 8", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad 8"), + }, + .driver_data = (unsigned long *)(RT5670_DMIC_EN | + RT5670_DMIC2_INR | + RT5670_DEV_GPIO | + RT5670_JD_MODE1), + }, { .callback = rt5670_quirk_cb, .ident = "Lenovo Thinkpad Tablet 10", diff --git a/sound/soc/codecs/rt5677-spi.c b/sound/soc/codecs/rt5677-spi.c index bd51f3655ee3..84501c2020c7 100644 --- a/sound/soc/codecs/rt5677-spi.c +++ b/sound/soc/codecs/rt5677-spi.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/sound/soc/codecs/rt5682.c b/sound/soc/codecs/rt5682.c index afe7d5b19313..34cfaf8f6f34 100644 --- a/sound/soc/codecs/rt5682.c +++ b/sound/soc/codecs/rt5682.c @@ -67,7 +67,8 @@ struct rt5682_priv { }; static const struct reg_sequence patch_list[] = { - {0x01c1, 0x1000}, + {RT5682_HP_IMP_SENS_CTRL_19, 0x1000}, + {RT5682_DAC_ADC_DIG_VOL1, 0xa020}, }; static const struct reg_default rt5682_reg[] = { @@ -749,7 +750,6 @@ static bool rt5682_readable_register(struct device *dev, unsigned int reg) } } -static const DECLARE_TLV_DB_SCALE(hp_vol_tlv, -2250, 150, 0); static const DECLARE_TLV_DB_SCALE(dac_vol_tlv, -6525, 75, 0); static const DECLARE_TLV_DB_SCALE(adc_vol_tlv, -1725, 75, 0); static const DECLARE_TLV_DB_SCALE(adc_bst_tlv, 0, 1200, 0); @@ -1108,10 +1108,6 @@ static void rt5682_jack_detect_handler(struct work_struct *work) } static const struct snd_kcontrol_new rt5682_snd_controls[] = { - /* Headphone Output Volume */ - SOC_DOUBLE_R_TLV("Headphone Playback Volume", RT5682_HPL_GAIN, - RT5682_HPR_GAIN, RT5682_G_HP_SFT, 15, 1, hp_vol_tlv), - /* DAC Digital Volume */ SOC_DOUBLE_TLV("DAC1 Playback Volume", RT5682_DAC1_DIG_VOL, RT5682_L_VOL_SFT + 1, RT5682_R_VOL_SFT + 1, 86, 0, dac_vol_tlv), @@ -1437,6 +1433,28 @@ static const struct snd_kcontrol_new hpor_switch = SOC_DAPM_SINGLE_AUTODISABLE("Switch", RT5682_HP_CTRL_1, RT5682_R_MUTE_SFT, 1, 1); +static int rt5682_charge_pump_event(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = + snd_soc_dapm_to_component(w->dapm); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + snd_soc_component_update_bits(component, + RT5682_HP_CHARGE_PUMP_1, RT5682_PM_HP_MASK, RT5682_PM_HP_HV); + break; + case SND_SOC_DAPM_POST_PMD: + snd_soc_component_update_bits(component, + RT5682_HP_CHARGE_PUMP_1, RT5682_PM_HP_MASK, RT5682_PM_HP_LV); + break; + default: + return 0; + } + + return 0; +} + static int rt5682_hp_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event) { @@ -1449,10 +1467,10 @@ static int rt5682_hp_event(struct snd_soc_dapm_widget *w, RT5682_HP_LOGIC_CTRL_2, 0x0012); snd_soc_component_write(component, RT5682_HP_CTRL_2, 0x6000); - snd_soc_component_update_bits(component, RT5682_STO_NG2_CTRL_1, - RT5682_NG2_EN_MASK, RT5682_NG2_EN); snd_soc_component_update_bits(component, RT5682_DEPOP_1, 0x60, 0x60); + snd_soc_component_update_bits(component, + RT5682_DAC_ADC_DIG_VOL1, 0x00c0, 0x0080); break; case SND_SOC_DAPM_POST_PMD: @@ -1460,6 +1478,8 @@ static int rt5682_hp_event(struct snd_soc_dapm_widget *w, RT5682_DEPOP_1, 0x60, 0x0); snd_soc_component_write(component, RT5682_HP_CTRL_2, 0x0000); + snd_soc_component_update_bits(component, + RT5682_DAC_ADC_DIG_VOL1, 0x00c0, 0x0000); break; default: @@ -1723,7 +1743,8 @@ static const struct snd_soc_dapm_widget rt5682_dapm_widgets[] = { SND_SOC_DAPM_SUPPLY("HP Amp R", RT5682_PWR_ANLG_1, RT5682_PWR_HA_R_BIT, 0, NULL, 0), SND_SOC_DAPM_SUPPLY_S("Charge Pump", 1, RT5682_DEPOP_1, - RT5682_PUMP_EN_SFT, 0, NULL, 0), + RT5682_PUMP_EN_SFT, 0, rt5682_charge_pump_event, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), SND_SOC_DAPM_SUPPLY_S("Capless", 2, RT5682_DEPOP_1, RT5682_CAPLESS_EN_SFT, 0, NULL, 0), @@ -1884,6 +1905,7 @@ static const struct snd_soc_dapm_route rt5682_dapm_routes[] = { {"HP Amp", NULL, "Charge Pump"}, {"HP Amp", NULL, "CLKDET SYS"}, {"HP Amp", NULL, "CBJ Power"}, + {"HP Amp", NULL, "Vref1"}, {"HP Amp", NULL, "Vref2"}, {"HPOL Playback", "Switch", "HP Amp"}, {"HPOR Playback", "Switch", "HP Amp"}, @@ -2419,7 +2441,8 @@ static const struct regmap_config rt5682_regmap = { .cache_type = REGCACHE_RBTREE, .reg_defaults = rt5682_reg, .num_reg_defaults = ARRAY_SIZE(rt5682_reg), - .use_single_rw = true, + .use_single_read = true, + .use_single_write = true, }; static const struct i2c_device_id rt5682_i2c_id[] = { @@ -2451,30 +2474,23 @@ static void rt5682_calibrate(struct rt5682_priv *rt5682) mutex_lock(&rt5682->calibrate_mutex); rt5682_reset(rt5682->regmap); - regmap_write(rt5682->regmap, RT5682_PWR_ANLG_1, 0xa2bf); + regmap_write(rt5682->regmap, RT5682_PWR_ANLG_1, 0xa2af); usleep_range(15000, 20000); - regmap_write(rt5682->regmap, RT5682_PWR_ANLG_1, 0xf2bf); - regmap_write(rt5682->regmap, RT5682_MICBIAS_2, 0x0380); - regmap_write(rt5682->regmap, RT5682_PWR_DIG_1, 0x8001); - regmap_write(rt5682->regmap, RT5682_TEST_MODE_CTRL_1, 0x0000); - regmap_write(rt5682->regmap, RT5682_STO1_DAC_MIXER, 0x2080); - regmap_write(rt5682->regmap, RT5682_STO1_ADC_MIXER, 0x4040); - regmap_write(rt5682->regmap, RT5682_DEPOP_1, 0x0069); + regmap_write(rt5682->regmap, RT5682_PWR_ANLG_1, 0xf2af); + regmap_write(rt5682->regmap, RT5682_MICBIAS_2, 0x0300); + regmap_write(rt5682->regmap, RT5682_GLB_CLK, 0x8000); + regmap_write(rt5682->regmap, RT5682_PWR_DIG_1, 0x0100); + regmap_write(rt5682->regmap, RT5682_HP_IMP_SENS_CTRL_19, 0x3800); regmap_write(rt5682->regmap, RT5682_CHOP_DAC, 0x3000); - regmap_write(rt5682->regmap, RT5682_HP_CTRL_2, 0x6000); - regmap_write(rt5682->regmap, RT5682_HP_CHARGE_PUMP_1, 0x0f26); - regmap_write(rt5682->regmap, RT5682_CALIB_ADC_CTRL, 0x7f05); + regmap_write(rt5682->regmap, RT5682_CALIB_ADC_CTRL, 0x7005); regmap_write(rt5682->regmap, RT5682_STO1_ADC_MIXER, 0x686c); regmap_write(rt5682->regmap, RT5682_CAL_REC, 0x0d0d); - regmap_write(rt5682->regmap, RT5682_HP_CALIB_CTRL_9, 0x000f); - regmap_write(rt5682->regmap, RT5682_PWR_DIG_1, 0x8d01); regmap_write(rt5682->regmap, RT5682_HP_CALIB_CTRL_2, 0x0321); regmap_write(rt5682->regmap, RT5682_HP_LOGIC_CTRL_2, 0x0004); regmap_write(rt5682->regmap, RT5682_HP_CALIB_CTRL_1, 0x7c00); regmap_write(rt5682->regmap, RT5682_HP_CALIB_CTRL_3, 0x06a1); regmap_write(rt5682->regmap, RT5682_A_DAC1_MUX, 0x0311); - regmap_write(rt5682->regmap, RT5682_RESET_HPF_CTRL, 0x0000); - regmap_write(rt5682->regmap, RT5682_ADC_STO1_HP_CTRL_1, 0x3320); + regmap_write(rt5682->regmap, RT5682_HP_CALIB_CTRL_1, 0x7c00); regmap_write(rt5682->regmap, RT5682_HP_CALIB_CTRL_1, 0xfc00); @@ -2490,8 +2506,12 @@ static void rt5682_calibrate(struct rt5682_priv *rt5682) pr_err("HP Calibration Failure\n"); /* restore settings */ - regmap_write(rt5682->regmap, RT5682_STO1_ADC_MIXER, 0xc0c4); + regmap_write(rt5682->regmap, RT5682_PWR_ANLG_1, 0x02af); + regmap_write(rt5682->regmap, RT5682_MICBIAS_2, 0x0080); + regmap_write(rt5682->regmap, RT5682_GLB_CLK, 0x0000); regmap_write(rt5682->regmap, RT5682_PWR_DIG_1, 0x0000); + regmap_write(rt5682->regmap, RT5682_CHOP_DAC, 0x2000); + regmap_write(rt5682->regmap, RT5682_CALIB_ADC_CTRL, 0x2005); mutex_unlock(&rt5682->calibrate_mutex); @@ -2565,7 +2585,7 @@ static int rt5682_i2c_probe(struct i2c_client *i2c, rt5682_calibrate(rt5682); - ret = regmap_register_patch(rt5682->regmap, patch_list, + ret = regmap_multi_reg_write(rt5682->regmap, patch_list, ARRAY_SIZE(patch_list)); if (ret != 0) dev_warn(&i2c->dev, "Failed to apply regmap patch: %d\n", ret); @@ -2619,6 +2639,10 @@ static int rt5682_i2c_probe(struct i2c_client *i2c, RT5682_GP4_PIN_MASK | RT5682_GP5_PIN_MASK, RT5682_GP4_PIN_ADCDAT1 | RT5682_GP5_PIN_DACDAT1); regmap_write(rt5682->regmap, RT5682_TEST_MODE_CTRL_1, 0x0000); + regmap_update_bits(rt5682->regmap, RT5682_BIAS_CUR_CTRL_8, + RT5682_HPA_CP_BIAS_CTRL_MASK, RT5682_HPA_CP_BIAS_3UA); + regmap_update_bits(rt5682->regmap, RT5682_CHARGE_PUMP_1, + RT5682_CP_CLK_HP_MASK, RT5682_CP_CLK_HP_300KHZ); INIT_DELAYED_WORK(&rt5682->jack_detect_work, rt5682_jack_detect_handler); @@ -2636,11 +2660,17 @@ static int rt5682_i2c_probe(struct i2c_client *i2c, } - return devm_snd_soc_register_component(&i2c->dev, - &soc_component_dev_rt5682, + return snd_soc_register_component(&i2c->dev, &soc_component_dev_rt5682, rt5682_dai, ARRAY_SIZE(rt5682_dai)); } +static int rt5682_i2c_remove(struct i2c_client *i2c) +{ + snd_soc_unregister_component(&i2c->dev); + + return 0; +} + static void rt5682_i2c_shutdown(struct i2c_client *client) { struct rt5682_priv *rt5682 = i2c_get_clientdata(client); @@ -2671,6 +2701,7 @@ static struct i2c_driver rt5682_i2c_driver = { .acpi_match_table = ACPI_PTR(rt5682_acpi_match), }, .probe = rt5682_i2c_probe, + .remove = rt5682_i2c_remove, .shutdown = rt5682_i2c_shutdown, .id_table = rt5682_i2c_id, }; diff --git a/sound/soc/codecs/rt5682.h b/sound/soc/codecs/rt5682.h index 8068140ebe3f..d82a8301fd74 100644 --- a/sound/soc/codecs/rt5682.h +++ b/sound/soc/codecs/rt5682.h @@ -1214,6 +1214,20 @@ #define RT5682_JDH_NO_PLUG (0x1 << 4) #define RT5682_JDH_PLUG (0x0 << 4) +/* Bias current control 8 (0x0111) */ +#define RT5682_HPA_CP_BIAS_CTRL_MASK (0x3 << 2) +#define RT5682_HPA_CP_BIAS_2UA (0x0 << 2) +#define RT5682_HPA_CP_BIAS_3UA (0x1 << 2) +#define RT5682_HPA_CP_BIAS_4UA (0x2 << 2) +#define RT5682_HPA_CP_BIAS_6UA (0x3 << 2) + +/* Charge Pump Internal Register1 (0x0125) */ +#define RT5682_CP_CLK_HP_MASK (0x3 << 4) +#define RT5682_CP_CLK_HP_100KHZ (0x0 << 4) +#define RT5682_CP_CLK_HP_200KHZ (0x1 << 4) +#define RT5682_CP_CLK_HP_300KHZ (0x2 << 4) +#define RT5682_CP_CLK_HP_600KHZ (0x3 << 4) + /* Chopper and Clock control for DAC (0x013a)*/ #define RT5682_CKXEN_DAC1_MASK (0x1 << 13) #define RT5682_CKXEN_DAC1_SFT 13 diff --git a/sound/soc/codecs/sgtl5000.c b/sound/soc/codecs/sgtl5000.c index 60764f6201b1..add18d6d77da 100644 --- a/sound/soc/codecs/sgtl5000.c +++ b/sound/soc/codecs/sgtl5000.c @@ -1218,7 +1218,7 @@ static int sgtl5000_set_power_regs(struct snd_soc_component *component) * Searching for a suitable index solving this formula: * idx = 40 * log10(vag_val / lo_cagcntrl) + 15 */ - vol_quot = (vag * 100) / lo_vag; + vol_quot = lo_vag ? (vag * 100) / lo_vag : 0; lo_vol = 0; for (i = 0; i < ARRAY_SIZE(vol_quot_table); i++) { if (vol_quot >= vol_quot_table[i]) diff --git a/sound/soc/codecs/sta32x.c b/sound/soc/codecs/sta32x.c index d5035f2f2b2b..f753d2db0a5a 100644 --- a/sound/soc/codecs/sta32x.c +++ b/sound/soc/codecs/sta32x.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -142,6 +143,7 @@ static const char *sta32x_supply_names[] = { /* codec private data */ struct sta32x_priv { struct regmap *regmap; + struct clk *xti_clk; struct regulator_bulk_data supplies[ARRAY_SIZE(sta32x_supply_names)]; struct snd_soc_component *component; struct sta32x_platform_data *pdata; @@ -879,6 +881,18 @@ static int sta32x_probe(struct snd_soc_component *component) struct sta32x_priv *sta32x = snd_soc_component_get_drvdata(component); struct sta32x_platform_data *pdata = sta32x->pdata; int i, ret = 0, thermal = 0; + + sta32x->component = component; + + if (sta32x->xti_clk) { + ret = clk_prepare_enable(sta32x->xti_clk); + if (ret != 0) { + dev_err(component->dev, + "Failed to enable clock: %d\n", ret); + return ret; + } + } + ret = regulator_bulk_enable(ARRAY_SIZE(sta32x->supplies), sta32x->supplies); if (ret != 0) { @@ -981,6 +995,9 @@ static void sta32x_remove(struct snd_soc_component *component) sta32x_watchdog_stop(sta32x); regulator_bulk_disable(ARRAY_SIZE(sta32x->supplies), sta32x->supplies); + + if (sta32x->xti_clk) + clk_disable_unprepare(sta32x->xti_clk); } static const struct snd_soc_component_driver sta32x_component = { @@ -1038,6 +1055,8 @@ static int sta32x_probe_dt(struct device *dev, struct sta32x_priv *sta32x) of_property_read_u8(np, "st,ch3-output-mapping", &pdata->ch3_output_mapping); + if (of_get_property(np, "st,fault-detect-recovery", NULL)) + pdata->fault_detect_recovery = 1; if (of_get_property(np, "st,thermal-warning-recovery", NULL)) pdata->thermal_warning_recovery = 1; if (of_get_property(np, "st,thermal-warning-adjustment", NULL)) @@ -1095,6 +1114,17 @@ static int sta32x_i2c_probe(struct i2c_client *i2c, } #endif + /* Clock */ + sta32x->xti_clk = devm_clk_get(dev, "xti"); + if (IS_ERR(sta32x->xti_clk)) { + ret = PTR_ERR(sta32x->xti_clk); + + if (ret == -EPROBE_DEFER) + return ret; + + sta32x->xti_clk = NULL; + } + /* GPIOs */ sta32x->gpiod_nreset = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW); diff --git a/sound/soc/codecs/tas5720.c b/sound/soc/codecs/tas5720.c index ae3d032ac35a..6bd0e5d5347f 100644 --- a/sound/soc/codecs/tas5720.c +++ b/sound/soc/codecs/tas5720.c @@ -152,6 +152,7 @@ static int tas5720_set_dai_tdm_slot(struct snd_soc_dai *dai, int slots, int slot_width) { struct snd_soc_component *component = dai->component; + struct tas5720_data *tas5720 = snd_soc_component_get_drvdata(component); unsigned int first_slot; int ret; @@ -185,6 +186,20 @@ static int tas5720_set_dai_tdm_slot(struct snd_soc_dai *dai, if (ret < 0) goto error_snd_soc_component_update_bits; + /* Configure TDM slot width. This is only applicable to TAS5722. */ + switch (tas5720->devtype) { + case TAS5722: + ret = snd_soc_component_update_bits(component, TAS5722_DIGITAL_CTRL2_REG, + TAS5722_TDM_SLOT_16B, + slot_width == 16 ? + TAS5722_TDM_SLOT_16B : 0); + if (ret < 0) + goto error_snd_soc_component_update_bits; + break; + default: + break; + } + return 0; error_snd_soc_component_update_bits: @@ -485,15 +500,56 @@ static const DECLARE_TLV_DB_RANGE(dac_analog_tlv, ); /* - * DAC digital volumes. From -103.5 to 24 dB in 0.5 dB steps. Note that - * setting the gain below -100 dB (register value <0x7) is effectively a MUTE - * as per device datasheet. + * DAC digital volumes. From -103.5 to 24 dB in 0.5 dB or 0.25 dB steps + * depending on the device. Note that setting the gain below -100 dB + * (register value <0x7) is effectively a MUTE as per device datasheet. + * + * Note that for the TAS5722 the digital volume controls are actually split + * over two registers, so we need custom getters/setters for access. */ -static DECLARE_TLV_DB_SCALE(dac_tlv, -10350, 50, 0); +static DECLARE_TLV_DB_SCALE(tas5720_dac_tlv, -10350, 50, 0); +static DECLARE_TLV_DB_SCALE(tas5722_dac_tlv, -10350, 25, 0); + +static int tas5722_volume_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *component = snd_soc_kcontrol_component(kcontrol); + unsigned int val; + + snd_soc_component_read(component, TAS5720_VOLUME_CTRL_REG, &val); + ucontrol->value.integer.value[0] = val << 1; + + snd_soc_component_read(component, TAS5722_DIGITAL_CTRL2_REG, &val); + ucontrol->value.integer.value[0] |= val & TAS5722_VOL_CONTROL_LSB; + + return 0; +} + +static int tas5722_volume_set(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *component = snd_soc_kcontrol_component(kcontrol); + unsigned int sel = ucontrol->value.integer.value[0]; + + snd_soc_component_write(component, TAS5720_VOLUME_CTRL_REG, sel >> 1); + snd_soc_component_update_bits(component, TAS5722_DIGITAL_CTRL2_REG, + TAS5722_VOL_CONTROL_LSB, sel); + + return 0; +} static const struct snd_kcontrol_new tas5720_snd_controls[] = { SOC_SINGLE_TLV("Speaker Driver Playback Volume", - TAS5720_VOLUME_CTRL_REG, 0, 0xff, 0, dac_tlv), + TAS5720_VOLUME_CTRL_REG, 0, 0xff, 0, tas5720_dac_tlv), + SOC_SINGLE_TLV("Speaker Driver Analog Gain", TAS5720_ANALOG_CTRL_REG, + TAS5720_ANALOG_GAIN_SHIFT, 3, 0, dac_analog_tlv), +}; + +static const struct snd_kcontrol_new tas5722_snd_controls[] = { + SOC_SINGLE_EXT_TLV("Speaker Driver Playback Volume", + 0, 0, 511, 0, + tas5722_volume_get, tas5722_volume_set, + tas5722_dac_tlv), SOC_SINGLE_TLV("Speaker Driver Analog Gain", TAS5720_ANALOG_CTRL_REG, TAS5720_ANALOG_GAIN_SHIFT, 3, 0, dac_analog_tlv), }; @@ -527,6 +583,23 @@ static const struct snd_soc_component_driver soc_component_dev_tas5720 = { .non_legacy_dai_naming = 1, }; +static const struct snd_soc_component_driver soc_component_dev_tas5722 = { + .probe = tas5720_codec_probe, + .remove = tas5720_codec_remove, + .suspend = tas5720_suspend, + .resume = tas5720_resume, + .controls = tas5722_snd_controls, + .num_controls = ARRAY_SIZE(tas5722_snd_controls), + .dapm_widgets = tas5720_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(tas5720_dapm_widgets), + .dapm_routes = tas5720_audio_map, + .num_dapm_routes = ARRAY_SIZE(tas5720_audio_map), + .idle_bias_on = 1, + .use_pmdown_time = 1, + .endianness = 1, + .non_legacy_dai_naming = 1, +}; + /* PCM rates supported by the TAS5720 driver */ #define TAS5720_RATES (SNDRV_PCM_RATE_44100 | SNDRV_PCM_RATE_48000 |\ SNDRV_PCM_RATE_88200 | SNDRV_PCM_RATE_96000) @@ -613,9 +686,23 @@ static int tas5720_probe(struct i2c_client *client, dev_set_drvdata(dev, data); - ret = devm_snd_soc_register_component(&client->dev, - &soc_component_dev_tas5720, - tas5720_dai, ARRAY_SIZE(tas5720_dai)); + switch (id->driver_data) { + case TAS5720: + ret = devm_snd_soc_register_component(&client->dev, + &soc_component_dev_tas5720, + tas5720_dai, + ARRAY_SIZE(tas5720_dai)); + break; + case TAS5722: + ret = devm_snd_soc_register_component(&client->dev, + &soc_component_dev_tas5722, + tas5720_dai, + ARRAY_SIZE(tas5720_dai)); + break; + default: + dev_err(dev, "unexpected private driver data\n"); + return -EINVAL; + } if (ret < 0) { dev_err(dev, "failed to register component: %d\n", ret); return ret; diff --git a/sound/soc/codecs/tas6424.c b/sound/soc/codecs/tas6424.c index 0d6145549a98..36aebdb8f55c 100644 --- a/sound/soc/codecs/tas6424.c +++ b/sound/soc/codecs/tas6424.c @@ -41,6 +41,7 @@ struct tas6424_data { struct regmap *regmap; struct regulator_bulk_data supplies[TAS6424_NUM_SUPPLIES]; struct delayed_work fault_check_work; + unsigned int last_cfault; unsigned int last_fault1; unsigned int last_fault2; unsigned int last_warn; @@ -406,9 +407,54 @@ static void tas6424_fault_check_work(struct work_struct *work) unsigned int reg; int ret; + ret = regmap_read(tas6424->regmap, TAS6424_CHANNEL_FAULT, ®); + if (ret < 0) { + dev_err(dev, "failed to read CHANNEL_FAULT register: %d\n", ret); + goto out; + } + + if (!reg) { + tas6424->last_cfault = reg; + goto check_global_fault1_reg; + } + + /* + * Only flag errors once for a given occurrence. This is needed as + * the TAS6424 will take time clearing the fault condition internally + * during which we don't want to bombard the system with the same + * error message over and over. + */ + if ((reg & TAS6424_FAULT_OC_CH1) && !(tas6424->last_cfault & TAS6424_FAULT_OC_CH1)) + dev_crit(dev, "experienced a channel 1 overcurrent fault\n"); + + if ((reg & TAS6424_FAULT_OC_CH2) && !(tas6424->last_cfault & TAS6424_FAULT_OC_CH2)) + dev_crit(dev, "experienced a channel 2 overcurrent fault\n"); + + if ((reg & TAS6424_FAULT_OC_CH3) && !(tas6424->last_cfault & TAS6424_FAULT_OC_CH3)) + dev_crit(dev, "experienced a channel 3 overcurrent fault\n"); + + if ((reg & TAS6424_FAULT_OC_CH4) && !(tas6424->last_cfault & TAS6424_FAULT_OC_CH4)) + dev_crit(dev, "experienced a channel 4 overcurrent fault\n"); + + if ((reg & TAS6424_FAULT_DC_CH1) && !(tas6424->last_cfault & TAS6424_FAULT_DC_CH1)) + dev_crit(dev, "experienced a channel 1 DC fault\n"); + + if ((reg & TAS6424_FAULT_DC_CH2) && !(tas6424->last_cfault & TAS6424_FAULT_DC_CH2)) + dev_crit(dev, "experienced a channel 2 DC fault\n"); + + if ((reg & TAS6424_FAULT_DC_CH3) && !(tas6424->last_cfault & TAS6424_FAULT_DC_CH3)) + dev_crit(dev, "experienced a channel 3 DC fault\n"); + + if ((reg & TAS6424_FAULT_DC_CH4) && !(tas6424->last_cfault & TAS6424_FAULT_DC_CH4)) + dev_crit(dev, "experienced a channel 4 DC fault\n"); + + /* Store current fault1 value so we can detect any changes next time */ + tas6424->last_cfault = reg; + +check_global_fault1_reg: ret = regmap_read(tas6424->regmap, TAS6424_GLOB_FAULT1, ®); if (ret < 0) { - dev_err(dev, "failed to read FAULT1 register: %d\n", ret); + dev_err(dev, "failed to read GLOB_FAULT1 register: %d\n", ret); goto out; } @@ -429,12 +475,6 @@ static void tas6424_fault_check_work(struct work_struct *work) goto check_global_fault2_reg; } - /* - * Only flag errors once for a given occurrence. This is needed as - * the TAS6424 will take time clearing the fault condition internally - * during which we don't want to bombard the system with the same - * error message over and over. - */ if ((reg & TAS6424_FAULT_PVDD_OV) && !(tas6424->last_fault1 & TAS6424_FAULT_PVDD_OV)) dev_crit(dev, "experienced a PVDD overvoltage fault\n"); @@ -453,7 +493,7 @@ static void tas6424_fault_check_work(struct work_struct *work) check_global_fault2_reg: ret = regmap_read(tas6424->regmap, TAS6424_GLOB_FAULT2, ®); if (ret < 0) { - dev_err(dev, "failed to read FAULT2 register: %d\n", ret); + dev_err(dev, "failed to read GLOB_FAULT2 register: %d\n", ret); goto out; } @@ -530,7 +570,7 @@ check_warn_reg: /* Store current warn value so we can detect any changes next time */ tas6424->last_warn = reg; - /* Clear any faults by toggling the CLEAR_FAULT control bit */ + /* Clear any warnings by toggling the CLEAR_FAULT control bit */ ret = regmap_write_bits(tas6424->regmap, TAS6424_MISC_CTRL3, TAS6424_CLEAR_FAULT, TAS6424_CLEAR_FAULT); if (ret < 0) diff --git a/sound/soc/codecs/tas6424.h b/sound/soc/codecs/tas6424.h index b5958c45ed0e..c67a7835ca66 100644 --- a/sound/soc/codecs/tas6424.h +++ b/sound/soc/codecs/tas6424.h @@ -115,6 +115,16 @@ #define TAS6424_LDGBYPASS_SHIFT 0 #define TAS6424_LDGBYPASS_MASK BIT(TAS6424_LDGBYPASS_SHIFT) +/* TAS6424_GLOB_FAULT1_REG */ +#define TAS6424_FAULT_OC_CH1 BIT(7) +#define TAS6424_FAULT_OC_CH2 BIT(6) +#define TAS6424_FAULT_OC_CH3 BIT(5) +#define TAS6424_FAULT_OC_CH4 BIT(4) +#define TAS6424_FAULT_DC_CH1 BIT(3) +#define TAS6424_FAULT_DC_CH2 BIT(2) +#define TAS6424_FAULT_DC_CH3 BIT(1) +#define TAS6424_FAULT_DC_CH4 BIT(0) + /* TAS6424_GLOB_FAULT1_REG */ #define TAS6424_FAULT_CLOCK BIT(4) #define TAS6424_FAULT_PVDD_OV BIT(3) diff --git a/sound/soc/codecs/tlv320aic31xx.c b/sound/soc/codecs/tlv320aic31xx.c index bf92d36b8f8a..608ad49ad978 100644 --- a/sound/soc/codecs/tlv320aic31xx.c +++ b/sound/soc/codecs/tlv320aic31xx.c @@ -167,6 +167,7 @@ struct aic31xx_priv { u8 p_div; int rate_div_line; bool master_dapm_route_applied; + int irq; }; struct aic31xx_rate_divs { @@ -1391,6 +1392,69 @@ static const struct acpi_device_id aic31xx_acpi_match[] = { MODULE_DEVICE_TABLE(acpi, aic31xx_acpi_match); #endif +static irqreturn_t aic31xx_irq(int irq, void *data) +{ + struct aic31xx_priv *aic31xx = data; + struct device *dev = aic31xx->dev; + unsigned int value; + bool handled = false; + int ret; + + ret = regmap_read(aic31xx->regmap, AIC31XX_INTRDACFLAG, &value); + if (ret) { + dev_err(dev, "Failed to read interrupt mask: %d\n", ret); + goto exit; + } + + if (value) + handled = true; + else + goto read_overflow; + + if (value & AIC31XX_HPLSCDETECT) + dev_err(dev, "Short circuit on Left output is detected\n"); + if (value & AIC31XX_HPRSCDETECT) + dev_err(dev, "Short circuit on Right output is detected\n"); + if (value & ~(AIC31XX_HPLSCDETECT | + AIC31XX_HPRSCDETECT)) + dev_err(dev, "Unknown DAC interrupt flags: 0x%08x\n", value); + +read_overflow: + ret = regmap_read(aic31xx->regmap, AIC31XX_OFFLAG, &value); + if (ret) { + dev_err(dev, "Failed to read overflow flag: %d\n", ret); + goto exit; + } + + if (value) + handled = true; + else + goto exit; + + if (value & AIC31XX_DAC_OF_LEFT) + dev_warn(dev, "Left-channel DAC overflow has occurred\n"); + if (value & AIC31XX_DAC_OF_RIGHT) + dev_warn(dev, "Right-channel DAC overflow has occurred\n"); + if (value & AIC31XX_DAC_OF_SHIFTER) + dev_warn(dev, "DAC barrel shifter overflow has occurred\n"); + if (value & AIC31XX_ADC_OF) + dev_warn(dev, "ADC overflow has occurred\n"); + if (value & AIC31XX_ADC_OF_SHIFTER) + dev_warn(dev, "ADC barrel shifter overflow has occurred\n"); + if (value & ~(AIC31XX_DAC_OF_LEFT | + AIC31XX_DAC_OF_RIGHT | + AIC31XX_DAC_OF_SHIFTER | + AIC31XX_ADC_OF | + AIC31XX_ADC_OF_SHIFTER)) + dev_warn(dev, "Unknown overflow interrupt flags: 0x%08x\n", value); + +exit: + if (handled) + return IRQ_HANDLED; + else + return IRQ_NONE; +} + static int aic31xx_i2c_probe(struct i2c_client *i2c, const struct i2c_device_id *id) { @@ -1413,6 +1477,7 @@ static int aic31xx_i2c_probe(struct i2c_client *i2c, return ret; } aic31xx->dev = &i2c->dev; + aic31xx->irq = i2c->irq; aic31xx->codec_type = id->driver_data; @@ -1456,6 +1521,26 @@ static int aic31xx_i2c_probe(struct i2c_client *i2c, return ret; } + if (aic31xx->irq > 0) { + regmap_update_bits(aic31xx->regmap, AIC31XX_GPIO1, + AIC31XX_GPIO1_FUNC_MASK, + AIC31XX_GPIO1_INT1 << + AIC31XX_GPIO1_FUNC_SHIFT); + + regmap_write(aic31xx->regmap, AIC31XX_INT1CTRL, + AIC31XX_SC | + AIC31XX_ENGINE); + + ret = devm_request_threaded_irq(aic31xx->dev, aic31xx->irq, + NULL, aic31xx_irq, + IRQF_ONESHOT, "aic31xx-irq", + aic31xx); + if (ret) { + dev_err(aic31xx->dev, "Unable to request IRQ\n"); + return ret; + } + } + if (aic31xx->codec_type & DAC31XX_BIT) return devm_snd_soc_register_component(&i2c->dev, &soc_codec_driver_aic31xx, diff --git a/sound/soc/codecs/tlv320aic31xx.h b/sound/soc/codecs/tlv320aic31xx.h index 0b587585b38b..2636f2c6bc79 100644 --- a/sound/soc/codecs/tlv320aic31xx.h +++ b/sound/soc/codecs/tlv320aic31xx.h @@ -173,6 +173,13 @@ struct aic31xx_pdata { #define AIC31XX_HPRDRVPWRSTATUS_MASK BIT(1) #define AIC31XX_SPRDRVPWRSTATUS_MASK BIT(0) +/* AIC31XX_OFFLAG */ +#define AIC31XX_DAC_OF_LEFT BIT(7) +#define AIC31XX_DAC_OF_RIGHT BIT(6) +#define AIC31XX_DAC_OF_SHIFTER BIT(5) +#define AIC31XX_ADC_OF BIT(3) +#define AIC31XX_ADC_OF_SHIFTER BIT(1) + /* AIC31XX_INTRDACFLAG */ #define AIC31XX_HPLSCDETECT BIT(7) #define AIC31XX_HPRSCDETECT BIT(6) @@ -191,6 +198,22 @@ struct aic31xx_pdata { #define AIC31XX_SC BIT(3) #define AIC31XX_ENGINE BIT(2) +/* AIC31XX_GPIO1 */ +#define AIC31XX_GPIO1_FUNC_MASK GENMASK(5, 2) +#define AIC31XX_GPIO1_FUNC_SHIFT 2 +#define AIC31XX_GPIO1_DISABLED 0x00 +#define AIC31XX_GPIO1_INPUT 0x01 +#define AIC31XX_GPIO1_GPI 0x02 +#define AIC31XX_GPIO1_GPO 0x03 +#define AIC31XX_GPIO1_CLKOUT 0x04 +#define AIC31XX_GPIO1_INT1 0x05 +#define AIC31XX_GPIO1_INT2 0x06 +#define AIC31XX_GPIO1_ADC_WCLK 0x07 +#define AIC31XX_GPIO1_SBCLK 0x08 +#define AIC31XX_GPIO1_SWCLK 0x09 +#define AIC31XX_GPIO1_ADC_MOD_CLK 0x10 +#define AIC31XX_GPIO1_SDOUT 0x11 + /* AIC31XX_DACSETUP */ #define AIC31XX_SOFTSTEP_MASK GENMASK(1, 0) diff --git a/sound/soc/codecs/tscs454.c b/sound/soc/codecs/tscs454.c index ff85a0bf6170..93d84e5ae2d5 100644 --- a/sound/soc/codecs/tscs454.c +++ b/sound/soc/codecs/tscs454.c @@ -3459,7 +3459,7 @@ static int tscs454_i2c_probe(struct i2c_client *i2c, /* Sync pg sel reg with cache */ regmap_write(tscs454->regmap, R_PAGESEL, 0x00); - ret = snd_soc_register_component(&i2c->dev, &soc_component_dev_tscs454, + ret = devm_snd_soc_register_component(&i2c->dev, &soc_component_dev_tscs454, tscs454_dais, ARRAY_SIZE(tscs454_dais)); if (ret) { dev_err(&i2c->dev, "Failed to register component (%d)\n", ret); diff --git a/sound/soc/codecs/wm2000.c b/sound/soc/codecs/wm2000.c index c5ae07234a00..bba330e30162 100644 --- a/sound/soc/codecs/wm2000.c +++ b/sound/soc/codecs/wm2000.c @@ -88,19 +88,6 @@ static int wm2000_write(struct i2c_client *i2c, unsigned int reg, return regmap_write(wm2000->regmap, reg, value); } -static unsigned int wm2000_read(struct i2c_client *i2c, unsigned int r) -{ - struct wm2000_priv *wm2000 = i2c_get_clientdata(i2c); - unsigned int val; - int ret; - - ret = regmap_read(wm2000->regmap, r, &val); - if (ret < 0) - return -1; - - return val; -} - static void wm2000_reset(struct wm2000_priv *wm2000) { struct i2c_client *i2c = wm2000->i2c; @@ -115,14 +102,15 @@ static void wm2000_reset(struct wm2000_priv *wm2000) static int wm2000_poll_bit(struct i2c_client *i2c, unsigned int reg, u8 mask) { + struct wm2000_priv *wm2000 = i2c_get_clientdata(i2c); int timeout = 4000; - int val; + unsigned int val; - val = wm2000_read(i2c, reg); + regmap_read(wm2000->regmap, reg, &val); while (!(val & mask) && --timeout) { msleep(1); - val = wm2000_read(i2c, reg); + regmap_read(wm2000->regmap, reg, &val); } if (timeout == 0) @@ -135,6 +123,7 @@ static int wm2000_power_up(struct i2c_client *i2c, int analogue) { struct wm2000_priv *wm2000 = dev_get_drvdata(&i2c->dev); unsigned long rate; + unsigned int val; int ret; if (WARN_ON(wm2000->anc_mode != ANC_OFF)) @@ -213,12 +202,17 @@ static int wm2000_power_up(struct i2c_client *i2c, int analogue) WM2000_MODE_THERMAL_ENABLE); } - ret = wm2000_read(i2c, WM2000_REG_SPEECH_CLARITY); + ret = regmap_read(wm2000->regmap, WM2000_REG_SPEECH_CLARITY, &val); + if (ret != 0) { + dev_err(&i2c->dev, "Unable to read Speech Clarity: %d\n", ret); + regulator_bulk_disable(WM2000_NUM_SUPPLIES, wm2000->supplies); + return ret; + } if (wm2000->speech_clarity) - ret |= WM2000_SPEECH_CLARITY; + val |= WM2000_SPEECH_CLARITY; else - ret &= ~WM2000_SPEECH_CLARITY; - wm2000_write(i2c, WM2000_REG_SPEECH_CLARITY, ret); + val &= ~WM2000_SPEECH_CLARITY; + wm2000_write(i2c, WM2000_REG_SPEECH_CLARITY, val); wm2000_write(i2c, WM2000_REG_SYS_START0, 0x33); wm2000_write(i2c, WM2000_REG_SYS_START1, 0x02); @@ -824,7 +818,7 @@ static int wm2000_i2c_probe(struct i2c_client *i2c, const char *filename; const struct firmware *fw = NULL; int ret, i; - int reg; + unsigned int reg; u16 id; wm2000 = devm_kzalloc(&i2c->dev, sizeof(*wm2000), GFP_KERNEL); @@ -860,9 +854,17 @@ static int wm2000_i2c_probe(struct i2c_client *i2c, } /* Verify that this is a WM2000 */ - reg = wm2000_read(i2c, WM2000_REG_ID1); + ret = regmap_read(wm2000->regmap, WM2000_REG_ID1, ®); + if (ret != 0) { + dev_err(&i2c->dev, "Unable to read ID1: %d\n", ret); + return ret; + } id = reg << 8; - reg = wm2000_read(i2c, WM2000_REG_ID2); + ret = regmap_read(wm2000->regmap, WM2000_REG_ID2, ®); + if (ret != 0) { + dev_err(&i2c->dev, "Unable to read ID2: %d\n", ret); + return ret; + } id |= reg & 0xff; if (id != 0x2000) { @@ -871,7 +873,11 @@ static int wm2000_i2c_probe(struct i2c_client *i2c, goto err_supplies; } - reg = wm2000_read(i2c, WM2000_REG_REVISON); + ret = regmap_read(wm2000->regmap, WM2000_REG_REVISON, ®); + if (ret != 0) { + dev_err(&i2c->dev, "Unable to read Revision: %d\n", ret); + return ret; + } dev_info(&i2c->dev, "revision %c\n", reg + 'A'); wm2000->mclk = devm_clk_get(&i2c->dev, "MCLK"); diff --git a/sound/soc/codecs/wm8782.c b/sound/soc/codecs/wm8782.c index 317db9a149a7..cf2cdbece122 100644 --- a/sound/soc/codecs/wm8782.c +++ b/sound/soc/codecs/wm8782.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -50,7 +51,51 @@ static struct snd_soc_dai_driver wm8782_dai = { }, }; +/* regulator power supply names */ +static const char *supply_names[] = { + "Vdda", /* analog supply, 2.7V - 3.6V */ + "Vdd", /* digital supply, 2.7V - 5.5V */ +}; + +struct wm8782_priv { + struct regulator_bulk_data supplies[ARRAY_SIZE(supply_names)]; +}; + +static int wm8782_soc_probe(struct snd_soc_component *component) +{ + struct wm8782_priv *priv = snd_soc_component_get_drvdata(component); + return regulator_bulk_enable(ARRAY_SIZE(priv->supplies), priv->supplies); +} + +static void wm8782_soc_remove(struct snd_soc_component *component) +{ + struct wm8782_priv *priv = snd_soc_component_get_drvdata(component); + regulator_bulk_disable(ARRAY_SIZE(priv->supplies), priv->supplies); +} + +#ifdef CONFIG_PM +static int wm8782_soc_suspend(struct snd_soc_component *component) +{ + struct wm8782_priv *priv = snd_soc_component_get_drvdata(component); + regulator_bulk_disable(ARRAY_SIZE(priv->supplies), priv->supplies); + return 0; +} + +static int wm8782_soc_resume(struct snd_soc_component *component) +{ + struct wm8782_priv *priv = snd_soc_component_get_drvdata(component); + return regulator_bulk_enable(ARRAY_SIZE(priv->supplies), priv->supplies); +} +#else +#define wm8782_soc_suspend NULL +#define wm8782_soc_resume NULL +#endif /* CONFIG_PM */ + static const struct snd_soc_component_driver soc_component_dev_wm8782 = { + .probe = wm8782_soc_probe, + .remove = wm8782_soc_remove, + .suspend = wm8782_soc_suspend, + .resume = wm8782_soc_resume, .dapm_widgets = wm8782_dapm_widgets, .num_dapm_widgets = ARRAY_SIZE(wm8782_dapm_widgets), .dapm_routes = wm8782_dapm_routes, @@ -63,6 +108,24 @@ static const struct snd_soc_component_driver soc_component_dev_wm8782 = { static int wm8782_probe(struct platform_device *pdev) { + struct device *dev = &pdev->dev; + struct wm8782_priv *priv; + int ret, i; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + dev_set_drvdata(dev, priv); + + for (i = 0; i < ARRAY_SIZE(supply_names); i++) + priv->supplies[i].supply = supply_names[i]; + + ret = devm_regulator_bulk_get(dev, ARRAY_SIZE(priv->supplies), + priv->supplies); + if (ret < 0) + return ret; + return devm_snd_soc_register_component(&pdev->dev, &soc_component_dev_wm8782, &wm8782_dai, 1); } diff --git a/sound/soc/codecs/wm8904.c b/sound/soc/codecs/wm8904.c index 1965635ec07c..2a3e5fbd04e4 100644 --- a/sound/soc/codecs/wm8904.c +++ b/sound/soc/codecs/wm8904.c @@ -13,7 +13,6 @@ #include #include -#include #include #include #include diff --git a/sound/soc/codecs/wm8974.c b/sound/soc/codecs/wm8974.c index 43edaf8cd276..593a11960888 100644 --- a/sound/soc/codecs/wm8974.c +++ b/sound/soc/codecs/wm8974.c @@ -11,7 +11,6 @@ */ #include -#include #include #include #include diff --git a/sound/soc/codecs/wm9712.c b/sound/soc/codecs/wm9712.c index ade34c26ad2f..e873baa9e778 100644 --- a/sound/soc/codecs/wm9712.c +++ b/sound/soc/codecs/wm9712.c @@ -638,13 +638,14 @@ static int wm9712_soc_probe(struct snd_soc_component *component) { struct wm9712_priv *wm9712 = snd_soc_component_get_drvdata(component); struct regmap *regmap; - int ret; if (wm9712->mfd_pdata) { wm9712->ac97 = wm9712->mfd_pdata->ac97; regmap = wm9712->mfd_pdata->regmap; } else { #ifdef CONFIG_SND_SOC_AC97_BUS + int ret; + wm9712->ac97 = snd_soc_new_ac97_component(component, WM9712_VENDOR_ID, WM9712_VENDOR_ID_MASK); if (IS_ERR(wm9712->ac97)) { diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c index f61656070225..a53dc174bbf0 100644 --- a/sound/soc/codecs/wm_adsp.c +++ b/sound/soc/codecs/wm_adsp.c @@ -311,12 +311,12 @@ struct wm_adsp_alg_xm_struct { }; struct wm_adsp_buffer { - __be32 X_buf_base; /* XM base addr of first X area */ - __be32 X_buf_size; /* Size of 1st X area in words */ - __be32 X_buf_base2; /* XM base addr of 2nd X area */ - __be32 X_buf_brk; /* Total X size in words */ - __be32 Y_buf_base; /* YM base addr of Y area */ - __be32 wrap; /* Total size X and Y in words */ + __be32 buf1_base; /* Base addr of first buffer area */ + __be32 buf1_size; /* Size of buf1 area in DSP words */ + __be32 buf2_base; /* Base addr of 2nd buffer area */ + __be32 buf1_buf2_size; /* Size of buf1+buf2 in DSP words */ + __be32 buf3_base; /* Base addr of buf3 area */ + __be32 buf_total_size; /* Size of buf1+buf2+buf3 in DSP words */ __be32 high_water_mark; /* Point at which IRQ is asserted */ __be32 irq_count; /* bits 1-31 count IRQ assertions */ __be32 irq_ack; /* acked IRQ count, bit 0 enables IRQ */ @@ -393,18 +393,18 @@ struct wm_adsp_buffer_region_def { static const struct wm_adsp_buffer_region_def default_regions[] = { { .mem_type = WMFW_ADSP2_XM, - .base_offset = HOST_BUFFER_FIELD(X_buf_base), - .size_offset = HOST_BUFFER_FIELD(X_buf_size), + .base_offset = HOST_BUFFER_FIELD(buf1_base), + .size_offset = HOST_BUFFER_FIELD(buf1_size), }, { .mem_type = WMFW_ADSP2_XM, - .base_offset = HOST_BUFFER_FIELD(X_buf_base2), - .size_offset = HOST_BUFFER_FIELD(X_buf_brk), + .base_offset = HOST_BUFFER_FIELD(buf2_base), + .size_offset = HOST_BUFFER_FIELD(buf1_buf2_size), }, { .mem_type = WMFW_ADSP2_YM, - .base_offset = HOST_BUFFER_FIELD(Y_buf_base), - .size_offset = HOST_BUFFER_FIELD(wrap), + .base_offset = HOST_BUFFER_FIELD(buf3_base), + .size_offset = HOST_BUFFER_FIELD(buf_total_size), }, }; @@ -3345,7 +3345,7 @@ static int wm_adsp_buffer_populate(struct wm_adsp_compr_buf *buf) region->cumulative_size = offset; adsp_dbg(buf->dsp, - "region=%d type=%d base=%04x off=%04x size=%04x\n", + "region=%d type=%d base=%08x off=%08x size=%08x\n", i, region->mem_type, region->base_addr, region->offset, region->cumulative_size); } diff --git a/sound/soc/davinci/davinci-mcasp.c b/sound/soc/davinci/davinci-mcasp.c index f70db8412c7c..267aee776b2d 100644 --- a/sound/soc/davinci/davinci-mcasp.c +++ b/sound/soc/davinci/davinci-mcasp.c @@ -1041,6 +1041,42 @@ static int davinci_mcasp_calc_clk_div(struct davinci_mcasp *mcasp, return error_ppm; } +static inline u32 davinci_mcasp_tx_delay(struct davinci_mcasp *mcasp) +{ + if (!mcasp->txnumevt) + return 0; + + return mcasp_get_reg(mcasp, mcasp->fifo_base + MCASP_WFIFOSTS_OFFSET); +} + +static inline u32 davinci_mcasp_rx_delay(struct davinci_mcasp *mcasp) +{ + if (!mcasp->rxnumevt) + return 0; + + return mcasp_get_reg(mcasp, mcasp->fifo_base + MCASP_RFIFOSTS_OFFSET); +} + +static snd_pcm_sframes_t davinci_mcasp_delay( + struct snd_pcm_substream *substream, + struct snd_soc_dai *cpu_dai) +{ + struct davinci_mcasp *mcasp = snd_soc_dai_get_drvdata(cpu_dai); + u32 fifo_use; + + if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) + fifo_use = davinci_mcasp_tx_delay(mcasp); + else + fifo_use = davinci_mcasp_rx_delay(mcasp); + + /* + * Divide the used locations with the channel count to get the + * FIFO usage in samples (don't care about partial samples in the + * buffer). + */ + return fifo_use / substream->runtime->channels; +} + static int davinci_mcasp_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *cpu_dai) @@ -1365,6 +1401,7 @@ static const struct snd_soc_dai_ops davinci_mcasp_dai_ops = { .startup = davinci_mcasp_startup, .shutdown = davinci_mcasp_shutdown, .trigger = davinci_mcasp_trigger, + .delay = davinci_mcasp_delay, .hw_params = davinci_mcasp_hw_params, .set_fmt = davinci_mcasp_set_dai_fmt, .set_clkdiv = davinci_mcasp_set_clkdiv, diff --git a/sound/soc/fsl/fsl_asrc_dma.c b/sound/soc/fsl/fsl_asrc_dma.c index 1033ac6631b0..01052a0808b0 100644 --- a/sound/soc/fsl/fsl_asrc_dma.c +++ b/sound/soc/fsl/fsl_asrc_dma.c @@ -151,7 +151,7 @@ static int fsl_asrc_dma_hw_params(struct snd_pcm_substream *substream, int ret; /* Fetch the Back-End dma_data from DPCM */ - list_for_each_entry(dpcm, &rtd->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(rtd, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; struct snd_pcm_substream *substream_be; struct snd_soc_dai *dai = be->cpu_dai; diff --git a/sound/soc/fsl/fsl_esai.c b/sound/soc/fsl/fsl_esai.c index c1d1d06783e5..57b484768a58 100644 --- a/sound/soc/fsl/fsl_esai.c +++ b/sound/soc/fsl/fsl_esai.c @@ -807,7 +807,7 @@ static int fsl_esai_probe(struct platform_device *pdev) return -ENOMEM; esai_priv->pdev = pdev; - strncpy(esai_priv->name, np->name, sizeof(esai_priv->name) - 1); + snprintf(esai_priv->name, sizeof(esai_priv->name), "%pOFn", np); /* Get the addresses and IRQ */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); diff --git a/sound/soc/fsl/fsl_utils.c b/sound/soc/fsl/fsl_utils.c index 7f0fa4b52223..9981668ab590 100644 --- a/sound/soc/fsl/fsl_utils.c +++ b/sound/soc/fsl/fsl_utils.c @@ -57,8 +57,8 @@ int fsl_asoc_get_dma_channel(struct device_node *ssi_np, of_node_put(dma_channel_np); return ret; } - snprintf((char *)dai->platform_name, DAI_NAME_SIZE, "%llx.%s", - (unsigned long long) res.start, dma_channel_np->name); + snprintf((char *)dai->platform_name, DAI_NAME_SIZE, "%llx.%pOFn", + (unsigned long long) res.start, dma_channel_np); iprop = of_get_property(dma_channel_np, "cell-index", NULL); if (!iprop) { diff --git a/sound/soc/fsl/pcm030-audio-fabric.c b/sound/soc/fsl/pcm030-audio-fabric.c index ec731223cab3..e339f36cea95 100644 --- a/sound/soc/fsl/pcm030-audio-fabric.c +++ b/sound/soc/fsl/pcm030-audio-fabric.c @@ -57,6 +57,7 @@ static int pcm030_fabric_probe(struct platform_device *op) struct device_node *platform_np; struct snd_soc_card *card = &pcm030_card; struct pcm030_audio_data *pdata; + struct snd_soc_dai_link *dai_link; int ret; int i; @@ -78,8 +79,8 @@ static int pcm030_fabric_probe(struct platform_device *op) return -ENODEV; } - for (i = 0; i < card->num_links; i++) - card->dai_link[i].platform_of_node = platform_np; + for_each_card_prelinks(card, i, dai_link) + dai_link->platform_of_node = platform_np; ret = request_module("snd-soc-wm9712"); if (ret) diff --git a/sound/soc/generic/audio-graph-card.c b/sound/soc/generic/audio-graph-card.c index 2094d2c8919f..25c819e402e1 100644 --- a/sound/soc/generic/audio-graph-card.c +++ b/sound/soc/generic/audio-graph-card.c @@ -25,6 +25,8 @@ struct graph_card_data { struct graph_dai_props { struct asoc_simple_dai cpu_dai; struct asoc_simple_dai codec_dai; + struct snd_soc_dai_link_component codecs; /* single codec */ + struct snd_soc_dai_link_component platform; unsigned int mclk_fs; } *dai_props; unsigned int mclk_fs; @@ -180,7 +182,8 @@ static int asoc_graph_card_dai_link_of(struct device_node *cpu_port, if (ret < 0) goto dai_link_of_err; - of_property_read_u32(rcpu_ep, "mclk-fs", &dai_props->mclk_fs); + of_property_read_u32(cpu_ep, "mclk-fs", &dai_props->mclk_fs); + of_property_read_u32(codec_ep, "mclk-fs", &dai_props->mclk_fs); ret = asoc_simple_card_parse_graph_cpu(cpu_ep, dai_link); if (ret < 0) @@ -213,7 +216,7 @@ static int asoc_graph_card_dai_link_of(struct device_node *cpu_port, ret = asoc_simple_card_set_dailink_name(dev, dai_link, "%s-%s", dai_link->cpu_dai_name, - dai_link->codec_dai_name); + dai_link->codecs->dai_name); if (ret < 0) goto dai_link_of_err; @@ -299,7 +302,7 @@ static int asoc_graph_card_probe(struct platform_device *pdev) struct graph_dai_props *dai_props; struct device *dev = &pdev->dev; struct snd_soc_card *card; - int num, ret; + int num, ret, i; /* Allocate the private data and the DAI link array */ priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); @@ -315,6 +318,18 @@ static int asoc_graph_card_probe(struct platform_device *pdev) if (!dai_props || !dai_link) return -ENOMEM; + /* + * Use snd_soc_dai_link_component instead of legacy style + * It is codec only. but cpu/platform will be supported in the future. + * see + * soc-core.c :: snd_soc_init_multicodec() + */ + for (i = 0; i < num; i++) { + dai_link[i].codecs = &dai_props[i].codecs; + dai_link[i].num_codecs = 1; + dai_link[i].platform = &dai_props[i].platform; + } + priv->pa_gpio = devm_gpiod_get_optional(dev, "pa", GPIOD_OUT_LOW); if (IS_ERR(priv->pa_gpio)) { ret = PTR_ERR(priv->pa_gpio); diff --git a/sound/soc/generic/audio-graph-scu-card.c b/sound/soc/generic/audio-graph-scu-card.c index 92882e392d6c..b83bb31021a9 100644 --- a/sound/soc/generic/audio-graph-scu-card.c +++ b/sound/soc/generic/audio-graph-scu-card.c @@ -25,7 +25,11 @@ struct graph_card_data { struct snd_soc_card snd_card; struct snd_soc_codec_conf codec_conf; - struct asoc_simple_dai *dai_props; + struct graph_dai_props { + struct asoc_simple_dai dai; + struct snd_soc_dai_link_component codecs; + struct snd_soc_dai_link_component platform; + } *dai_props; struct snd_soc_dai_link *dai_link; struct asoc_simple_card_data adata; }; @@ -39,18 +43,18 @@ static int asoc_graph_card_startup(struct snd_pcm_substream *substream) { struct snd_soc_pcm_runtime *rtd = substream->private_data; struct graph_card_data *priv = snd_soc_card_get_drvdata(rtd->card); - struct asoc_simple_dai *dai_props = graph_priv_to_props(priv, rtd->num); + struct graph_dai_props *dai_props = graph_priv_to_props(priv, rtd->num); - return asoc_simple_card_clk_enable(dai_props); + return asoc_simple_card_clk_enable(&dai_props->dai); } static void asoc_graph_card_shutdown(struct snd_pcm_substream *substream) { struct snd_soc_pcm_runtime *rtd = substream->private_data; struct graph_card_data *priv = snd_soc_card_get_drvdata(rtd->card); - struct asoc_simple_dai *dai_props = graph_priv_to_props(priv, rtd->num); + struct graph_dai_props *dai_props = graph_priv_to_props(priv, rtd->num); - asoc_simple_card_clk_disable(dai_props); + asoc_simple_card_clk_disable(&dai_props->dai); } static const struct snd_soc_ops asoc_graph_card_ops = { @@ -63,7 +67,7 @@ static int asoc_graph_card_dai_init(struct snd_soc_pcm_runtime *rtd) struct graph_card_data *priv = snd_soc_card_get_drvdata(rtd->card); struct snd_soc_dai *dai; struct snd_soc_dai_link *dai_link; - struct asoc_simple_dai *dai_props; + struct graph_dai_props *dai_props; int num = rtd->num; dai_link = graph_priv_to_link(priv, num); @@ -72,7 +76,7 @@ static int asoc_graph_card_dai_init(struct snd_soc_pcm_runtime *rtd) rtd->cpu_dai : rtd->codec_dai; - return asoc_simple_card_init_dai(dai, dai_props); + return asoc_simple_card_init_dai(dai, &dai_props->dai); } static int asoc_graph_card_be_hw_params_fixup(struct snd_soc_pcm_runtime *rtd, @@ -92,15 +96,18 @@ static int asoc_graph_card_dai_link_of(struct device_node *ep, { struct device *dev = graph_priv_to_dev(priv); struct snd_soc_dai_link *dai_link = graph_priv_to_link(priv, idx); - struct asoc_simple_dai *dai_props = graph_priv_to_props(priv, idx); + struct graph_dai_props *dai_props = graph_priv_to_props(priv, idx); struct snd_soc_card *card = graph_priv_to_card(priv); int ret; if (is_fe) { + struct snd_soc_dai_link_component *codecs; + /* BE is dummy */ - dai_link->codec_of_node = NULL; - dai_link->codec_dai_name = "snd-soc-dummy-dai"; - dai_link->codec_name = "snd-soc-dummy"; + codecs = dai_link->codecs; + codecs->of_node = NULL; + codecs->dai_name = "snd-soc-dummy-dai"; + codecs->name = "snd-soc-dummy"; /* FE settings */ dai_link->dynamic = 1; @@ -110,7 +117,7 @@ static int asoc_graph_card_dai_link_of(struct device_node *ep, if (ret) return ret; - ret = asoc_simple_card_parse_clk_cpu(dev, ep, dai_link, dai_props); + ret = asoc_simple_card_parse_clk_cpu(dev, ep, dai_link, &dai_props->dai); if (ret < 0) return ret; @@ -137,23 +144,23 @@ static int asoc_graph_card_dai_link_of(struct device_node *ep, if (ret < 0) return ret; - ret = asoc_simple_card_parse_clk_codec(dev, ep, dai_link, dai_props); + ret = asoc_simple_card_parse_clk_codec(dev, ep, dai_link, &dai_props->dai); if (ret < 0) return ret; ret = asoc_simple_card_set_dailink_name(dev, dai_link, "be.%s", - dai_link->codec_dai_name); + dai_link->codecs->dai_name); if (ret < 0) return ret; snd_soc_of_parse_audio_prefix(card, &priv->codec_conf, - dai_link->codec_of_node, + dai_link->codecs->of_node, "prefix"); } - ret = asoc_simple_card_of_parse_tdm(ep, dai_props); + ret = asoc_simple_card_of_parse_tdm(ep, &dai_props->dai); if (ret) return ret; @@ -331,10 +338,10 @@ static int asoc_graph_card_probe(struct platform_device *pdev) { struct graph_card_data *priv; struct snd_soc_dai_link *dai_link; - struct asoc_simple_dai *dai_props; + struct graph_dai_props *dai_props; struct device *dev = &pdev->dev; struct snd_soc_card *card; - int num, ret; + int num, ret, i; /* Allocate the private data and the DAI link array */ priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); @@ -350,6 +357,18 @@ static int asoc_graph_card_probe(struct platform_device *pdev) if (!dai_props || !dai_link) return -ENOMEM; + /* + * Use snd_soc_dai_link_component instead of legacy style + * It is codec only. but cpu/platform will be supported in the future. + * see + * soc-core.c :: snd_soc_init_multicodec() + */ + for (i = 0; i < num; i++) { + dai_link[i].codecs = &dai_props[i].codecs; + dai_link[i].num_codecs = 1; + dai_link[i].platform = &dai_props[i].platform; + } + priv->dai_props = dai_props; priv->dai_link = dai_link; diff --git a/sound/soc/generic/simple-card-utils.c b/sound/soc/generic/simple-card-utils.c index d3f3f0fec74c..f34cc6cddfa2 100644 --- a/sound/soc/generic/simple-card-utils.c +++ b/sound/soc/generic/simple-card-utils.c @@ -173,11 +173,23 @@ int asoc_simple_card_parse_clk(struct device *dev, struct device_node *node, struct device_node *dai_of_node, struct asoc_simple_dai *simple_dai, - const char *name) + const char *dai_name, + struct snd_soc_dai_link_component *dlc) { struct clk *clk; u32 val; + /* + * Use snd_soc_dai_link_component instead of legacy style. + * It is only for codec, but cpu will be supported in the future. + * see + * soc-core.c :: snd_soc_init_multicodec() + */ + if (dlc) { + dai_of_node = dlc->of_node; + dai_name = dlc->dai_name; + } + /* * Parse dai->sysclk come from "clocks = <&xxx>" * (if system has common clock) @@ -200,7 +212,7 @@ int asoc_simple_card_parse_clk(struct device *dev, if (of_property_read_bool(node, "system-clock-direction-out")) simple_dai->clk_direction = SND_SOC_CLOCK_OUT; - dev_dbg(dev, "%s : sysclk = %d, direction %d\n", name, + dev_dbg(dev, "%s : sysclk = %d, direction %d\n", dai_name, simple_dai->sysclk, simple_dai->clk_direction); return 0; @@ -208,6 +220,7 @@ int asoc_simple_card_parse_clk(struct device *dev, EXPORT_SYMBOL_GPL(asoc_simple_card_parse_clk); int asoc_simple_card_parse_dai(struct device_node *node, + struct snd_soc_dai_link_component *dlc, struct device_node **dai_of_node, const char **dai_name, const char *list_name, @@ -220,6 +233,17 @@ int asoc_simple_card_parse_dai(struct device_node *node, if (!node) return 0; + /* + * Use snd_soc_dai_link_component instead of legacy style. + * It is only for codec, but cpu will be supported in the future. + * see + * soc-core.c :: snd_soc_init_multicodec() + */ + if (dlc) { + dai_name = &dlc->dai_name; + dai_of_node = &dlc->of_node; + } + /* * Get node via "sound-dai = <&phandle port>" * it will be used as xxx_of_node on soc_bind_dai_link() @@ -278,6 +302,7 @@ static int asoc_simple_card_get_dai_id(struct device_node *ep) } int asoc_simple_card_parse_graph_dai(struct device_node *ep, + struct snd_soc_dai_link_component *dlc, struct device_node **dai_of_node, const char **dai_name) { @@ -285,6 +310,17 @@ int asoc_simple_card_parse_graph_dai(struct device_node *ep, struct of_phandle_args args; int ret; + /* + * Use snd_soc_dai_link_component instead of legacy style. + * It is only for codec, but cpu will be supported in the future. + * see + * soc-core.c :: snd_soc_init_multicodec() + */ + if (dlc) { + dai_name = &dlc->dai_name; + dai_of_node = &dlc->of_node; + } + if (!ep) return 0; if (!dai_name) @@ -340,10 +376,11 @@ EXPORT_SYMBOL_GPL(asoc_simple_card_init_dai); int asoc_simple_card_canonicalize_dailink(struct snd_soc_dai_link *dai_link) { /* Assumes platform == cpu */ - if (!dai_link->platform_of_node) - dai_link->platform_of_node = dai_link->cpu_of_node; + if (!dai_link->platform->of_node) + dai_link->platform->of_node = dai_link->cpu_of_node; return 0; + } EXPORT_SYMBOL_GPL(asoc_simple_card_canonicalize_dailink); @@ -367,13 +404,11 @@ EXPORT_SYMBOL_GPL(asoc_simple_card_canonicalize_cpu); int asoc_simple_card_clean_reference(struct snd_soc_card *card) { struct snd_soc_dai_link *dai_link; - int num_links; + int i; - for (num_links = 0, dai_link = card->dai_link; - num_links < card->num_links; - num_links++, dai_link++) { + for_each_card_prelinks(card, i, dai_link) { of_node_put(dai_link->cpu_of_node); - of_node_put(dai_link->codec_of_node); + of_node_put(dai_link->codecs->of_node); } return 0; } diff --git a/sound/soc/generic/simple-card.c b/sound/soc/generic/simple-card.c index 64bf3560c1d1..5a3f59aa4ba5 100644 --- a/sound/soc/generic/simple-card.c +++ b/sound/soc/generic/simple-card.c @@ -20,6 +20,8 @@ struct simple_card_data { struct simple_dai_props { struct asoc_simple_dai cpu_dai; struct asoc_simple_dai codec_dai; + struct snd_soc_dai_link_component codecs; /* single codec */ + struct snd_soc_dai_link_component platform; unsigned int mclk_fs; } *dai_props; unsigned int mclk_fs; @@ -234,7 +236,7 @@ static int asoc_simple_card_dai_link_of(struct device_node *node, ret = asoc_simple_card_set_dailink_name(dev, dai_link, "%s-%s", dai_link->cpu_dai_name, - dai_link->codec_dai_name); + dai_link->codecs->dai_name); if (ret < 0) goto dai_link_of_err; @@ -363,7 +365,7 @@ static int asoc_simple_card_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct device_node *np = dev->of_node; struct snd_soc_card *card; - int num, ret; + int num, ret, i; /* Get the number of DAI links */ if (np && of_get_child_by_name(np, PREFIX "dai-link")) @@ -381,6 +383,18 @@ static int asoc_simple_card_probe(struct platform_device *pdev) if (!dai_props || !dai_link) return -ENOMEM; + /* + * Use snd_soc_dai_link_component instead of legacy style + * It is codec only. but cpu/platform will be supported in the future. + * see + * soc-core.c :: snd_soc_init_multicodec() + */ + for (i = 0; i < num; i++) { + dai_link[i].codecs = &dai_props[i].codecs; + dai_link[i].num_codecs = 1; + dai_link[i].platform = &dai_props[i].platform; + } + priv->dai_props = dai_props; priv->dai_link = dai_link; @@ -403,6 +417,8 @@ static int asoc_simple_card_probe(struct platform_device *pdev) } else { struct asoc_simple_card_info *cinfo; + struct snd_soc_dai_link_component *codecs; + struct snd_soc_dai_link_component *platform; cinfo = dev->platform_data; if (!cinfo) { @@ -419,13 +435,17 @@ static int asoc_simple_card_probe(struct platform_device *pdev) return -EINVAL; } + codecs = dai_link->codecs; + codecs->name = cinfo->codec; + codecs->dai_name = cinfo->codec_dai.name; + + platform = dai_link->platform; + platform->name = cinfo->platform; + card->name = (cinfo->card) ? cinfo->card : cinfo->name; dai_link->name = cinfo->name; dai_link->stream_name = cinfo->name; - dai_link->platform_name = cinfo->platform; - dai_link->codec_name = cinfo->codec; dai_link->cpu_dai_name = cinfo->cpu_dai.name; - dai_link->codec_dai_name = cinfo->codec_dai.name; dai_link->dai_fmt = cinfo->daifmt; dai_link->init = asoc_simple_card_dai_init; memcpy(&priv->dai_props->cpu_dai, &cinfo->cpu_dai, diff --git a/sound/soc/generic/simple-scu-card.c b/sound/soc/generic/simple-scu-card.c index 16a83bc51e0e..85b46f0eae0f 100644 --- a/sound/soc/generic/simple-scu-card.c +++ b/sound/soc/generic/simple-scu-card.c @@ -22,7 +22,11 @@ struct simple_card_data { struct snd_soc_card snd_card; struct snd_soc_codec_conf codec_conf; - struct asoc_simple_dai *dai_props; + struct simple_dai_props { + struct asoc_simple_dai dai; + struct snd_soc_dai_link_component codecs; + struct snd_soc_dai_link_component platform; + } *dai_props; struct snd_soc_dai_link *dai_link; struct asoc_simple_card_data adata; }; @@ -40,20 +44,20 @@ static int asoc_simple_card_startup(struct snd_pcm_substream *substream) { struct snd_soc_pcm_runtime *rtd = substream->private_data; struct simple_card_data *priv = snd_soc_card_get_drvdata(rtd->card); - struct asoc_simple_dai *dai_props = + struct simple_dai_props *dai_props = simple_priv_to_props(priv, rtd->num); - return asoc_simple_card_clk_enable(dai_props); + return asoc_simple_card_clk_enable(&dai_props->dai); } static void asoc_simple_card_shutdown(struct snd_pcm_substream *substream) { struct snd_soc_pcm_runtime *rtd = substream->private_data; struct simple_card_data *priv = snd_soc_card_get_drvdata(rtd->card); - struct asoc_simple_dai *dai_props = + struct simple_dai_props *dai_props = simple_priv_to_props(priv, rtd->num); - asoc_simple_card_clk_disable(dai_props); + asoc_simple_card_clk_disable(&dai_props->dai); } static const struct snd_soc_ops asoc_simple_card_ops = { @@ -66,7 +70,7 @@ static int asoc_simple_card_dai_init(struct snd_soc_pcm_runtime *rtd) struct simple_card_data *priv = snd_soc_card_get_drvdata(rtd->card); struct snd_soc_dai *dai; struct snd_soc_dai_link *dai_link; - struct asoc_simple_dai *dai_props; + struct simple_dai_props *dai_props; int num = rtd->num; dai_link = simple_priv_to_link(priv, num); @@ -75,7 +79,7 @@ static int asoc_simple_card_dai_init(struct snd_soc_pcm_runtime *rtd) rtd->cpu_dai : rtd->codec_dai; - return asoc_simple_card_init_dai(dai, dai_props); + return asoc_simple_card_init_dai(dai, &dai_props->dai); } static int asoc_simple_card_be_hw_params_fixup(struct snd_soc_pcm_runtime *rtd, @@ -95,17 +99,19 @@ static int asoc_simple_card_dai_link_of(struct device_node *np, { struct device *dev = simple_priv_to_dev(priv); struct snd_soc_dai_link *dai_link = simple_priv_to_link(priv, idx); - struct asoc_simple_dai *dai_props = simple_priv_to_props(priv, idx); + struct simple_dai_props *dai_props = simple_priv_to_props(priv, idx); struct snd_soc_card *card = simple_priv_to_card(priv); int ret; if (is_fe) { int is_single_links = 0; + struct snd_soc_dai_link_component *codecs; /* BE is dummy */ - dai_link->codec_of_node = NULL; - dai_link->codec_dai_name = "snd-soc-dummy-dai"; - dai_link->codec_name = "snd-soc-dummy"; + codecs = dai_link->codecs; + codecs->of_node = NULL; + codecs->dai_name = "snd-soc-dummy-dai"; + codecs->name = "snd-soc-dummy"; /* FE settings */ dai_link->dynamic = 1; @@ -116,7 +122,7 @@ static int asoc_simple_card_dai_link_of(struct device_node *np, if (ret) return ret; - ret = asoc_simple_card_parse_clk_cpu(dev, np, dai_link, dai_props); + ret = asoc_simple_card_parse_clk_cpu(dev, np, dai_link, &dai_props->dai); if (ret < 0) return ret; @@ -141,23 +147,23 @@ static int asoc_simple_card_dai_link_of(struct device_node *np, if (ret < 0) return ret; - ret = asoc_simple_card_parse_clk_codec(dev, np, dai_link, dai_props); + ret = asoc_simple_card_parse_clk_codec(dev, np, dai_link, &dai_props->dai); if (ret < 0) return ret; ret = asoc_simple_card_set_dailink_name(dev, dai_link, "be.%s", - dai_link->codec_dai_name); + dai_link->codecs->dai_name); if (ret < 0) return ret; snd_soc_of_parse_audio_prefix(card, &priv->codec_conf, - dai_link->codec_of_node, + dai_link->codecs->of_node, PREFIX "prefix"); } - ret = asoc_simple_card_of_parse_tdm(np, dai_props); + ret = asoc_simple_card_of_parse_tdm(np, &dai_props->dai); if (ret) return ret; @@ -230,11 +236,11 @@ static int asoc_simple_card_probe(struct platform_device *pdev) { struct simple_card_data *priv; struct snd_soc_dai_link *dai_link; - struct asoc_simple_dai *dai_props; + struct simple_dai_props *dai_props; struct snd_soc_card *card; struct device *dev = &pdev->dev; struct device_node *np = dev->of_node; - int num, ret; + int num, ret, i; /* Allocate the private data */ priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); @@ -248,6 +254,18 @@ static int asoc_simple_card_probe(struct platform_device *pdev) if (!dai_props || !dai_link) return -ENOMEM; + /* + * Use snd_soc_dai_link_component instead of legacy style + * It is codec only. but cpu/platform will be supported in the future. + * see + * soc-core.c :: snd_soc_init_multicodec() + */ + for (i = 0; i < num; i++) { + dai_link[i].codecs = &dai_props[i].codecs; + dai_link[i].num_codecs = 1; + dai_link[i].platform = &dai_props[i].platform; + } + priv->dai_props = dai_props; priv->dai_link = dai_link; diff --git a/sound/soc/hisilicon/hi6210-i2s.c b/sound/soc/hisilicon/hi6210-i2s.c index 53344a3b7a60..a69e5b11b3da 100644 --- a/sound/soc/hisilicon/hi6210-i2s.c +++ b/sound/soc/hisilicon/hi6210-i2s.c @@ -269,13 +269,13 @@ static int hi6210_i2s_hw_params(struct snd_pcm_substream *substream, switch (params_format(params)) { case SNDRV_PCM_FORMAT_U16_LE: signed_data = HII2S_I2S_CFG__S2_CODEC_DATA_FORMAT; - /* fallthru */ + /* fall through */ case SNDRV_PCM_FORMAT_S16_LE: bits = HII2S_BITS_16; break; case SNDRV_PCM_FORMAT_U24_LE: signed_data = HII2S_I2S_CFG__S2_CODEC_DATA_FORMAT; - /* fallthru */ + /* fall through */ case SNDRV_PCM_FORMAT_S24_LE: bits = HII2S_BITS_24; break; diff --git a/sound/soc/intel/atom/sst-mfld-platform-pcm.c b/sound/soc/intel/atom/sst-mfld-platform-pcm.c index 6c36da560877..afc559866095 100644 --- a/sound/soc/intel/atom/sst-mfld-platform-pcm.c +++ b/sound/soc/intel/atom/sst-mfld-platform-pcm.c @@ -765,7 +765,7 @@ static int sst_soc_prepare(struct device *dev) snd_soc_poweroff(drv->soc_card->dev); /* set the SSPs to idle */ - list_for_each_entry(rtd, &drv->soc_card->rtd_list, list) { + for_each_card_rtds(drv->soc_card, rtd) { struct snd_soc_dai *dai = rtd->cpu_dai; if (dai->active) { @@ -786,7 +786,7 @@ static void sst_soc_complete(struct device *dev) return; /* restart SSPs */ - list_for_each_entry(rtd, &drv->soc_card->rtd_list, list) { + for_each_card_rtds(drv->soc_card, rtd) { struct snd_soc_dai *dai = rtd->cpu_dai; if (dai->active) { diff --git a/sound/soc/intel/boards/Kconfig b/sound/soc/intel/boards/Kconfig index cccda87f4b34..73ca1350aa31 100644 --- a/sound/soc/intel/boards/Kconfig +++ b/sound/soc/intel/boards/Kconfig @@ -279,6 +279,28 @@ config SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH This adds support for ASoC Onboard Codec I2S machine driver. This will create an alsa sound card for DA7219 + MAX98357A I2S audio codec. Say Y if you have such a device. + +config SND_SOC_INTEL_KBL_DA7219_MAX98927_MACH + tristate "KBL with DA7219 and MAX98927 in I2S Mode" + depends on MFD_INTEL_LPSS && I2C && ACPI + select SND_SOC_DA7219 + select SND_SOC_MAX98927 + select SND_SOC_DMIC + select SND_SOC_HDAC_HDMI + help + This adds support for ASoC Onboard Codec I2S machine driver. This will + create an alsa sound card for DA7219 + MAX98927 I2S audio codec. + Say Y if you have such a device. + If unsure select "N". + +config SND_SOC_INTEL_SKL_HDA_DSP_GENERIC_MACH + tristate "SKL/KBL/BXT/APL with HDA Codecs" + select SND_SOC_HDAC_HDMI + select SND_SOC_HDAC_HDA + help + This adds support for ASoC machine driver for Intel platforms + SKL/KBL/BXT/APL with iDisp, HDA audio codecs. + Say Y or m if you have such a device. This is a recommended option. If unsure select "N". config SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH diff --git a/sound/soc/intel/boards/Makefile b/sound/soc/intel/boards/Makefile index 87ef8b4058e5..5381e27df9cc 100644 --- a/sound/soc/intel/boards/Makefile +++ b/sound/soc/intel/boards/Makefile @@ -17,9 +17,11 @@ snd-soc-sst-byt-cht-da7213-objs := bytcht_da7213.o snd-soc-sst-byt-cht-es8316-objs := bytcht_es8316.o snd-soc-sst-byt-cht-nocodec-objs := bytcht_nocodec.o snd-soc-kbl_da7219_max98357a-objs := kbl_da7219_max98357a.o +snd-soc-kbl_da7219_max98927-objs := kbl_da7219_max98927.o snd-soc-kbl_rt5663_max98927-objs := kbl_rt5663_max98927.o snd-soc-kbl_rt5663_rt5514_max98927-objs := kbl_rt5663_rt5514_max98927.o snd-soc-skl_rt286-objs := skl_rt286.o +snd-soc-skl_hda_dsp-objs := skl_hda_dsp_generic.o skl_hda_dsp_common.o snd-skl_nau88l25_max98357a-objs := skl_nau88l25_max98357a.o snd-soc-skl_nau88l25_ssm4567-objs := skl_nau88l25_ssm4567.o @@ -41,8 +43,10 @@ obj-$(CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH) += snd-soc-sst-byt-cht-da7213.o obj-$(CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH) += snd-soc-sst-byt-cht-es8316.o obj-$(CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH) += snd-soc-sst-byt-cht-nocodec.o obj-$(CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH) += snd-soc-kbl_da7219_max98357a.o +obj-$(CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98927_MACH) += snd-soc-kbl_da7219_max98927.o obj-$(CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH) += snd-soc-kbl_rt5663_max98927.o obj-$(CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH) += snd-soc-kbl_rt5663_rt5514_max98927.o obj-$(CONFIG_SND_SOC_INTEL_SKL_RT286_MACH) += snd-soc-skl_rt286.o obj-$(CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH) += snd-skl_nau88l25_max98357a.o obj-$(CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH) += snd-soc-skl_nau88l25_ssm4567.o +obj-$(CONFIG_SND_SOC_INTEL_SKL_HDA_DSP_GENERIC_MACH) += snd-soc-skl_hda_dsp.o diff --git a/sound/soc/intel/boards/broadwell.c b/sound/soc/intel/boards/broadwell.c index 7b0ee67b4fc8..68e6543e6cb0 100644 --- a/sound/soc/intel/boards/broadwell.c +++ b/sound/soc/intel/boards/broadwell.c @@ -223,7 +223,7 @@ static struct snd_soc_dai_link broadwell_rt286_dais[] = { static int broadwell_suspend(struct snd_soc_card *card){ struct snd_soc_component *component; - list_for_each_entry(component, &card->component_dev_list, card_list) { + for_each_card_components(card, component) { if (!strcmp(component->name, "i2c-INT343A:00")) { dev_dbg(component->dev, "disabling jack detect before going to suspend.\n"); @@ -237,7 +237,7 @@ static int broadwell_suspend(struct snd_soc_card *card){ static int broadwell_resume(struct snd_soc_card *card){ struct snd_soc_component *component; - list_for_each_entry(component, &card->component_dev_list, card_list) { + for_each_card_components(card, component) { if (!strcmp(component->name, "i2c-INT343A:00")) { dev_dbg(component->dev, "enabling jack detect for resume.\n"); diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index b6dc524830b2..8587bd3d1cc1 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -1048,7 +1048,7 @@ static int byt_rt5640_suspend(struct snd_soc_card *card) if (!BYT_RT5640_JDSRC(byt_rt5640_quirk)) return 0; - list_for_each_entry(component, &card->component_dev_list, card_list) { + for_each_card_components(card, component) { if (!strcmp(component->name, byt_rt5640_codec_name)) { dev_dbg(component->dev, "disabling jack detect before suspend\n"); snd_soc_component_set_jack(component, NULL, NULL); @@ -1067,7 +1067,7 @@ static int byt_rt5640_resume(struct snd_soc_card *card) if (!BYT_RT5640_JDSRC(byt_rt5640_quirk)) return 0; - list_for_each_entry(component, &card->component_dev_list, card_list) { + for_each_card_components(card, component) { if (!strcmp(component->name, byt_rt5640_codec_name)) { dev_dbg(component->dev, "re-enabling jack detect after resume\n"); snd_soc_component_set_jack(component, &priv->jack, NULL); diff --git a/sound/soc/intel/boards/bytcr_rt5651.c b/sound/soc/intel/boards/bytcr_rt5651.c index f8a68bdb3885..c44298130720 100644 --- a/sound/soc/intel/boards/bytcr_rt5651.c +++ b/sound/soc/intel/boards/bytcr_rt5651.c @@ -742,7 +742,7 @@ static int byt_rt5651_suspend(struct snd_soc_card *card) if (!BYT_RT5651_JDSRC(byt_rt5651_quirk)) return 0; - list_for_each_entry(component, &card->component_dev_list, card_list) { + for_each_card_components(card, component) { if (!strcmp(component->name, byt_rt5651_codec_name)) { dev_dbg(component->dev, "disabling jack detect before suspend\n"); snd_soc_component_set_jack(component, NULL, NULL); @@ -761,7 +761,7 @@ static int byt_rt5651_resume(struct snd_soc_card *card) if (!BYT_RT5651_JDSRC(byt_rt5651_quirk)) return 0; - list_for_each_entry(component, &card->component_dev_list, card_list) { + for_each_card_components(card, component) { if (!strcmp(component->name, byt_rt5651_codec_name)) { dev_dbg(component->dev, "re-enabling jack detect after resume\n"); snd_soc_component_set_jack(component, &priv->jack, NULL); @@ -787,7 +787,7 @@ static struct snd_soc_card byt_rt5651_card = { }; static const struct x86_cpu_id baytrail_cpu_ids[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, /* Valleyview */ + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT }, /* Valleyview */ {} }; diff --git a/sound/soc/intel/boards/cht_bsw_rt5672.c b/sound/soc/intel/boards/cht_bsw_rt5672.c index e5aa13058dd7..51f0d45d6f8f 100644 --- a/sound/soc/intel/boards/cht_bsw_rt5672.c +++ b/sound/soc/intel/boards/cht_bsw_rt5672.c @@ -16,6 +16,7 @@ * General Public License for more details. */ +#include #include #include #include @@ -212,6 +213,10 @@ static int cht_codec_init(struct snd_soc_pcm_runtime *runtime) if (ret) return ret; + snd_jack_set_key(ctx->headset.jack, SND_JACK_BTN_0, KEY_PLAYPAUSE); + snd_jack_set_key(ctx->headset.jack, SND_JACK_BTN_1, KEY_VOLUMEUP); + snd_jack_set_key(ctx->headset.jack, SND_JACK_BTN_2, KEY_VOLUMEDOWN); + rt5670_set_jack_detect(component, &ctx->headset); if (ctx->mclk) { /* @@ -342,7 +347,7 @@ static int cht_suspend_pre(struct snd_soc_card *card) struct snd_soc_component *component; struct cht_mc_private *ctx = snd_soc_card_get_drvdata(card); - list_for_each_entry(component, &card->component_dev_list, card_list) { + for_each_card_components(card, component) { if (!strncmp(component->name, ctx->codec_name, sizeof(ctx->codec_name))) { @@ -359,7 +364,7 @@ static int cht_resume_post(struct snd_soc_card *card) struct snd_soc_component *component; struct cht_mc_private *ctx = snd_soc_card_get_drvdata(card); - list_for_each_entry(component, &card->component_dev_list, card_list) { + for_each_card_components(card, component) { if (!strncmp(component->name, ctx->codec_name, sizeof(ctx->codec_name))) { diff --git a/sound/soc/intel/boards/kbl_da7219_max98927.c b/sound/soc/intel/boards/kbl_da7219_max98927.c new file mode 100644 index 000000000000..3fa1c3ca6d37 --- /dev/null +++ b/sound/soc/intel/boards/kbl_da7219_max98927.c @@ -0,0 +1,983 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2018 Intel Corporation. + +/* + * Intel Kabylake I2S Machine Driver with MAX98927 & DA7219 Codecs + * + * Modified from: + * Intel Kabylake I2S Machine driver supporting MAX98927 and + * RT5663 codecs + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../codecs/da7219.h" +#include "../../codecs/hdac_hdmi.h" +#include "../skylake/skl.h" +#include "../../codecs/da7219-aad.h" + +#define KBL_DIALOG_CODEC_DAI "da7219-hifi" +#define MAX98927_CODEC_DAI "max98927-aif1" +#define MAXIM_DEV0_NAME "i2c-MX98927:00" +#define MAXIM_DEV1_NAME "i2c-MX98927:01" +#define DUAL_CHANNEL 2 +#define QUAD_CHANNEL 4 +#define NAME_SIZE 32 + +static struct snd_soc_card *kabylake_audio_card; +static struct snd_soc_jack kabylake_hdmi[3]; + +struct kbl_hdmi_pcm { + struct list_head head; + struct snd_soc_dai *codec_dai; + int device; +}; + +struct kbl_codec_private { + struct snd_soc_jack kabylake_headset; + struct list_head hdmi_pcm_list; +}; + +enum { + KBL_DPCM_AUDIO_PB = 0, + KBL_DPCM_AUDIO_CP, + KBL_DPCM_AUDIO_ECHO_REF_CP, + KBL_DPCM_AUDIO_REF_CP, + KBL_DPCM_AUDIO_DMIC_CP, + KBL_DPCM_AUDIO_HDMI1_PB, + KBL_DPCM_AUDIO_HDMI2_PB, + KBL_DPCM_AUDIO_HDMI3_PB, + KBL_DPCM_AUDIO_HS_PB, +}; + +static int platform_clock_control(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *k, int event) +{ + struct snd_soc_dapm_context *dapm = w->dapm; + struct snd_soc_card *card = dapm->card; + struct snd_soc_dai *codec_dai; + int ret = 0; + + codec_dai = snd_soc_card_get_codec_dai(card, KBL_DIALOG_CODEC_DAI); + if (!codec_dai) { + dev_err(card->dev, "Codec dai not found; Unable to set/unset codec pll\n"); + return -EIO; + } + + /* Configure sysclk for codec */ + ret = snd_soc_dai_set_sysclk(codec_dai, DA7219_CLKSRC_MCLK, 24576000, + SND_SOC_CLOCK_IN); + if (ret) { + dev_err(card->dev, "can't set codec sysclk configuration\n"); + return ret; + } + + if (SND_SOC_DAPM_EVENT_OFF(event)) { + ret = snd_soc_dai_set_pll(codec_dai, 0, + DA7219_SYSCLK_MCLK, 0, 0); + if (ret) + dev_err(card->dev, "failed to stop PLL: %d\n", ret); + } else if (SND_SOC_DAPM_EVENT_ON(event)) { + ret = snd_soc_dai_set_pll(codec_dai, 0, DA7219_SYSCLK_PLL_SRM, + 0, DA7219_PLL_FREQ_OUT_98304); + if (ret) + dev_err(card->dev, "failed to start PLL: %d\n", ret); + } + + return ret; +} + +static const struct snd_kcontrol_new kabylake_controls[] = { + SOC_DAPM_PIN_SWITCH("Headphone Jack"), + SOC_DAPM_PIN_SWITCH("Headset Mic"), + SOC_DAPM_PIN_SWITCH("Left Spk"), + SOC_DAPM_PIN_SWITCH("Right Spk"), +}; + +static const struct snd_soc_dapm_widget kabylake_widgets[] = { + SND_SOC_DAPM_HP("Headphone Jack", NULL), + SND_SOC_DAPM_MIC("Headset Mic", NULL), + SND_SOC_DAPM_SPK("Left Spk", NULL), + SND_SOC_DAPM_SPK("Right Spk", NULL), + SND_SOC_DAPM_MIC("SoC DMIC", NULL), + SND_SOC_DAPM_SPK("DP", NULL), + SND_SOC_DAPM_SPK("HDMI", NULL), + SND_SOC_DAPM_SUPPLY("Platform Clock", SND_SOC_NOPM, 0, 0, + platform_clock_control, SND_SOC_DAPM_PRE_PMU | + SND_SOC_DAPM_POST_PMD), +}; + +static const struct snd_soc_dapm_route kabylake_map[] = { + /* speaker */ + { "Left Spk", NULL, "Left BE_OUT" }, + { "Right Spk", NULL, "Right BE_OUT" }, + + /* other jacks */ + { "DMic", NULL, "SoC DMIC" }, + + { "HDMI", NULL, "hif5 Output" }, + { "DP", NULL, "hif6 Output" }, + + /* CODEC BE connections */ + { "Left HiFi Playback", NULL, "ssp0 Tx" }, + { "Right HiFi Playback", NULL, "ssp0 Tx" }, + { "ssp0 Tx", NULL, "spk_out" }, + + /* IV feedback path */ + { "codec0_fb_in", NULL, "ssp0 Rx"}, + { "ssp0 Rx", NULL, "Left HiFi Capture" }, + { "ssp0 Rx", NULL, "Right HiFi Capture" }, + + /* AEC capture path */ + { "echo_ref_out", NULL, "ssp0 Rx" }, + + /* DMIC */ + { "dmic01_hifi", NULL, "DMIC01 Rx" }, + { "DMIC01 Rx", NULL, "DMIC AIF" }, + + { "hifi1", NULL, "iDisp1 Tx" }, + { "iDisp1 Tx", NULL, "iDisp1_out" }, + { "hifi2", NULL, "iDisp2 Tx" }, + { "iDisp2 Tx", NULL, "iDisp2_out" }, + { "hifi3", NULL, "iDisp3 Tx"}, + { "iDisp3 Tx", NULL, "iDisp3_out"}, +}; + +static const struct snd_soc_dapm_route kabylake_ssp1_map[] = { + { "Headphone Jack", NULL, "HPL" }, + { "Headphone Jack", NULL, "HPR" }, + + /* other jacks */ + { "MIC", NULL, "Headset Mic" }, + + /* CODEC BE connections */ + { "Playback", NULL, "ssp1 Tx" }, + { "ssp1 Tx", NULL, "codec1_out" }, + + { "hs_in", NULL, "ssp1 Rx" }, + { "ssp1 Rx", NULL, "Capture" }, + + { "Headphone Jack", NULL, "Platform Clock" }, + { "Headset Mic", NULL, "Platform Clock" }, +}; + +static int kabylake_ssp0_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params) +{ + struct snd_soc_pcm_runtime *runtime = substream->private_data; + int ret = 0, j; + + for (j = 0; j < runtime->num_codecs; j++) { + struct snd_soc_dai *codec_dai = runtime->codec_dais[j]; + + if (!strcmp(codec_dai->component->name, MAXIM_DEV0_NAME)) { + ret = snd_soc_dai_set_tdm_slot(codec_dai, 0x30, 3, 8, 16); + if (ret < 0) { + dev_err(runtime->dev, "DEV0 TDM slot err:%d\n", ret); + return ret; + } + } + if (!strcmp(codec_dai->component->name, MAXIM_DEV1_NAME)) { + ret = snd_soc_dai_set_tdm_slot(codec_dai, 0xC0, 3, 8, 16); + if (ret < 0) { + dev_err(runtime->dev, "DEV1 TDM slot err:%d\n", ret); + return ret; + } + } + } + + return 0; +} + +static struct snd_soc_ops kabylake_ssp0_ops = { + .hw_params = kabylake_ssp0_hw_params, +}; + +static int kabylake_ssp_fixup(struct snd_soc_pcm_runtime *rtd, + struct snd_pcm_hw_params *params) +{ + struct snd_interval *rate = hw_param_interval(params, + SNDRV_PCM_HW_PARAM_RATE); + struct snd_interval *channels = hw_param_interval(params, + SNDRV_PCM_HW_PARAM_CHANNELS); + struct snd_mask *fmt = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT); + struct snd_soc_dpcm *dpcm = container_of( + params, struct snd_soc_dpcm, hw_params); + struct snd_soc_dai_link *fe_dai_link = dpcm->fe->dai_link; + struct snd_soc_dai_link *be_dai_link = dpcm->be->dai_link; + + /* + * The ADSP will convert the FE rate to 48k, stereo, 24 bit + */ + if (!strcmp(fe_dai_link->name, "Kbl Audio Port") || + !strcmp(fe_dai_link->name, "Kbl Audio Headset Playback") || + !strcmp(fe_dai_link->name, "Kbl Audio Capture Port")) { + rate->min = rate->max = 48000; + channels->min = channels->max = 2; + snd_mask_none(fmt); + snd_mask_set(fmt, SNDRV_PCM_FORMAT_S24_LE); + } + + /* + * The speaker on the SSP0 supports S16_LE and not S24_LE. + * thus changing the mask here + */ + if (!strcmp(be_dai_link->name, "SSP0-Codec")) + snd_mask_set(fmt, SNDRV_PCM_FORMAT_S16_LE); + + return 0; +} + +static int kabylake_da7219_codec_init(struct snd_soc_pcm_runtime *rtd) +{ + struct kbl_codec_private *ctx = snd_soc_card_get_drvdata(rtd->card); + struct snd_soc_component *component = rtd->codec_dai->component; + struct snd_soc_jack *jack; + struct snd_soc_card *card = rtd->card; + int ret; + + + ret = snd_soc_dapm_add_routes(&card->dapm, + kabylake_ssp1_map, + ARRAY_SIZE(kabylake_ssp1_map)); + + /* + * Headset buttons map to the google Reference headset. + * These can be configured by userspace. + */ + ret = snd_soc_card_jack_new(kabylake_audio_card, "Headset Jack", + SND_JACK_HEADSET | SND_JACK_BTN_0 | SND_JACK_BTN_1 | + SND_JACK_BTN_2 | SND_JACK_BTN_3 | SND_JACK_LINEOUT, + &ctx->kabylake_headset, NULL, 0); + if (ret) { + dev_err(rtd->dev, "Headset Jack creation failed: %d\n", ret); + return ret; + } + + jack = &ctx->kabylake_headset; + snd_jack_set_key(jack->jack, SND_JACK_BTN_0, KEY_PLAYPAUSE); + snd_jack_set_key(jack->jack, SND_JACK_BTN_1, KEY_VOICECOMMAND); + snd_jack_set_key(jack->jack, SND_JACK_BTN_2, KEY_VOLUMEUP); + snd_jack_set_key(jack->jack, SND_JACK_BTN_3, KEY_VOLUMEDOWN); + + da7219_aad_jack_det(component, &ctx->kabylake_headset); + + ret = snd_soc_dapm_ignore_suspend(&rtd->card->dapm, "SoC DMIC"); + if (ret) + dev_err(rtd->dev, "SoC DMIC - Ignore suspend failed %d\n", ret); + + return ret; +} + +static int kabylake_hdmi_init(struct snd_soc_pcm_runtime *rtd, int device) +{ + struct kbl_codec_private *ctx = snd_soc_card_get_drvdata(rtd->card); + struct snd_soc_dai *dai = rtd->codec_dai; + struct kbl_hdmi_pcm *pcm; + + pcm = devm_kzalloc(rtd->card->dev, sizeof(*pcm), GFP_KERNEL); + if (!pcm) + return -ENOMEM; + + pcm->device = device; + pcm->codec_dai = dai; + + list_add_tail(&pcm->head, &ctx->hdmi_pcm_list); + + return 0; +} + +static int kabylake_hdmi1_init(struct snd_soc_pcm_runtime *rtd) +{ + return kabylake_hdmi_init(rtd, KBL_DPCM_AUDIO_HDMI1_PB); +} + +static int kabylake_hdmi2_init(struct snd_soc_pcm_runtime *rtd) +{ + return kabylake_hdmi_init(rtd, KBL_DPCM_AUDIO_HDMI2_PB); +} + +static int kabylake_hdmi3_init(struct snd_soc_pcm_runtime *rtd) +{ + return kabylake_hdmi_init(rtd, KBL_DPCM_AUDIO_HDMI3_PB); +} + +static int kabylake_da7219_fe_init(struct snd_soc_pcm_runtime *rtd) +{ + struct snd_soc_dapm_context *dapm; + struct snd_soc_component *component = rtd->cpu_dai->component; + + dapm = snd_soc_component_get_dapm(component); + snd_soc_dapm_ignore_suspend(dapm, "Reference Capture"); + + return 0; +} + +static const unsigned int rates[] = { + 48000, +}; + +static const struct snd_pcm_hw_constraint_list constraints_rates = { + .count = ARRAY_SIZE(rates), + .list = rates, + .mask = 0, +}; + +static const unsigned int channels[] = { + DUAL_CHANNEL, +}; + +static const struct snd_pcm_hw_constraint_list constraints_channels = { + .count = ARRAY_SIZE(channels), + .list = channels, + .mask = 0, +}; + +static unsigned int channels_quad[] = { + QUAD_CHANNEL, +}; + +static struct snd_pcm_hw_constraint_list constraints_channels_quad = { + .count = ARRAY_SIZE(channels_quad), + .list = channels_quad, + .mask = 0, +}; + +static int kbl_fe_startup(struct snd_pcm_substream *substream) +{ + struct snd_pcm_runtime *runtime = substream->runtime; + + /* + * On this platform for PCM device we support, + * 48Khz + * stereo + * 16 bit audio + */ + + runtime->hw.channels_max = DUAL_CHANNEL; + snd_pcm_hw_constraint_list(runtime, 0, SNDRV_PCM_HW_PARAM_CHANNELS, + &constraints_channels); + + runtime->hw.formats = SNDRV_PCM_FMTBIT_S16_LE; + snd_pcm_hw_constraint_msbits(runtime, 0, 16, 16); + + snd_pcm_hw_constraint_list(runtime, 0, + SNDRV_PCM_HW_PARAM_RATE, &constraints_rates); + + return 0; +} + +static const struct snd_soc_ops kabylake_da7219_fe_ops = { + .startup = kbl_fe_startup, +}; + +static int kabylake_dmic_fixup(struct snd_soc_pcm_runtime *rtd, + struct snd_pcm_hw_params *params) +{ + struct snd_interval *channels = hw_param_interval(params, + SNDRV_PCM_HW_PARAM_CHANNELS); + + /* + * set BE channel constraint as user FE channels + */ + + if (params_channels(params) == 2) + channels->min = channels->max = 2; + else + channels->min = channels->max = 4; + + return 0; +} + +static int kabylake_dmic_startup(struct snd_pcm_substream *substream) +{ + struct snd_pcm_runtime *runtime = substream->runtime; + + runtime->hw.channels_min = runtime->hw.channels_max = QUAD_CHANNEL; + snd_pcm_hw_constraint_list(runtime, 0, SNDRV_PCM_HW_PARAM_CHANNELS, + &constraints_channels_quad); + + return snd_pcm_hw_constraint_list(substream->runtime, 0, + SNDRV_PCM_HW_PARAM_RATE, &constraints_rates); +} + +static struct snd_soc_ops kabylake_dmic_ops = { + .startup = kabylake_dmic_startup, +}; + +static const unsigned int rates_16000[] = { + 16000, +}; + +static const struct snd_pcm_hw_constraint_list constraints_16000 = { + .count = ARRAY_SIZE(rates_16000), + .list = rates_16000, +}; + +static const unsigned int ch_mono[] = { + 1, +}; +static const struct snd_pcm_hw_constraint_list constraints_refcap = { + .count = ARRAY_SIZE(ch_mono), + .list = ch_mono, +}; + +static int kabylake_refcap_startup(struct snd_pcm_substream *substream) +{ + substream->runtime->hw.channels_max = 1; + snd_pcm_hw_constraint_list(substream->runtime, 0, + SNDRV_PCM_HW_PARAM_CHANNELS, + &constraints_refcap); + + return snd_pcm_hw_constraint_list(substream->runtime, 0, + SNDRV_PCM_HW_PARAM_RATE, + &constraints_16000); +} + + +static struct snd_soc_ops skylaye_refcap_ops = { + .startup = kabylake_refcap_startup, +}; + +static struct snd_soc_codec_conf max98927_codec_conf[] = { + + { + .dev_name = MAXIM_DEV0_NAME, + .name_prefix = "Right", + }, + + { + .dev_name = MAXIM_DEV1_NAME, + .name_prefix = "Left", + }, +}; + +static struct snd_soc_dai_link_component ssp0_codec_components[] = { + { /* Left */ + .name = MAXIM_DEV0_NAME, + .dai_name = MAX98927_CODEC_DAI, + }, + + { /* For Right */ + .name = MAXIM_DEV1_NAME, + .dai_name = MAX98927_CODEC_DAI, + }, + +}; + +/* kabylake digital audio interface glue - connects codec <--> CPU */ +static struct snd_soc_dai_link kabylake_dais[] = { + /* Front End DAI links */ + [KBL_DPCM_AUDIO_PB] = { + .name = "Kbl Audio Port", + .stream_name = "Audio", + .cpu_dai_name = "System Pin", + .platform_name = "0000:00:1f.3", + .dynamic = 1, + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .nonatomic = 1, + .init = kabylake_da7219_fe_init, + .trigger = { + SND_SOC_DPCM_TRIGGER_POST, SND_SOC_DPCM_TRIGGER_POST}, + .dpcm_playback = 1, + .ops = &kabylake_da7219_fe_ops, + }, + [KBL_DPCM_AUDIO_CP] = { + .name = "Kbl Audio Capture Port", + .stream_name = "Audio Record", + .cpu_dai_name = "System Pin", + .platform_name = "0000:00:1f.3", + .dynamic = 1, + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .nonatomic = 1, + .trigger = { + SND_SOC_DPCM_TRIGGER_POST, SND_SOC_DPCM_TRIGGER_POST}, + .dpcm_capture = 1, + .ops = &kabylake_da7219_fe_ops, + }, + [KBL_DPCM_AUDIO_ECHO_REF_CP] = { + .name = "Kbl Audio Echo Reference cap", + .stream_name = "Echoreference Capture", + .cpu_dai_name = "Echoref Pin", + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .platform_name = "0000:00:1f.3", + .init = NULL, + .capture_only = 1, + .nonatomic = 1, + }, + [KBL_DPCM_AUDIO_REF_CP] = { + .name = "Kbl Audio Reference cap", + .stream_name = "Wake on Voice", + .cpu_dai_name = "Reference Pin", + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .platform_name = "0000:00:1f.3", + .init = NULL, + .dpcm_capture = 1, + .nonatomic = 1, + .dynamic = 1, + .ops = &skylaye_refcap_ops, + }, + [KBL_DPCM_AUDIO_DMIC_CP] = { + .name = "Kbl Audio DMIC cap", + .stream_name = "dmiccap", + .cpu_dai_name = "DMIC Pin", + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .platform_name = "0000:00:1f.3", + .init = NULL, + .dpcm_capture = 1, + .nonatomic = 1, + .dynamic = 1, + .ops = &kabylake_dmic_ops, + }, + [KBL_DPCM_AUDIO_HDMI1_PB] = { + .name = "Kbl HDMI Port1", + .stream_name = "Hdmi1", + .cpu_dai_name = "HDMI1 Pin", + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .platform_name = "0000:00:1f.3", + .dpcm_playback = 1, + .init = NULL, + .trigger = { + SND_SOC_DPCM_TRIGGER_POST, SND_SOC_DPCM_TRIGGER_POST}, + .nonatomic = 1, + .dynamic = 1, + }, + [KBL_DPCM_AUDIO_HDMI2_PB] = { + .name = "Kbl HDMI Port2", + .stream_name = "Hdmi2", + .cpu_dai_name = "HDMI2 Pin", + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .platform_name = "0000:00:1f.3", + .dpcm_playback = 1, + .init = NULL, + .trigger = { + SND_SOC_DPCM_TRIGGER_POST, SND_SOC_DPCM_TRIGGER_POST}, + .nonatomic = 1, + .dynamic = 1, + }, + [KBL_DPCM_AUDIO_HDMI3_PB] = { + .name = "Kbl HDMI Port3", + .stream_name = "Hdmi3", + .cpu_dai_name = "HDMI3 Pin", + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .platform_name = "0000:00:1f.3", + .trigger = { + SND_SOC_DPCM_TRIGGER_POST, SND_SOC_DPCM_TRIGGER_POST}, + .dpcm_playback = 1, + .init = NULL, + .nonatomic = 1, + .dynamic = 1, + }, + [KBL_DPCM_AUDIO_HS_PB] = { + .name = "Kbl Audio Headset Playback", + .stream_name = "Headset Audio", + .cpu_dai_name = "System Pin2", + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .platform_name = "0000:00:1f.3", + .dpcm_playback = 1, + .nonatomic = 1, + .dynamic = 1, + .init = kabylake_da7219_fe_init, + .trigger = { + SND_SOC_DPCM_TRIGGER_POST, SND_SOC_DPCM_TRIGGER_POST}, + .ops = &kabylake_da7219_fe_ops, + + }, + + /* Back End DAI links */ + { + /* SSP0 - Codec */ + .name = "SSP0-Codec", + .id = 0, + .cpu_dai_name = "SSP0 Pin", + .platform_name = "0000:00:1f.3", + .no_pcm = 1, + .codecs = ssp0_codec_components, + .num_codecs = ARRAY_SIZE(ssp0_codec_components), + .dai_fmt = SND_SOC_DAIFMT_DSP_B | + SND_SOC_DAIFMT_NB_NF | + SND_SOC_DAIFMT_CBS_CFS, + .dpcm_playback = 1, + .dpcm_capture = 1, + .ignore_pmdown_time = 1, + .be_hw_params_fixup = kabylake_ssp_fixup, + .ops = &kabylake_ssp0_ops, + }, + { + /* SSP1 - Codec */ + .name = "SSP1-Codec", + .id = 1, + .cpu_dai_name = "SSP1 Pin", + .platform_name = "0000:00:1f.3", + .no_pcm = 1, + .codec_name = "i2c-DLGS7219:00", + .codec_dai_name = KBL_DIALOG_CODEC_DAI, + .init = kabylake_da7219_codec_init, + .dai_fmt = SND_SOC_DAIFMT_I2S | SND_SOC_DAIFMT_NB_NF | + SND_SOC_DAIFMT_CBS_CFS, + .ignore_pmdown_time = 1, + .be_hw_params_fixup = kabylake_ssp_fixup, + .dpcm_playback = 1, + .dpcm_capture = 1, + }, + { + .name = "dmic01", + .id = 2, + .cpu_dai_name = "DMIC01 Pin", + .codec_name = "dmic-codec", + .codec_dai_name = "dmic-hifi", + .platform_name = "0000:00:1f.3", + .be_hw_params_fixup = kabylake_dmic_fixup, + .ignore_suspend = 1, + .dpcm_capture = 1, + .no_pcm = 1, + }, + { + .name = "iDisp1", + .id = 3, + .cpu_dai_name = "iDisp1 Pin", + .codec_name = "ehdaudio0D2", + .codec_dai_name = "intel-hdmi-hifi1", + .platform_name = "0000:00:1f.3", + .dpcm_playback = 1, + .init = kabylake_hdmi1_init, + .no_pcm = 1, + }, + { + .name = "iDisp2", + .id = 4, + .cpu_dai_name = "iDisp2 Pin", + .codec_name = "ehdaudio0D2", + .codec_dai_name = "intel-hdmi-hifi2", + .platform_name = "0000:00:1f.3", + .init = kabylake_hdmi2_init, + .dpcm_playback = 1, + .no_pcm = 1, + }, + { + .name = "iDisp3", + .id = 5, + .cpu_dai_name = "iDisp3 Pin", + .codec_name = "ehdaudio0D2", + .codec_dai_name = "intel-hdmi-hifi3", + .platform_name = "0000:00:1f.3", + .init = kabylake_hdmi3_init, + .dpcm_playback = 1, + .no_pcm = 1, + }, +}; + +/* kabylake digital audio interface glue - connects codec <--> CPU */ +static struct snd_soc_dai_link kabylake_max98927_dais[] = { + /* Front End DAI links */ + [KBL_DPCM_AUDIO_PB] = { + .name = "Kbl Audio Port", + .stream_name = "Audio", + .cpu_dai_name = "System Pin", + .platform_name = "0000:00:1f.3", + .dynamic = 1, + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .nonatomic = 1, + .init = kabylake_da7219_fe_init, + .trigger = { + SND_SOC_DPCM_TRIGGER_POST, SND_SOC_DPCM_TRIGGER_POST}, + .dpcm_playback = 1, + .ops = &kabylake_da7219_fe_ops, + }, + [KBL_DPCM_AUDIO_CP] = { + .name = "Kbl Audio Capture Port", + .stream_name = "Audio Record", + .cpu_dai_name = "System Pin", + .platform_name = "0000:00:1f.3", + .dynamic = 1, + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .nonatomic = 1, + .trigger = { + SND_SOC_DPCM_TRIGGER_POST, SND_SOC_DPCM_TRIGGER_POST}, + .dpcm_capture = 1, + .ops = &kabylake_da7219_fe_ops, + }, + [KBL_DPCM_AUDIO_ECHO_REF_CP] = { + .name = "Kbl Audio Echo Reference cap", + .stream_name = "Echoreference Capture", + .cpu_dai_name = "Echoref Pin", + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .platform_name = "0000:00:1f.3", + .init = NULL, + .capture_only = 1, + .nonatomic = 1, + }, + [KBL_DPCM_AUDIO_REF_CP] = { + .name = "Kbl Audio Reference cap", + .stream_name = "Wake on Voice", + .cpu_dai_name = "Reference Pin", + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .platform_name = "0000:00:1f.3", + .init = NULL, + .dpcm_capture = 1, + .nonatomic = 1, + .dynamic = 1, + .ops = &skylaye_refcap_ops, + }, + [KBL_DPCM_AUDIO_DMIC_CP] = { + .name = "Kbl Audio DMIC cap", + .stream_name = "dmiccap", + .cpu_dai_name = "DMIC Pin", + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .platform_name = "0000:00:1f.3", + .init = NULL, + .dpcm_capture = 1, + .nonatomic = 1, + .dynamic = 1, + .ops = &kabylake_dmic_ops, + }, + [KBL_DPCM_AUDIO_HDMI1_PB] = { + .name = "Kbl HDMI Port1", + .stream_name = "Hdmi1", + .cpu_dai_name = "HDMI1 Pin", + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .platform_name = "0000:00:1f.3", + .dpcm_playback = 1, + .init = NULL, + .trigger = { + SND_SOC_DPCM_TRIGGER_POST, SND_SOC_DPCM_TRIGGER_POST}, + .nonatomic = 1, + .dynamic = 1, + }, + [KBL_DPCM_AUDIO_HDMI2_PB] = { + .name = "Kbl HDMI Port2", + .stream_name = "Hdmi2", + .cpu_dai_name = "HDMI2 Pin", + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .platform_name = "0000:00:1f.3", + .dpcm_playback = 1, + .init = NULL, + .trigger = { + SND_SOC_DPCM_TRIGGER_POST, SND_SOC_DPCM_TRIGGER_POST}, + .nonatomic = 1, + .dynamic = 1, + }, + [KBL_DPCM_AUDIO_HDMI3_PB] = { + .name = "Kbl HDMI Port3", + .stream_name = "Hdmi3", + .cpu_dai_name = "HDMI3 Pin", + .codec_name = "snd-soc-dummy", + .codec_dai_name = "snd-soc-dummy-dai", + .platform_name = "0000:00:1f.3", + .trigger = { + SND_SOC_DPCM_TRIGGER_POST, SND_SOC_DPCM_TRIGGER_POST}, + .dpcm_playback = 1, + .init = NULL, + .nonatomic = 1, + .dynamic = 1, + }, + + /* Back End DAI links */ + { + /* SSP0 - Codec */ + .name = "SSP0-Codec", + .id = 0, + .cpu_dai_name = "SSP0 Pin", + .platform_name = "0000:00:1f.3", + .no_pcm = 1, + .codecs = ssp0_codec_components, + .num_codecs = ARRAY_SIZE(ssp0_codec_components), + .dai_fmt = SND_SOC_DAIFMT_DSP_B | + SND_SOC_DAIFMT_NB_NF | + SND_SOC_DAIFMT_CBS_CFS, + .dpcm_playback = 1, + .dpcm_capture = 1, + .ignore_pmdown_time = 1, + .be_hw_params_fixup = kabylake_ssp_fixup, + .ops = &kabylake_ssp0_ops, + }, + { + .name = "dmic01", + .id = 1, + .cpu_dai_name = "DMIC01 Pin", + .codec_name = "dmic-codec", + .codec_dai_name = "dmic-hifi", + .platform_name = "0000:00:1f.3", + .be_hw_params_fixup = kabylake_dmic_fixup, + .ignore_suspend = 1, + .dpcm_capture = 1, + .no_pcm = 1, + }, + { + .name = "iDisp1", + .id = 2, + .cpu_dai_name = "iDisp1 Pin", + .codec_name = "ehdaudio0D2", + .codec_dai_name = "intel-hdmi-hifi1", + .platform_name = "0000:00:1f.3", + .dpcm_playback = 1, + .init = kabylake_hdmi1_init, + .no_pcm = 1, + }, + { + .name = "iDisp2", + .id = 3, + .cpu_dai_name = "iDisp2 Pin", + .codec_name = "ehdaudio0D2", + .codec_dai_name = "intel-hdmi-hifi2", + .platform_name = "0000:00:1f.3", + .init = kabylake_hdmi2_init, + .dpcm_playback = 1, + .no_pcm = 1, + }, + { + .name = "iDisp3", + .id = 4, + .cpu_dai_name = "iDisp3 Pin", + .codec_name = "ehdaudio0D2", + .codec_dai_name = "intel-hdmi-hifi3", + .platform_name = "0000:00:1f.3", + .init = kabylake_hdmi3_init, + .dpcm_playback = 1, + .no_pcm = 1, + }, +}; + +static int kabylake_card_late_probe(struct snd_soc_card *card) +{ + struct kbl_codec_private *ctx = snd_soc_card_get_drvdata(card); + struct kbl_hdmi_pcm *pcm; + struct snd_soc_component *component = NULL; + int err, i = 0; + char jack_name[NAME_SIZE]; + + list_for_each_entry(pcm, &ctx->hdmi_pcm_list, head) { + component = pcm->codec_dai->component; + snprintf(jack_name, sizeof(jack_name), + "HDMI/DP, pcm=%d Jack", pcm->device); + err = snd_soc_card_jack_new(card, jack_name, + SND_JACK_AVOUT, &kabylake_hdmi[i], + NULL, 0); + + if (err) + return err; + + err = hdac_hdmi_jack_init(pcm->codec_dai, pcm->device, + &kabylake_hdmi[i]); + if (err < 0) + return err; + + i++; + } + + if (!component) + return -EINVAL; + + return hdac_hdmi_jack_port_init(component, &card->dapm); + + return 0; +} + +/* kabylake audio machine driver for SPT + DA7219 */ +static struct snd_soc_card kbl_audio_card_da7219_m98927 = { + .name = "kblda7219m98927", + .owner = THIS_MODULE, + .dai_link = kabylake_dais, + .num_links = ARRAY_SIZE(kabylake_dais), + .controls = kabylake_controls, + .num_controls = ARRAY_SIZE(kabylake_controls), + .dapm_widgets = kabylake_widgets, + .num_dapm_widgets = ARRAY_SIZE(kabylake_widgets), + .dapm_routes = kabylake_map, + .num_dapm_routes = ARRAY_SIZE(kabylake_map), + .codec_conf = max98927_codec_conf, + .num_configs = ARRAY_SIZE(max98927_codec_conf), + .fully_routed = true, + .late_probe = kabylake_card_late_probe, +}; + +/* kabylake audio machine driver for Maxim98927 */ +static struct snd_soc_card kbl_audio_card_max98927 = { + .name = "kblmax98927", + .owner = THIS_MODULE, + .dai_link = kabylake_max98927_dais, + .num_links = ARRAY_SIZE(kabylake_max98927_dais), + .controls = kabylake_controls, + .num_controls = ARRAY_SIZE(kabylake_controls), + .dapm_widgets = kabylake_widgets, + .num_dapm_widgets = ARRAY_SIZE(kabylake_widgets), + .dapm_routes = kabylake_map, + .num_dapm_routes = ARRAY_SIZE(kabylake_map), + .codec_conf = max98927_codec_conf, + .num_configs = ARRAY_SIZE(max98927_codec_conf), + .fully_routed = true, + .late_probe = kabylake_card_late_probe, +}; + +static int kabylake_audio_probe(struct platform_device *pdev) +{ + struct kbl_codec_private *ctx; + + ctx = devm_kzalloc(&pdev->dev, sizeof(*ctx), GFP_ATOMIC); + if (!ctx) + return -ENOMEM; + + INIT_LIST_HEAD(&ctx->hdmi_pcm_list); + + kabylake_audio_card = + (struct snd_soc_card *)pdev->id_entry->driver_data; + + kabylake_audio_card->dev = &pdev->dev; + snd_soc_card_set_drvdata(kabylake_audio_card, ctx); + + return devm_snd_soc_register_card(&pdev->dev, kabylake_audio_card); +} + +static const struct platform_device_id kbl_board_ids[] = { + { + .name = "kbl_da7219_max98927", + .driver_data = + (kernel_ulong_t)&kbl_audio_card_da7219_m98927, + }, + { + .name = "kbl_max98927", + .driver_data = + (kernel_ulong_t)&kbl_audio_card_max98927, + }, + { } +}; + +static struct platform_driver kabylake_audio = { + .probe = kabylake_audio_probe, + .driver = { + .name = "kbl_da7219_max98927", + .pm = &snd_soc_pm_ops, + }, + .id_table = kbl_board_ids, +}; + +module_platform_driver(kabylake_audio) + +/* Module information */ +MODULE_DESCRIPTION("Audio KabyLake Machine driver for MAX98927 & DA7219"); +MODULE_AUTHOR("Mac Chiang "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:kbl_da7219_max98927"); +MODULE_ALIAS("platform:kbl_max98927"); diff --git a/sound/soc/intel/boards/kbl_rt5663_max98927.c b/sound/soc/intel/boards/kbl_rt5663_max98927.c index 21a6490746a6..99e1320c485f 100644 --- a/sound/soc/intel/boards/kbl_rt5663_max98927.c +++ b/sound/soc/intel/boards/kbl_rt5663_max98927.c @@ -488,11 +488,10 @@ static int kabylake_ssp0_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { struct snd_soc_pcm_runtime *rtd = substream->private_data; + struct snd_soc_dai *codec_dai; int ret = 0, j; - for (j = 0; j < rtd->num_codecs; j++) { - struct snd_soc_dai *codec_dai = rtd->codec_dais[j]; - + for_each_rtd_codec_dai(rtd, j, codec_dai) { if (!strcmp(codec_dai->component->name, MAXIM_DEV0_NAME)) { /* * Use channel 4 and 5 for the first amp diff --git a/sound/soc/intel/boards/kbl_rt5663_rt5514_max98927.c b/sound/soc/intel/boards/kbl_rt5663_rt5514_max98927.c index a892b37eab7c..a737c915d46a 100644 --- a/sound/soc/intel/boards/kbl_rt5663_rt5514_max98927.c +++ b/sound/soc/intel/boards/kbl_rt5663_rt5514_max98927.c @@ -353,11 +353,10 @@ static int kabylake_ssp0_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { struct snd_soc_pcm_runtime *rtd = substream->private_data; + struct snd_soc_dai *codec_dai; int ret = 0, j; - for (j = 0; j < rtd->num_codecs; j++) { - struct snd_soc_dai *codec_dai = rtd->codec_dais[j]; - + for_each_rtd_codec_dai(rtd, j, codec_dai) { if (!strcmp(codec_dai->component->name, RT5514_DEV_NAME)) { ret = snd_soc_dai_set_tdm_slot(codec_dai, 0xF, 0, 8, 16); if (ret < 0) { diff --git a/sound/soc/intel/boards/skl_hda_dsp_common.c b/sound/soc/intel/boards/skl_hda_dsp_common.c new file mode 100644 index 000000000000..3fdbf239da74 --- /dev/null +++ b/sound/soc/intel/boards/skl_hda_dsp_common.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2015-18 Intel Corporation. + +/* + * Common functions used in different Intel machine drivers + */ +#include +#include +#include +#include +#include +#include +#include +#include "../../codecs/hdac_hdmi.h" +#include "../skylake/skl.h" +#include "skl_hda_dsp_common.h" + +#define NAME_SIZE 32 + +int skl_hda_hdmi_add_pcm(struct snd_soc_card *card, int device) +{ + struct skl_hda_private *ctx = snd_soc_card_get_drvdata(card); + struct skl_hda_hdmi_pcm *pcm; + char dai_name[NAME_SIZE]; + + pcm = devm_kzalloc(card->dev, sizeof(*pcm), GFP_KERNEL); + if (!pcm) + return -ENOMEM; + + snprintf(dai_name, sizeof(dai_name), "intel-hdmi-hifi%d", + ctx->dai_index); + pcm->codec_dai = snd_soc_card_get_codec_dai(card, dai_name); + if (!pcm->codec_dai) + return -EINVAL; + + pcm->device = device; + list_add_tail(&pcm->head, &ctx->hdmi_pcm_list); + + return 0; +} + +/* skl_hda_digital audio interface glue - connects codec <--> CPU */ +struct snd_soc_dai_link skl_hda_be_dai_links[HDA_DSP_MAX_BE_DAI_LINKS] = { + /* Back End DAI links */ + { + .name = "iDisp1", + .id = 1, + .cpu_dai_name = "iDisp1 Pin", + .codec_name = "ehdaudio0D2", + .codec_dai_name = "intel-hdmi-hifi1", + .dpcm_playback = 1, + .no_pcm = 1, + }, + { + .name = "iDisp2", + .id = 2, + .cpu_dai_name = "iDisp2 Pin", + .codec_name = "ehdaudio0D2", + .codec_dai_name = "intel-hdmi-hifi2", + .dpcm_playback = 1, + .no_pcm = 1, + }, + { + .name = "iDisp3", + .id = 3, + .cpu_dai_name = "iDisp3 Pin", + .codec_name = "ehdaudio0D2", + .codec_dai_name = "intel-hdmi-hifi3", + .dpcm_playback = 1, + .no_pcm = 1, + }, + { + .name = "Analog Playback and Capture", + .id = 4, + .cpu_dai_name = "Analog CPU DAI", + .codec_name = "ehdaudio0D0", + .codec_dai_name = "Analog Codec DAI", + .platform_name = "0000:00:1f.3", + .dpcm_playback = 1, + .dpcm_capture = 1, + .init = NULL, + .no_pcm = 1, + }, + { + .name = "Digital Playback and Capture", + .id = 5, + .cpu_dai_name = "Digital CPU DAI", + .codec_name = "ehdaudio0D0", + .codec_dai_name = "Digital Codec DAI", + .platform_name = "0000:00:1f.3", + .dpcm_playback = 1, + .dpcm_capture = 1, + .init = NULL, + .no_pcm = 1, + }, +}; + +int skl_hda_hdmi_jack_init(struct snd_soc_card *card) +{ + struct skl_hda_private *ctx = snd_soc_card_get_drvdata(card); + struct snd_soc_component *component = NULL; + struct skl_hda_hdmi_pcm *pcm; + char jack_name[NAME_SIZE]; + int err; + + list_for_each_entry(pcm, &ctx->hdmi_pcm_list, head) { + component = pcm->codec_dai->component; + snprintf(jack_name, sizeof(jack_name), + "HDMI/DP, pcm=%d Jack", pcm->device); + err = snd_soc_card_jack_new(card, jack_name, + SND_JACK_AVOUT, &pcm->hdmi_jack, + NULL, 0); + + if (err) + return err; + + err = hdac_hdmi_jack_init(pcm->codec_dai, pcm->device, + &pcm->hdmi_jack); + if (err < 0) + return err; + } + + if (!component) + return -EINVAL; + + return hdac_hdmi_jack_port_init(component, &card->dapm); +} diff --git a/sound/soc/intel/boards/skl_hda_dsp_common.h b/sound/soc/intel/boards/skl_hda_dsp_common.h new file mode 100644 index 000000000000..87c50aff56cd --- /dev/null +++ b/sound/soc/intel/boards/skl_hda_dsp_common.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright(c) 2015-18 Intel Corporation. + */ + +/* + * This file defines data structures used in Machine Driver for Intel + * platforms with HDA Codecs. + */ + +#ifndef __SOUND_SOC_HDA_DSP_COMMON_H +#define __SOUND_SOC_HDA_DSP_COMMON_H +#include +#include +#include +#include + +#define HDA_DSP_MAX_BE_DAI_LINKS 5 + +struct skl_hda_hdmi_pcm { + struct list_head head; + struct snd_soc_dai *codec_dai; + struct snd_soc_jack hdmi_jack; + int device; +}; + +struct skl_hda_private { + struct list_head hdmi_pcm_list; + int pcm_count; + int dai_index; + const char *platform_name; +}; + +extern struct snd_soc_dai_link skl_hda_be_dai_links[HDA_DSP_MAX_BE_DAI_LINKS]; +int skl_hda_hdmi_jack_init(struct snd_soc_card *card); +int skl_hda_hdmi_add_pcm(struct snd_soc_card *card, int device); + +#endif /* __SOUND_SOC_HDA_DSP_COMMON_H */ diff --git a/sound/soc/intel/boards/skl_hda_dsp_generic.c b/sound/soc/intel/boards/skl_hda_dsp_generic.c new file mode 100644 index 000000000000..b415dd4c85f5 --- /dev/null +++ b/sound/soc/intel/boards/skl_hda_dsp_generic.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2015-18 Intel Corporation. + +/* + * Machine Driver for SKL+ platforms with DSP and iDisp, HDA Codecs + */ + +#include +#include +#include +#include +#include +#include +#include +#include "../../codecs/hdac_hdmi.h" +#include "../skylake/skl.h" +#include "skl_hda_dsp_common.h" + +static const struct snd_soc_dapm_widget skl_hda_widgets[] = { + SND_SOC_DAPM_HP("Analog Out", NULL), + SND_SOC_DAPM_MIC("Analog In", NULL), + SND_SOC_DAPM_HP("Alt Analog Out", NULL), + SND_SOC_DAPM_MIC("Alt Analog In", NULL), + SND_SOC_DAPM_SPK("Digital Out", NULL), + SND_SOC_DAPM_MIC("Digital In", NULL), +}; + +static const struct snd_soc_dapm_route skl_hda_map[] = { + { "hifi3", NULL, "iDisp3 Tx"}, + { "iDisp3 Tx", NULL, "iDisp3_out"}, + { "hifi2", NULL, "iDisp2 Tx"}, + { "iDisp2 Tx", NULL, "iDisp2_out"}, + { "hifi1", NULL, "iDisp1 Tx"}, + { "iDisp1 Tx", NULL, "iDisp1_out"}, + + { "Analog Out", NULL, "Codec Output Pin1" }, + { "Digital Out", NULL, "Codec Output Pin2" }, + { "Alt Analog Out", NULL, "Codec Output Pin3" }, + + { "Codec Input Pin1", NULL, "Analog In" }, + { "Codec Input Pin2", NULL, "Digital In" }, + { "Codec Input Pin3", NULL, "Alt Analog In" }, + + /* CODEC BE connections */ + { "Analog Codec Playback", NULL, "Analog CPU Playback" }, + { "Analog CPU Playback", NULL, "codec0_out" }, + { "Digital Codec Playback", NULL, "Digital CPU Playback" }, + { "Digital CPU Playback", NULL, "codec1_out" }, + { "Alt Analog Codec Playback", NULL, "Alt Analog CPU Playback" }, + { "Alt Analog CPU Playback", NULL, "codec2_out" }, + + { "codec0_in", NULL, "Analog CPU Capture" }, + { "Analog CPU Capture", NULL, "Analog Codec Capture" }, + { "codec1_in", NULL, "Digital CPU Capture" }, + { "Digital CPU Capture", NULL, "Digital Codec Capture" }, + { "codec2_in", NULL, "Alt Analog CPU Capture" }, + { "Alt Analog CPU Capture", NULL, "Alt Analog Codec Capture" }, +}; + +static int skl_hda_card_late_probe(struct snd_soc_card *card) +{ + return skl_hda_hdmi_jack_init(card); +} + +static int +skl_hda_add_dai_link(struct snd_soc_card *card, struct snd_soc_dai_link *link) +{ + struct skl_hda_private *ctx = snd_soc_card_get_drvdata(card); + int ret = 0; + + dev_dbg(card->dev, "%s: dai link name - %s\n", __func__, link->name); + link->platform_name = ctx->platform_name; + link->nonatomic = 1; + + if (strstr(link->name, "HDMI")) { + ret = skl_hda_hdmi_add_pcm(card, ctx->pcm_count); + + if (ret < 0) + return ret; + + ctx->dai_index++; + } + + ctx->pcm_count++; + return ret; +} + +static struct snd_soc_card hda_soc_card = { + .name = "skl_hda_card", + .owner = THIS_MODULE, + .dai_link = skl_hda_be_dai_links, + .dapm_widgets = skl_hda_widgets, + .dapm_routes = skl_hda_map, + .add_dai_link = skl_hda_add_dai_link, + .fully_routed = true, + .late_probe = skl_hda_card_late_probe, +}; + +#define IDISP_DAI_COUNT 3 +/* there are two routes per iDisp output */ +#define IDISP_ROUTE_COUNT (IDISP_DAI_COUNT * 2) +#define IDISP_CODEC_MASK 0x4 + +static int skl_hda_fill_card_info(struct skl_machine_pdata *pdata) +{ + struct snd_soc_card *card = &hda_soc_card; + struct snd_soc_dai_link *dai_link; + u32 codec_count, codec_mask; + int i, num_links, num_route; + + codec_mask = pdata->codec_mask; + codec_count = hweight_long(codec_mask); + + if (codec_count == 1 && pdata->codec_mask & IDISP_CODEC_MASK) { + num_links = IDISP_DAI_COUNT; + num_route = IDISP_ROUTE_COUNT; + } else if (codec_count == 2 && codec_mask & IDISP_CODEC_MASK) { + num_links = ARRAY_SIZE(skl_hda_be_dai_links); + num_route = ARRAY_SIZE(skl_hda_map), + card->dapm_widgets = skl_hda_widgets; + card->num_dapm_widgets = ARRAY_SIZE(skl_hda_widgets); + } else { + return -EINVAL; + } + + card->num_links = num_links; + card->num_dapm_routes = num_route; + + for_each_card_prelinks(card, i, dai_link) + dai_link->platform_name = pdata->platform; + + return 0; +} + +static int skl_hda_audio_probe(struct platform_device *pdev) +{ + struct skl_machine_pdata *pdata; + struct skl_hda_private *ctx; + int ret; + + dev_dbg(&pdev->dev, "%s: entry\n", __func__); + + ctx = devm_kzalloc(&pdev->dev, sizeof(*ctx), GFP_ATOMIC); + if (!ctx) + return -ENOMEM; + + INIT_LIST_HEAD(&ctx->hdmi_pcm_list); + + pdata = dev_get_drvdata(&pdev->dev); + if (!pdata) + return -EINVAL; + + ret = skl_hda_fill_card_info(pdata); + if (ret < 0) { + dev_err(&pdev->dev, "Unsupported HDAudio/iDisp configuration found\n"); + return ret; + } + + ctx->pcm_count = hda_soc_card.num_links; + ctx->dai_index = 1; /* hdmi codec dai name starts from index 1 */ + ctx->platform_name = pdata->platform; + + hda_soc_card.dev = &pdev->dev; + snd_soc_card_set_drvdata(&hda_soc_card, ctx); + + return devm_snd_soc_register_card(&pdev->dev, &hda_soc_card); +} + +static struct platform_driver skl_hda_audio = { + .probe = skl_hda_audio_probe, + .driver = { + .name = "skl_hda_dsp_generic", + .pm = &snd_soc_pm_ops, + }, +}; + +module_platform_driver(skl_hda_audio) + +/* Module information */ +MODULE_DESCRIPTION("SKL/KBL/BXT/APL HDA Generic Machine driver"); +MODULE_AUTHOR("Rakesh Ughreja "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:skl_hda_dsp_generic"); diff --git a/sound/soc/intel/common/Makefile b/sound/soc/intel/common/Makefile index 915a34cdc8ac..c1f50a079d34 100644 --- a/sound/soc/intel/common/Makefile +++ b/sound/soc/intel/common/Makefile @@ -7,7 +7,8 @@ snd-soc-acpi-intel-match-objs := soc-acpi-intel-byt-match.o soc-acpi-intel-cht-m soc-acpi-intel-hsw-bdw-match.o \ soc-acpi-intel-skl-match.o soc-acpi-intel-kbl-match.o \ soc-acpi-intel-bxt-match.o soc-acpi-intel-glk-match.o \ - soc-acpi-intel-cnl-match.o + soc-acpi-intel-cnl-match.o \ + soc-acpi-intel-hda-match.o obj-$(CONFIG_SND_SOC_INTEL_SST) += snd-soc-sst-dsp.o snd-soc-sst-ipc.o obj-$(CONFIG_SND_SOC_INTEL_SST_ACPI) += snd-soc-sst-acpi.o diff --git a/sound/soc/intel/common/soc-acpi-intel-byt-match.c b/sound/soc/intel/common/soc-acpi-intel-byt-match.c index 4daa8a4f0c0c..097dc06377ba 100644 --- a/sound/soc/intel/common/soc-acpi-intel-byt-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-byt-match.c @@ -30,6 +30,13 @@ static int byt_thinkpad10_quirk_cb(const struct dmi_system_id *id) static const struct dmi_system_id byt_table[] = { + { + .callback = byt_thinkpad10_quirk_cb, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad 8"), + }, + }, { .callback = byt_thinkpad10_quirk_cb, .matches = { diff --git a/sound/soc/intel/common/soc-acpi-intel-hda-match.c b/sound/soc/intel/common/soc-acpi-intel-hda-match.c new file mode 100644 index 000000000000..533c1064f84b --- /dev/null +++ b/sound/soc/intel/common/soc-acpi-intel-hda-match.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018, Intel Corporation. + +/* + * soc-apci-intel-hda-match.c - tables and support for HDA+ACPI enumeration. + * + */ + +#include +#include +#include "../skylake/skl.h" + +static struct skl_machine_pdata hda_pdata = { + .use_tplg_pcm = true, +}; + +struct snd_soc_acpi_mach snd_soc_acpi_intel_hda_machines[] = { + { + /* .id is not used in this file */ + .drv_name = "skl_hda_dsp_generic", + + /* .fw_filename is dynamically set in skylake driver */ + + /* .sof_fw_filename is dynamically set in sof/intel driver */ + + .sof_tplg_filename = "intel/sof-hda-generic.tplg", + + /* + * .machine_quirk and .quirk_data are not used here but + * can be used if we need a more complicated machine driver + * combining HDA+other device (e.g. DMIC). + */ + .pdata = &hda_pdata, + }, + {}, +}; +EXPORT_SYMBOL_GPL(snd_soc_acpi_intel_hda_machines); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Intel Common ACPI Match module"); diff --git a/sound/soc/intel/common/soc-acpi-intel-kbl-match.c b/sound/soc/intel/common/soc-acpi-intel-kbl-match.c index 0ee173ca437d..a317b7790fce 100644 --- a/sound/soc/intel/common/soc-acpi-intel-kbl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-kbl-match.c @@ -32,6 +32,11 @@ static struct snd_soc_acpi_codecs kbl_7219_98357_codecs = { .codecs = {"MX98357A"} }; +static struct snd_soc_acpi_codecs kbl_7219_98927_codecs = { + .num_codecs = 1, + .codecs = {"MX98927"} +}; + struct snd_soc_acpi_mach snd_soc_acpi_intel_kbl_machines[] = { { .id = "INT343A", @@ -83,6 +88,14 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_kbl_machines[] = { .quirk_data = &kbl_7219_98357_codecs, .pdata = &skl_dmic_data, }, + { + .id = "DLGS7219", + .drv_name = "kbl_da7219_max98927", + .fw_filename = "intel/dsp_fw_kbl.bin", + .machine_quirk = snd_soc_acpi_codec_list, + .quirk_data = &kbl_7219_98927_codecs, + .pdata = &skl_dmic_data + }, {}, }; EXPORT_SYMBOL_GPL(snd_soc_acpi_intel_kbl_machines); diff --git a/sound/soc/intel/common/sst-firmware.c b/sound/soc/intel/common/sst-firmware.c index 11041aedea31..1e067504b604 100644 --- a/sound/soc/intel/common/sst-firmware.c +++ b/sound/soc/intel/common/sst-firmware.c @@ -355,7 +355,7 @@ struct sst_fw *sst_fw_new(struct sst_dsp *dsp, /* allocate DMA buffer to store FW data */ sst_fw->dma_buf = dma_alloc_coherent(dsp->dma_dev, sst_fw->size, - &sst_fw->dmable_fw_paddr, GFP_DMA | GFP_KERNEL); + &sst_fw->dmable_fw_paddr, GFP_KERNEL); if (!sst_fw->dma_buf) { dev_err(dsp->dev, "error: DMA alloc failed\n"); kfree(sst_fw); diff --git a/sound/soc/intel/skylake/skl-pcm.c b/sound/soc/intel/skylake/skl-pcm.c index 823e39103edd..557f80c0bfe5 100644 --- a/sound/soc/intel/skylake/skl-pcm.c +++ b/sound/soc/intel/skylake/skl-pcm.c @@ -32,6 +32,7 @@ #define HDA_MONO 1 #define HDA_STEREO 2 #define HDA_QUAD 4 +#define HDA_MAX 8 static const struct snd_pcm_hardware azx_pcm_hw = { .info = (SNDRV_PCM_INFO_MMAP | @@ -494,6 +495,7 @@ static int skl_pcm_trigger(struct snd_pcm_substream *substream, int cmd, stream->lpib); snd_hdac_ext_stream_set_lpib(stream, stream->lpib); } + /* fall through */ case SNDRV_PCM_TRIGGER_START: case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: @@ -569,7 +571,10 @@ static int skl_link_hw_params(struct snd_pcm_substream *substream, stream_tag = hdac_stream(link_dev)->stream_tag; /* set the stream tag in the codec dai dma params */ - snd_soc_dai_set_tdm_slot(codec_dai, stream_tag, 0, 0, 0); + if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) + snd_soc_dai_set_tdm_slot(codec_dai, stream_tag, 0, 0, 0); + else + snd_soc_dai_set_tdm_slot(codec_dai, 0, stream_tag, 0, 0); p_params.s_fmt = snd_pcm_format_width(params_format(params)); p_params.ch = params_channels(params); @@ -995,21 +1000,63 @@ static struct snd_soc_dai_driver skl_platform_dai[] = { }, }, { - .name = "HD-Codec Pin", + .name = "Analog CPU DAI", .ops = &skl_link_dai_ops, .playback = { - .stream_name = "HD-Codec Tx", - .channels_min = HDA_STEREO, - .channels_max = HDA_STEREO, - .rates = SNDRV_PCM_RATE_48000, - .formats = SNDRV_PCM_FMTBIT_S16_LE, + .stream_name = "Analog CPU Playback", + .channels_min = HDA_MONO, + .channels_max = HDA_MAX, + .rates = SNDRV_PCM_RATE_8000_192000, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE | + SNDRV_PCM_FMTBIT_S32_LE, }, .capture = { - .stream_name = "HD-Codec Rx", - .channels_min = HDA_STEREO, - .channels_max = HDA_STEREO, - .rates = SNDRV_PCM_RATE_48000, - .formats = SNDRV_PCM_FMTBIT_S16_LE, + .stream_name = "Analog CPU Capture", + .channels_min = HDA_MONO, + .channels_max = HDA_MAX, + .rates = SNDRV_PCM_RATE_8000_192000, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE | + SNDRV_PCM_FMTBIT_S32_LE, + }, +}, +{ + .name = "Alt Analog CPU DAI", + .ops = &skl_link_dai_ops, + .playback = { + .stream_name = "Alt Analog CPU Playback", + .channels_min = HDA_MONO, + .channels_max = HDA_MAX, + .rates = SNDRV_PCM_RATE_8000_192000, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE | + SNDRV_PCM_FMTBIT_S32_LE, + }, + .capture = { + .stream_name = "Alt Analog CPU Capture", + .channels_min = HDA_MONO, + .channels_max = HDA_MAX, + .rates = SNDRV_PCM_RATE_8000_192000, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE | + SNDRV_PCM_FMTBIT_S32_LE, + }, +}, +{ + .name = "Digital CPU DAI", + .ops = &skl_link_dai_ops, + .playback = { + .stream_name = "Digital CPU Playback", + .channels_min = HDA_MONO, + .channels_max = HDA_MAX, + .rates = SNDRV_PCM_RATE_8000_192000, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE | + SNDRV_PCM_FMTBIT_S32_LE, + }, + .capture = { + .stream_name = "Digital CPU Capture", + .channels_min = HDA_MONO, + .channels_max = HDA_MAX, + .rates = SNDRV_PCM_RATE_8000_192000, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE | + SNDRV_PCM_FMTBIT_S32_LE, }, }, }; diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c index 2620d77729c5..cf8848b779dc 100644 --- a/sound/soc/intel/skylake/skl-topology.c +++ b/sound/soc/intel/skylake/skl-topology.c @@ -898,11 +898,10 @@ static int skl_tplg_set_module_bind_params(struct snd_soc_dapm_widget *w, bc = (struct skl_algo_data *)sb->dobj.private; if (bc->set_params == SKL_PARAM_BIND) { - params = kzalloc(bc->max, GFP_KERNEL); + params = kmemdup(bc->params, bc->max, GFP_KERNEL); if (!params) return -ENOMEM; - memcpy(params, bc->params, bc->max); skl_fill_sink_instance_id(ctx, params, bc->max, mconfig); @@ -2461,6 +2460,7 @@ static int skl_tplg_get_token(struct device *dev, case SKL_TKN_U8_CORE_ID: mconfig->core_id = tkn_elem->value; + break; case SKL_TKN_U8_MOD_TYPE: mconfig->m_type = tkn_elem->value; diff --git a/sound/soc/intel/skylake/skl.c b/sound/soc/intel/skylake/skl.c index 1d17be0f78a0..29225623b4b4 100644 --- a/sound/soc/intel/skylake/skl.c +++ b/sound/soc/intel/skylake/skl.c @@ -33,9 +33,11 @@ #include #include #include +#include #include "skl.h" #include "skl-sst-dsp.h" #include "skl-sst-ipc.h" +#include "../../../soc/codecs/hdac_hda.h" /* * initialize the PCI registers @@ -472,6 +474,25 @@ static struct skl_ssp_clk skl_ssp_clks[] = { {.name = "ssp5_sclkfs"}, }; +static struct snd_soc_acpi_mach *skl_find_hda_machine(struct skl *skl, + struct snd_soc_acpi_mach *machines) +{ + struct hdac_bus *bus = skl_to_bus(skl); + struct snd_soc_acpi_mach *mach; + + /* check if we have any codecs detected on bus */ + if (bus->codec_mask == 0) + return NULL; + + /* point to common table */ + mach = snd_soc_acpi_intel_hda_machines; + + /* all entries in the machine table use the same firmware */ + mach->fw_filename = machines->fw_filename; + + return mach; +} + static int skl_find_machine(struct skl *skl, void *driver_data) { struct hdac_bus *bus = skl_to_bus(skl); @@ -479,9 +500,13 @@ static int skl_find_machine(struct skl *skl, void *driver_data) struct skl_machine_pdata *pdata; mach = snd_soc_acpi_find_machine(mach); - if (mach == NULL) { - dev_err(bus->dev, "No matching machine driver found\n"); - return -ENODEV; + if (!mach) { + dev_dbg(bus->dev, "No matching I2S machine driver found\n"); + mach = skl_find_hda_machine(skl, driver_data); + if (!mach) { + dev_err(bus->dev, "No matching machine driver found\n"); + return -ENODEV; + } } skl->mach = mach; @@ -498,8 +523,9 @@ static int skl_find_machine(struct skl *skl, void *driver_data) static int skl_machine_device_register(struct skl *skl) { - struct hdac_bus *bus = skl_to_bus(skl); struct snd_soc_acpi_mach *mach = skl->mach; + struct hdac_bus *bus = skl_to_bus(skl); + struct skl_machine_pdata *pdata; struct platform_device *pdev; int ret; @@ -516,8 +542,12 @@ static int skl_machine_device_register(struct skl *skl) return -EIO; } - if (mach->pdata) + if (mach->pdata) { + pdata = (struct skl_machine_pdata *)mach->pdata; + pdata->platform = dev_name(bus->dev); + pdata->codec_mask = bus->codec_mask; dev_set_drvdata(&pdev->dev, mach->pdata); + } skl->i2s_dev = pdev; @@ -628,6 +658,24 @@ static void skl_clock_device_unregister(struct skl *skl) platform_device_unregister(skl->clk_dev); } +#define IDISP_INTEL_VENDOR_ID 0x80860000 + +/* + * load the legacy codec driver + */ +static void load_codec_module(struct hda_codec *codec) +{ +#ifdef MODULE + char modalias[MODULE_NAME_LEN]; + const char *mod = NULL; + + snd_hdac_codec_modalias(&codec->core, modalias, sizeof(modalias)); + mod = modalias; + dev_dbg(&codec->core.dev, "loading %s codec module\n", mod); + request_module(mod); +#endif +} + /* * Probe the given codec address */ @@ -637,7 +685,9 @@ static int probe_codec(struct hdac_bus *bus, int addr) (AC_VERB_PARAMETERS << 8) | AC_PAR_VENDOR_ID; unsigned int res = -1; struct skl *skl = bus_to_skl(bus); + struct hdac_hda_priv *hda_codec; struct hdac_device *hdev; + int err; mutex_lock(&bus->cmd_mutex); snd_hdac_bus_send_cmd(bus, cmd); @@ -645,13 +695,26 @@ static int probe_codec(struct hdac_bus *bus, int addr) mutex_unlock(&bus->cmd_mutex); if (res == -1) return -EIO; - dev_dbg(bus->dev, "codec #%d probed OK\n", addr); + dev_dbg(bus->dev, "codec #%d probed OK: %x\n", addr, res); - hdev = devm_kzalloc(&skl->pci->dev, sizeof(*hdev), GFP_KERNEL); - if (!hdev) + hda_codec = devm_kzalloc(&skl->pci->dev, sizeof(*hda_codec), + GFP_KERNEL); + if (!hda_codec) return -ENOMEM; - return snd_hdac_ext_bus_device_init(bus, addr, hdev); + hda_codec->codec.bus = skl_to_hbus(skl); + hdev = &hda_codec->codec.core; + + err = snd_hdac_ext_bus_device_init(bus, addr, hdev); + if (err < 0) + return err; + + /* use legacy bus only for HDA codecs, idisp uses ext bus */ + if ((res & 0xFFFF0000) != IDISP_INTEL_VENDOR_ID) { + hdev->type = HDA_DEV_LEGACY; + load_codec_module(&hda_codec->codec); + } + return 0; } /* Codec initialization */ @@ -786,9 +849,10 @@ static int skl_create(struct pci_dev *pci, const struct hdac_io_ops *io_ops, struct skl **rskl) { + struct hdac_ext_bus_ops *ext_ops = NULL; struct skl *skl; struct hdac_bus *bus; - + struct hda_bus *hbus; int err; *rskl = NULL; @@ -803,13 +867,23 @@ static int skl_create(struct pci_dev *pci, return -ENOMEM; } + hbus = skl_to_hbus(skl); bus = skl_to_bus(skl); - snd_hdac_ext_bus_init(bus, &pci->dev, &bus_core_ops, io_ops, NULL); + +#if IS_ENABLED(CONFIG_SND_SOC_HDAC_HDA) + ext_ops = snd_soc_hdac_hda_get_ops(); +#endif + snd_hdac_ext_bus_init(bus, &pci->dev, &bus_core_ops, io_ops, ext_ops); bus->use_posbuf = 1; skl->pci = pci; INIT_WORK(&skl->probe_work, skl_probe_work); bus->bdl_pos_adj = 0; + mutex_init(&hbus->prepare_mutex); + hbus->pci = pci; + hbus->mixer_assigned = -1; + hbus->modelname = "sklbus"; + *rskl = skl; return 0; diff --git a/sound/soc/intel/skylake/skl.h b/sound/soc/intel/skylake/skl.h index 78aa8bdcb619..8d48cd7c56c8 100644 --- a/sound/soc/intel/skylake/skl.h +++ b/sound/soc/intel/skylake/skl.h @@ -23,6 +23,7 @@ #include #include +#include #include #include "skl-nhlt.h" #include "skl-ssp-clk.h" @@ -71,7 +72,7 @@ struct skl_fw_config { }; struct skl { - struct hdac_bus hbus; + struct hda_bus hbus; struct pci_dev *pci; unsigned int init_done:1; /* delayed init status */ @@ -105,8 +106,11 @@ struct skl { struct snd_soc_acpi_mach *mach; }; -#define skl_to_bus(s) (&(s)->hbus) -#define bus_to_skl(bus) container_of(bus, struct skl, hbus) +#define skl_to_bus(s) (&(s)->hbus.core) +#define bus_to_skl(bus) container_of(bus, struct skl, hbus.core) + +#define skl_to_hbus(s) (&(s)->hbus) +#define hbus_to_skl(hbus) container_of((hbus), struct skl, (hbus)) /* to pass dai dma data */ struct skl_dma_params { @@ -117,6 +121,8 @@ struct skl_dma_params { struct skl_machine_pdata { u32 dmic_num; bool use_tplg_pcm; /* use dais and dai links from topology */ + const char *platform; + u32 codec_mask; }; struct skl_dsp_ops { diff --git a/sound/soc/mediatek/mt2701/mt2701-cs42448.c b/sound/soc/mediatek/mt2701/mt2701-cs42448.c index 666282b865a8..97f9f38ce6b3 100644 --- a/sound/soc/mediatek/mt2701/mt2701-cs42448.c +++ b/sound/soc/mediatek/mt2701/mt2701-cs42448.c @@ -299,6 +299,7 @@ static int mt2701_cs42448_machine_probe(struct platform_device *pdev) devm_kzalloc(&pdev->dev, sizeof(struct mt2701_cs42448_private), GFP_KERNEL); struct device *dev = &pdev->dev; + struct snd_soc_dai_link *dai_link; if (!priv) return -ENOMEM; @@ -309,10 +310,10 @@ static int mt2701_cs42448_machine_probe(struct platform_device *pdev) dev_err(&pdev->dev, "Property 'platform' missing or invalid\n"); return -EINVAL; } - for (i = 0; i < card->num_links; i++) { - if (mt2701_cs42448_dai_links[i].platform_name) + for_each_card_prelinks(card, i, dai_link) { + if (dai_link->platform_name) continue; - mt2701_cs42448_dai_links[i].platform_of_node = platform_node; + dai_link->platform_of_node = platform_node; } card->dev = dev; @@ -324,10 +325,10 @@ static int mt2701_cs42448_machine_probe(struct platform_device *pdev) "Property 'audio-codec' missing or invalid\n"); return -EINVAL; } - for (i = 0; i < card->num_links; i++) { - if (mt2701_cs42448_dai_links[i].codec_name) + for_each_card_prelinks(card, i, dai_link) { + if (dai_link->codec_name) continue; - mt2701_cs42448_dai_links[i].codec_of_node = codec_node; + dai_link->codec_of_node = codec_node; } codec_node_bt_mrg = of_parse_phandle(pdev->dev.of_node, diff --git a/sound/soc/mediatek/mt2701/mt2701-wm8960.c b/sound/soc/mediatek/mt2701/mt2701-wm8960.c index 89f34efd9747..6bc1d3d58e64 100644 --- a/sound/soc/mediatek/mt2701/mt2701-wm8960.c +++ b/sound/soc/mediatek/mt2701/mt2701-wm8960.c @@ -97,6 +97,7 @@ static int mt2701_wm8960_machine_probe(struct platform_device *pdev) { struct snd_soc_card *card = &mt2701_wm8960_card; struct device_node *platform_node, *codec_node; + struct snd_soc_dai_link *dai_link; int ret, i; platform_node = of_parse_phandle(pdev->dev.of_node, @@ -105,10 +106,10 @@ static int mt2701_wm8960_machine_probe(struct platform_device *pdev) dev_err(&pdev->dev, "Property 'platform' missing or invalid\n"); return -EINVAL; } - for (i = 0; i < card->num_links; i++) { - if (mt2701_wm8960_dai_links[i].platform_name) + for_each_card_prelinks(card, i, dai_link) { + if (dai_link->platform_name) continue; - mt2701_wm8960_dai_links[i].platform_of_node = platform_node; + dai_link->platform_of_node = platform_node; } card->dev = &pdev->dev; @@ -120,10 +121,10 @@ static int mt2701_wm8960_machine_probe(struct platform_device *pdev) "Property 'audio-codec' missing or invalid\n"); return -EINVAL; } - for (i = 0; i < card->num_links; i++) { - if (mt2701_wm8960_dai_links[i].codec_name) + for_each_card_prelinks(card, i, dai_link) { + if (dai_link->codec_name) continue; - mt2701_wm8960_dai_links[i].codec_of_node = codec_node; + dai_link->codec_of_node = codec_node; } ret = snd_soc_of_parse_audio_routing(card, "audio-routing"); @@ -150,7 +151,6 @@ static const struct of_device_id mt2701_wm8960_machine_dt_match[] = { static struct platform_driver mt2701_wm8960_machine = { .driver = { .name = "mt2701-wm8960", - .owner = THIS_MODULE, #ifdef CONFIG_OF .of_match_table = mt2701_wm8960_machine_dt_match, #endif diff --git a/sound/soc/mediatek/mt6797/mt6797-mt6351.c b/sound/soc/mediatek/mt6797/mt6797-mt6351.c index b1558c57b9ca..cc41eb531653 100644 --- a/sound/soc/mediatek/mt6797/mt6797-mt6351.c +++ b/sound/soc/mediatek/mt6797/mt6797-mt6351.c @@ -158,6 +158,7 @@ static int mt6797_mt6351_dev_probe(struct platform_device *pdev) { struct snd_soc_card *card = &mt6797_mt6351_card; struct device_node *platform_node, *codec_node; + struct snd_soc_dai_link *dai_link; int ret, i; card->dev = &pdev->dev; @@ -168,10 +169,10 @@ static int mt6797_mt6351_dev_probe(struct platform_device *pdev) dev_err(&pdev->dev, "Property 'platform' missing or invalid\n"); return -EINVAL; } - for (i = 0; i < card->num_links; i++) { - if (mt6797_mt6351_dai_links[i].platform_name) + for_each_card_prelinks(card, i, dai_link) { + if (dai_link->platform_name) continue; - mt6797_mt6351_dai_links[i].platform_of_node = platform_node; + dai_link->platform_of_node = platform_node; } codec_node = of_parse_phandle(pdev->dev.of_node, @@ -181,10 +182,10 @@ static int mt6797_mt6351_dev_probe(struct platform_device *pdev) "Property 'audio-codec' missing or invalid\n"); return -EINVAL; } - for (i = 0; i < card->num_links; i++) { - if (mt6797_mt6351_dai_links[i].codec_name) + for_each_card_prelinks(card, i, dai_link) { + if (dai_link->codec_name) continue; - mt6797_mt6351_dai_links[i].codec_of_node = codec_node; + dai_link->codec_of_node = codec_node; } ret = devm_snd_soc_register_card(&pdev->dev, card); @@ -205,7 +206,6 @@ static const struct of_device_id mt6797_mt6351_dt_match[] = { static struct platform_driver mt6797_mt6351_driver = { .driver = { .name = "mt6797-mt6351", - .owner = THIS_MODULE, #ifdef CONFIG_OF .of_match_table = mt6797_mt6351_dt_match, #endif diff --git a/sound/soc/mediatek/mt8173/mt8173-max98090.c b/sound/soc/mediatek/mt8173/mt8173-max98090.c index 902d111016d6..4d6596d5cb07 100644 --- a/sound/soc/mediatek/mt8173/mt8173-max98090.c +++ b/sound/soc/mediatek/mt8173/mt8173-max98090.c @@ -137,6 +137,7 @@ static int mt8173_max98090_dev_probe(struct platform_device *pdev) { struct snd_soc_card *card = &mt8173_max98090_card; struct device_node *codec_node, *platform_node; + struct snd_soc_dai_link *dai_link; int ret, i; platform_node = of_parse_phandle(pdev->dev.of_node, @@ -145,10 +146,10 @@ static int mt8173_max98090_dev_probe(struct platform_device *pdev) dev_err(&pdev->dev, "Property 'platform' missing or invalid\n"); return -EINVAL; } - for (i = 0; i < card->num_links; i++) { - if (mt8173_max98090_dais[i].platform_name) + for_each_card_prelinks(card, i, dai_link) { + if (dai_link->platform_name) continue; - mt8173_max98090_dais[i].platform_of_node = platform_node; + dai_link->platform_of_node = platform_node; } codec_node = of_parse_phandle(pdev->dev.of_node, @@ -158,10 +159,10 @@ static int mt8173_max98090_dev_probe(struct platform_device *pdev) "Property 'audio-codec' missing or invalid\n"); return -EINVAL; } - for (i = 0; i < card->num_links; i++) { - if (mt8173_max98090_dais[i].codec_name) + for_each_card_prelinks(card, i, dai_link) { + if (dai_link->codec_name) continue; - mt8173_max98090_dais[i].codec_of_node = codec_node; + dai_link->codec_of_node = codec_node; } card->dev = &pdev->dev; diff --git a/sound/soc/mediatek/mt8173/mt8173-rt5650-rt5514.c b/sound/soc/mediatek/mt8173/mt8173-rt5650-rt5514.c index 582174d98c6c..da5b58ce791b 100644 --- a/sound/soc/mediatek/mt8173/mt8173-rt5650-rt5514.c +++ b/sound/soc/mediatek/mt8173/mt8173-rt5650-rt5514.c @@ -44,11 +44,10 @@ static int mt8173_rt5650_rt5514_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { struct snd_soc_pcm_runtime *rtd = substream->private_data; + struct snd_soc_dai *codec_dai; int i, ret; - for (i = 0; i < rtd->num_codecs; i++) { - struct snd_soc_dai *codec_dai = rtd->codec_dais[i]; - + for_each_rtd_codec_dai(rtd, i, codec_dai) { /* pll from mclk 12.288M */ ret = snd_soc_dai_set_pll(codec_dai, 0, 0, MCLK_FOR_CODECS, params_rate(params) * 512); @@ -179,6 +178,7 @@ static int mt8173_rt5650_rt5514_dev_probe(struct platform_device *pdev) { struct snd_soc_card *card = &mt8173_rt5650_rt5514_card; struct device_node *platform_node; + struct snd_soc_dai_link *dai_link; int i, ret; platform_node = of_parse_phandle(pdev->dev.of_node, @@ -188,10 +188,10 @@ static int mt8173_rt5650_rt5514_dev_probe(struct platform_device *pdev) return -EINVAL; } - for (i = 0; i < card->num_links; i++) { - if (mt8173_rt5650_rt5514_dais[i].platform_name) + for_each_card_prelinks(card, i, dai_link) { + if (dai_link->platform_name) continue; - mt8173_rt5650_rt5514_dais[i].platform_of_node = platform_node; + dai_link->platform_of_node = platform_node; } mt8173_rt5650_rt5514_codecs[0].of_node = diff --git a/sound/soc/mediatek/mt8173/mt8173-rt5650-rt5676.c b/sound/soc/mediatek/mt8173/mt8173-rt5650-rt5676.c index b3670c8a5b8d..d83cd039b413 100644 --- a/sound/soc/mediatek/mt8173/mt8173-rt5650-rt5676.c +++ b/sound/soc/mediatek/mt8173/mt8173-rt5650-rt5676.c @@ -48,11 +48,10 @@ static int mt8173_rt5650_rt5676_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { struct snd_soc_pcm_runtime *rtd = substream->private_data; + struct snd_soc_dai *codec_dai; int i, ret; - for (i = 0; i < rtd->num_codecs; i++) { - struct snd_soc_dai *codec_dai = rtd->codec_dais[i]; - + for_each_rtd_codec_dai(rtd, i, codec_dai) { /* pll from mclk 12.288M */ ret = snd_soc_dai_set_pll(codec_dai, 0, 0, MCLK_FOR_CODECS, params_rate(params) * 512); @@ -225,6 +224,7 @@ static int mt8173_rt5650_rt5676_dev_probe(struct platform_device *pdev) { struct snd_soc_card *card = &mt8173_rt5650_rt5676_card; struct device_node *platform_node; + struct snd_soc_dai_link *dai_link; int i, ret; platform_node = of_parse_phandle(pdev->dev.of_node, @@ -234,10 +234,10 @@ static int mt8173_rt5650_rt5676_dev_probe(struct platform_device *pdev) return -EINVAL; } - for (i = 0; i < card->num_links; i++) { - if (mt8173_rt5650_rt5676_dais[i].platform_name) + for_each_card_prelinks(card, i, dai_link) { + if (dai_link->platform_name) continue; - mt8173_rt5650_rt5676_dais[i].platform_of_node = platform_node; + dai_link->platform_of_node = platform_node; } mt8173_rt5650_rt5676_codecs[0].of_node = diff --git a/sound/soc/mediatek/mt8173/mt8173-rt5650.c b/sound/soc/mediatek/mt8173/mt8173-rt5650.c index 7a89b4aad182..7edf250c8fb1 100644 --- a/sound/soc/mediatek/mt8173/mt8173-rt5650.c +++ b/sound/soc/mediatek/mt8173/mt8173-rt5650.c @@ -59,6 +59,7 @@ static int mt8173_rt5650_hw_params(struct snd_pcm_substream *substream, { struct snd_soc_pcm_runtime *rtd = substream->private_data; unsigned int mclk_clock; + struct snd_soc_dai *codec_dai; int i, ret; switch (mt8173_rt5650_priv.pll_from) { @@ -76,9 +77,7 @@ static int mt8173_rt5650_hw_params(struct snd_pcm_substream *substream, break; } - for (i = 0; i < rtd->num_codecs; i++) { - struct snd_soc_dai *codec_dai = rtd->codec_dais[i]; - + for_each_rtd_codec_dai(rtd, i, codec_dai) { /* pll from mclk */ ret = snd_soc_dai_set_pll(codec_dai, 0, 0, mclk_clock, params_rate(params) * 512); @@ -240,6 +239,7 @@ static int mt8173_rt5650_dev_probe(struct platform_device *pdev) struct device_node *platform_node; struct device_node *np; const char *codec_capture_dai; + struct snd_soc_dai_link *dai_link; int i, ret; platform_node = of_parse_phandle(pdev->dev.of_node, @@ -249,10 +249,10 @@ static int mt8173_rt5650_dev_probe(struct platform_device *pdev) return -EINVAL; } - for (i = 0; i < card->num_links; i++) { - if (mt8173_rt5650_dais[i].platform_name) + for_each_card_prelinks(card, i, dai_link) { + if (dai_link->platform_name) continue; - mt8173_rt5650_dais[i].platform_of_node = platform_node; + dai_link->platform_of_node = platform_node; } mt8173_rt5650_codecs[0].of_node = diff --git a/sound/soc/meson/Kconfig b/sound/soc/meson/Kconfig index 8af8bc358a90..8b8426ed2363 100644 --- a/sound/soc/meson/Kconfig +++ b/sound/soc/meson/Kconfig @@ -4,6 +4,8 @@ menu "ASoC support for Amlogic platforms" config SND_MESON_AXG_FIFO tristate select REGMAP_MMIO + imply COMMON_CLK_AXG_AUDIO + imply RESET_MESON_AUDIO_ARB config SND_MESON_AXG_FRDDR tristate "Amlogic AXG Playback FIFO support" @@ -22,6 +24,7 @@ config SND_MESON_AXG_TODDR config SND_MESON_AXG_TDM_FORMATTER tristate select REGMAP_MMIO + imply COMMON_CLK_AXG_AUDIO config SND_MESON_AXG_TDM_INTERFACE tristate @@ -51,6 +54,7 @@ config SND_MESON_AXG_SOUND_CARD imply SND_MESON_AXG_TDMIN imply SND_MESON_AXG_TDMOUT imply SND_MESON_AXG_SPDIFOUT + imply SND_MESON_AXG_PDM help Select Y or M to add support for the AXG SoC sound card @@ -58,8 +62,17 @@ config SND_MESON_AXG_SPDIFOUT tristate "Amlogic AXG SPDIF Output Support" select SND_PCM_IEC958 imply SND_SOC_SPDIF + imply COMMON_CLK_AXG_AUDIO help Select Y or M to add support for SPDIF output serializer embedded in the Amlogic AXG SoC family +config SND_MESON_AXG_PDM + tristate "Amlogic AXG PDM Input Support" + imply SND_SOC_DMIC + imply COMMON_CLK_AXG_AUDIO + help + Select Y or M to add support for PDM input embedded + in the Amlogic AXG SoC family + endmenu diff --git a/sound/soc/meson/Makefile b/sound/soc/meson/Makefile index c5e003b093db..4cd25104029d 100644 --- a/sound/soc/meson/Makefile +++ b/sound/soc/meson/Makefile @@ -9,6 +9,7 @@ snd-soc-meson-axg-tdmin-objs := axg-tdmin.o snd-soc-meson-axg-tdmout-objs := axg-tdmout.o snd-soc-meson-axg-sound-card-objs := axg-card.o snd-soc-meson-axg-spdifout-objs := axg-spdifout.o +snd-soc-meson-axg-pdm-objs := axg-pdm.o obj-$(CONFIG_SND_MESON_AXG_FIFO) += snd-soc-meson-axg-fifo.o obj-$(CONFIG_SND_MESON_AXG_FRDDR) += snd-soc-meson-axg-frddr.o @@ -19,3 +20,4 @@ obj-$(CONFIG_SND_MESON_AXG_TDMIN) += snd-soc-meson-axg-tdmin.o obj-$(CONFIG_SND_MESON_AXG_TDMOUT) += snd-soc-meson-axg-tdmout.o obj-$(CONFIG_SND_MESON_AXG_SOUND_CARD) += snd-soc-meson-axg-sound-card.o obj-$(CONFIG_SND_MESON_AXG_SPDIFOUT) += snd-soc-meson-axg-spdifout.o +obj-$(CONFIG_SND_MESON_AXG_PDM) += snd-soc-meson-axg-pdm.o diff --git a/sound/soc/meson/axg-card.c b/sound/soc/meson/axg-card.c index 2914ba0d965b..aa54d2c612c9 100644 --- a/sound/soc/meson/axg-card.c +++ b/sound/soc/meson/axg-card.c @@ -97,14 +97,14 @@ static void axg_card_clean_references(struct axg_card *priv) { struct snd_soc_card *card = &priv->card; struct snd_soc_dai_link *link; + struct snd_soc_dai_link_component *codec; int i, j; if (card->dai_link) { - for (i = 0; i < card->num_links; i++) { - link = &card->dai_link[i]; + for_each_card_prelinks(card, i, link) { of_node_put(link->cpu_of_node); - for (j = 0; j < link->num_codecs; j++) - of_node_put(link->codecs[j].of_node); + for_each_link_codecs(link, j, codec) + of_node_put(codec->of_node); } } @@ -167,8 +167,7 @@ static int axg_card_tdm_be_hw_params(struct snd_pcm_substream *substream, if (be->mclk_fs) { mclk = params_rate(params) * be->mclk_fs; - for (i = 0; i < rtd->num_codecs; i++) { - codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { ret = snd_soc_dai_set_sysclk(codec_dai, 0, mclk, SND_SOC_CLOCK_IN); if (ret && ret != -ENOTSUPP) @@ -196,8 +195,7 @@ static int axg_card_tdm_dai_init(struct snd_soc_pcm_runtime *rtd) struct snd_soc_dai *codec_dai; int ret, i; - for (i = 0; i < rtd->num_codecs; i++) { - codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { ret = snd_soc_dai_set_tdm_slot(codec_dai, be->codec_masks[i].tx, be->codec_masks[i].rx, @@ -478,7 +476,7 @@ static int axg_card_set_be_link(struct snd_soc_card *card, ret = axg_card_set_link_name(card, link, "be"); if (ret) - dev_err(card->dev, "error setting %s link name\n", np->name); + dev_err(card->dev, "error setting %pOFn link name\n", np); return ret; } diff --git a/sound/soc/meson/axg-fifo.c b/sound/soc/meson/axg-fifo.c index 30262550e37b..0e4f65e654c4 100644 --- a/sound/soc/meson/axg-fifo.c +++ b/sound/soc/meson/axg-fifo.c @@ -203,6 +203,8 @@ static int axg_fifo_pcm_open(struct snd_pcm_substream *ss) ret = request_irq(fifo->irq, axg_fifo_pcm_irq_block, 0, dev_name(dev), ss); + if (ret) + return ret; /* Enable pclk to access registers and clock the fifo ip */ ret = clk_prepare_enable(fifo->pclk); diff --git a/sound/soc/meson/axg-pdm.c b/sound/soc/meson/axg-pdm.c new file mode 100644 index 000000000000..9d5684493ffc --- /dev/null +++ b/sound/soc/meson/axg-pdm.c @@ -0,0 +1,654 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +// +// Copyright (c) 2018 BayLibre, SAS. +// Author: Jerome Brunet + +#include +#include +#include +#include +#include +#include +#include +#include + +#define PDM_CTRL 0x00 +#define PDM_CTRL_EN BIT(31) +#define PDM_CTRL_OUT_MODE BIT(29) +#define PDM_CTRL_BYPASS_MODE BIT(28) +#define PDM_CTRL_RST_FIFO BIT(16) +#define PDM_CTRL_CHAN_RSTN_MASK GENMASK(15, 8) +#define PDM_CTRL_CHAN_RSTN(x) ((x) << 8) +#define PDM_CTRL_CHAN_EN_MASK GENMASK(7, 0) +#define PDM_CTRL_CHAN_EN(x) ((x) << 0) +#define PDM_HCIC_CTRL1 0x04 +#define PDM_FILTER_EN BIT(31) +#define PDM_HCIC_CTRL1_GAIN_SFT_MASK GENMASK(29, 24) +#define PDM_HCIC_CTRL1_GAIN_SFT(x) ((x) << 24) +#define PDM_HCIC_CTRL1_GAIN_MULT_MASK GENMASK(23, 16) +#define PDM_HCIC_CTRL1_GAIN_MULT(x) ((x) << 16) +#define PDM_HCIC_CTRL1_DSR_MASK GENMASK(8, 4) +#define PDM_HCIC_CTRL1_DSR(x) ((x) << 4) +#define PDM_HCIC_CTRL1_STAGE_NUM_MASK GENMASK(3, 0) +#define PDM_HCIC_CTRL1_STAGE_NUM(x) ((x) << 0) +#define PDM_HCIC_CTRL2 0x08 +#define PDM_F1_CTRL 0x0c +#define PDM_LPF_ROUND_MODE_MASK GENMASK(17, 16) +#define PDM_LPF_ROUND_MODE(x) ((x) << 16) +#define PDM_LPF_DSR_MASK GENMASK(15, 12) +#define PDM_LPF_DSR(x) ((x) << 12) +#define PDM_LPF_STAGE_NUM_MASK GENMASK(8, 0) +#define PDM_LPF_STAGE_NUM(x) ((x) << 0) +#define PDM_LPF_MAX_STAGE 336 +#define PDM_LPF_NUM 3 +#define PDM_F2_CTRL 0x10 +#define PDM_F3_CTRL 0x14 +#define PDM_HPF_CTRL 0x18 +#define PDM_HPF_SFT_STEPS_MASK GENMASK(20, 16) +#define PDM_HPF_SFT_STEPS(x) ((x) << 16) +#define PDM_HPF_OUT_FACTOR_MASK GENMASK(15, 0) +#define PDM_HPF_OUT_FACTOR(x) ((x) << 0) +#define PDM_CHAN_CTRL 0x1c +#define PDM_CHAN_CTRL_POINTER_WIDTH 8 +#define PDM_CHAN_CTRL_POINTER_MAX ((1 << PDM_CHAN_CTRL_POINTER_WIDTH) - 1) +#define PDM_CHAN_CTRL_NUM 4 +#define PDM_CHAN_CTRL1 0x20 +#define PDM_COEFF_ADDR 0x24 +#define PDM_COEFF_DATA 0x28 +#define PDM_CLKG_CTRL 0x2c +#define PDM_STS 0x30 + +struct axg_pdm_lpf { + unsigned int ds; + unsigned int round_mode; + const unsigned int *tap; + unsigned int tap_num; +}; + +struct axg_pdm_hcic { + unsigned int shift; + unsigned int mult; + unsigned int steps; + unsigned int ds; +}; + +struct axg_pdm_hpf { + unsigned int out_factor; + unsigned int steps; +}; + +struct axg_pdm_filters { + struct axg_pdm_hcic hcic; + struct axg_pdm_hpf hpf; + struct axg_pdm_lpf lpf[PDM_LPF_NUM]; +}; + +struct axg_pdm_cfg { + const struct axg_pdm_filters *filters; + unsigned int sys_rate; +}; + +struct axg_pdm { + const struct axg_pdm_cfg *cfg; + struct regmap *map; + struct clk *dclk; + struct clk *sysclk; + struct clk *pclk; +}; + +static void axg_pdm_enable(struct regmap *map) +{ + /* Reset AFIFO */ + regmap_update_bits(map, PDM_CTRL, PDM_CTRL_RST_FIFO, PDM_CTRL_RST_FIFO); + regmap_update_bits(map, PDM_CTRL, PDM_CTRL_RST_FIFO, 0); + + /* Enable PDM */ + regmap_update_bits(map, PDM_CTRL, PDM_CTRL_EN, PDM_CTRL_EN); +} + +static void axg_pdm_disable(struct regmap *map) +{ + regmap_update_bits(map, PDM_CTRL, PDM_CTRL_EN, 0); +} + +static void axg_pdm_filters_enable(struct regmap *map, bool enable) +{ + unsigned int val = enable ? PDM_FILTER_EN : 0; + + regmap_update_bits(map, PDM_HCIC_CTRL1, PDM_FILTER_EN, val); + regmap_update_bits(map, PDM_F1_CTRL, PDM_FILTER_EN, val); + regmap_update_bits(map, PDM_F2_CTRL, PDM_FILTER_EN, val); + regmap_update_bits(map, PDM_F3_CTRL, PDM_FILTER_EN, val); + regmap_update_bits(map, PDM_HPF_CTRL, PDM_FILTER_EN, val); +} + +static int axg_pdm_trigger(struct snd_pcm_substream *substream, int cmd, + struct snd_soc_dai *dai) +{ + struct axg_pdm *priv = snd_soc_dai_get_drvdata(dai); + + switch (cmd) { + case SNDRV_PCM_TRIGGER_START: + case SNDRV_PCM_TRIGGER_RESUME: + case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: + axg_pdm_enable(priv->map); + return 0; + + case SNDRV_PCM_TRIGGER_STOP: + case SNDRV_PCM_TRIGGER_SUSPEND: + case SNDRV_PCM_TRIGGER_PAUSE_PUSH: + axg_pdm_disable(priv->map); + return 0; + + default: + return -EINVAL; + } +} + +static unsigned int axg_pdm_get_os(struct axg_pdm *priv) +{ + const struct axg_pdm_filters *filters = priv->cfg->filters; + unsigned int os = filters->hcic.ds; + int i; + + /* + * The global oversampling factor is defined by the down sampling + * factor applied by each filter (HCIC and LPFs) + */ + + for (i = 0; i < PDM_LPF_NUM; i++) + os *= filters->lpf[i].ds; + + return os; +} + +static int axg_pdm_set_sysclk(struct axg_pdm *priv, unsigned int os, + unsigned int rate) +{ + unsigned int sys_rate = os * 2 * rate * PDM_CHAN_CTRL_POINTER_MAX; + + /* + * Set the default system clock rate unless it is too fast for + * for the requested sample rate. In this case, the sample pointer + * counter could overflow so set a lower system clock rate + */ + if (sys_rate < priv->cfg->sys_rate) + return clk_set_rate(priv->sysclk, sys_rate); + + return clk_set_rate(priv->sysclk, priv->cfg->sys_rate); +} + +static int axg_pdm_set_sample_pointer(struct axg_pdm *priv) +{ + unsigned int spmax, sp, val; + int i; + + /* Max sample counter value per half period of dclk */ + spmax = DIV_ROUND_UP_ULL((u64)clk_get_rate(priv->sysclk), + clk_get_rate(priv->dclk) * 2); + + /* Check if sysclk is not too fast - should not happen */ + if (WARN_ON(spmax > PDM_CHAN_CTRL_POINTER_MAX)) + return -EINVAL; + + /* Capture the data when we are at 75% of the half period */ + sp = spmax * 3 / 4; + + for (i = 0, val = 0; i < PDM_CHAN_CTRL_NUM; i++) + val |= sp << (PDM_CHAN_CTRL_POINTER_WIDTH * i); + + regmap_write(priv->map, PDM_CHAN_CTRL, val); + regmap_write(priv->map, PDM_CHAN_CTRL1, val); + + return 0; +} + +static void axg_pdm_set_channel_mask(struct axg_pdm *priv, + unsigned int channels) +{ + unsigned int mask = GENMASK(channels - 1, 0); + + /* Put all channel in reset */ + regmap_update_bits(priv->map, PDM_CTRL, + PDM_CTRL_CHAN_RSTN_MASK, 0); + + /* Take the necessary channels out of reset and enable them */ + regmap_update_bits(priv->map, PDM_CTRL, + PDM_CTRL_CHAN_RSTN_MASK | + PDM_CTRL_CHAN_EN_MASK, + PDM_CTRL_CHAN_RSTN(mask) | + PDM_CTRL_CHAN_EN(mask)); +} + +static int axg_pdm_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, + struct snd_soc_dai *dai) +{ + struct axg_pdm *priv = snd_soc_dai_get_drvdata(dai); + unsigned int os = axg_pdm_get_os(priv); + unsigned int rate = params_rate(params); + unsigned int val; + int ret; + + switch (params_width(params)) { + case 24: + val = PDM_CTRL_OUT_MODE; + break; + case 32: + val = 0; + break; + default: + dev_err(dai->dev, "unsupported sample width\n"); + return -EINVAL; + } + + regmap_update_bits(priv->map, PDM_CTRL, PDM_CTRL_OUT_MODE, val); + + ret = axg_pdm_set_sysclk(priv, os, rate); + if (ret) { + dev_err(dai->dev, "failed to set system clock\n"); + return ret; + } + + ret = clk_set_rate(priv->dclk, rate * os); + if (ret) { + dev_err(dai->dev, "failed to set dclk\n"); + return ret; + } + + ret = axg_pdm_set_sample_pointer(priv); + if (ret) { + dev_err(dai->dev, "invalid clock setting\n"); + return ret; + } + + axg_pdm_set_channel_mask(priv, params_channels(params)); + + return 0; +} + +static int axg_pdm_startup(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai) +{ + struct axg_pdm *priv = snd_soc_dai_get_drvdata(dai); + int ret; + + ret = clk_prepare_enable(priv->dclk); + if (ret) { + dev_err(dai->dev, "enabling dclk failed\n"); + return ret; + } + + /* Enable the filters */ + axg_pdm_filters_enable(priv->map, true); + + return ret; +} + +static void axg_pdm_shutdown(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai) +{ + struct axg_pdm *priv = snd_soc_dai_get_drvdata(dai); + + axg_pdm_filters_enable(priv->map, false); + clk_disable_unprepare(priv->dclk); +} + +static const struct snd_soc_dai_ops axg_pdm_dai_ops = { + .trigger = axg_pdm_trigger, + .hw_params = axg_pdm_hw_params, + .startup = axg_pdm_startup, + .shutdown = axg_pdm_shutdown, +}; + +static void axg_pdm_set_hcic_ctrl(struct axg_pdm *priv) +{ + const struct axg_pdm_hcic *hcic = &priv->cfg->filters->hcic; + unsigned int val; + + val = PDM_HCIC_CTRL1_STAGE_NUM(hcic->steps); + val |= PDM_HCIC_CTRL1_DSR(hcic->ds); + val |= PDM_HCIC_CTRL1_GAIN_MULT(hcic->mult); + val |= PDM_HCIC_CTRL1_GAIN_SFT(hcic->shift); + + regmap_update_bits(priv->map, PDM_HCIC_CTRL1, + PDM_HCIC_CTRL1_STAGE_NUM_MASK | + PDM_HCIC_CTRL1_DSR_MASK | + PDM_HCIC_CTRL1_GAIN_MULT_MASK | + PDM_HCIC_CTRL1_GAIN_SFT_MASK, + val); +} + +static void axg_pdm_set_lpf_ctrl(struct axg_pdm *priv, unsigned int index) +{ + const struct axg_pdm_lpf *lpf = &priv->cfg->filters->lpf[index]; + unsigned int offset = index * regmap_get_reg_stride(priv->map) + + PDM_F1_CTRL; + unsigned int val; + + val = PDM_LPF_STAGE_NUM(lpf->tap_num); + val |= PDM_LPF_DSR(lpf->ds); + val |= PDM_LPF_ROUND_MODE(lpf->round_mode); + + regmap_update_bits(priv->map, offset, + PDM_LPF_STAGE_NUM_MASK | + PDM_LPF_DSR_MASK | + PDM_LPF_ROUND_MODE_MASK, + val); +} + +static void axg_pdm_set_hpf_ctrl(struct axg_pdm *priv) +{ + const struct axg_pdm_hpf *hpf = &priv->cfg->filters->hpf; + unsigned int val; + + val = PDM_HPF_OUT_FACTOR(hpf->out_factor); + val |= PDM_HPF_SFT_STEPS(hpf->steps); + + regmap_update_bits(priv->map, PDM_HPF_CTRL, + PDM_HPF_OUT_FACTOR_MASK | + PDM_HPF_SFT_STEPS_MASK, + val); +} + +static int axg_pdm_set_lpf_filters(struct axg_pdm *priv) +{ + const struct axg_pdm_lpf *lpf = priv->cfg->filters->lpf; + unsigned int count = 0; + int i, j; + + for (i = 0; i < PDM_LPF_NUM; i++) + count += lpf[i].tap_num; + + /* Make sure the coeffs fit in the memory */ + if (count >= PDM_LPF_MAX_STAGE) + return -EINVAL; + + /* Set the initial APB bus register address */ + regmap_write(priv->map, PDM_COEFF_ADDR, 0); + + /* Set the tap filter values of all 3 filters */ + for (i = 0; i < PDM_LPF_NUM; i++) { + axg_pdm_set_lpf_ctrl(priv, i); + + for (j = 0; j < lpf[i].tap_num; j++) + regmap_write(priv->map, PDM_COEFF_DATA, lpf[i].tap[j]); + } + + return 0; +} + +static int axg_pdm_dai_probe(struct snd_soc_dai *dai) +{ + struct axg_pdm *priv = snd_soc_dai_get_drvdata(dai); + int ret; + + ret = clk_prepare_enable(priv->pclk); + if (ret) { + dev_err(dai->dev, "enabling pclk failed\n"); + return ret; + } + + /* + * sysclk must be set and enabled as well to access the pdm registers + * Accessing the register w/o it will give a bus error. + */ + ret = clk_set_rate(priv->sysclk, priv->cfg->sys_rate); + if (ret) { + dev_err(dai->dev, "setting sysclk failed\n"); + goto err_pclk; + } + + ret = clk_prepare_enable(priv->sysclk); + if (ret) { + dev_err(dai->dev, "enabling sysclk failed\n"); + goto err_pclk; + } + + /* Make sure the device is initially disabled */ + axg_pdm_disable(priv->map); + + /* Make sure filter bypass is disabled */ + regmap_update_bits(priv->map, PDM_CTRL, PDM_CTRL_BYPASS_MODE, 0); + + /* Load filter settings */ + axg_pdm_set_hcic_ctrl(priv); + axg_pdm_set_hpf_ctrl(priv); + + ret = axg_pdm_set_lpf_filters(priv); + if (ret) { + dev_err(dai->dev, "invalid filter configuration\n"); + goto err_sysclk; + } + + return 0; + +err_sysclk: + clk_disable_unprepare(priv->sysclk); +err_pclk: + clk_disable_unprepare(priv->pclk); + return ret; +} + +static int axg_pdm_dai_remove(struct snd_soc_dai *dai) +{ + struct axg_pdm *priv = snd_soc_dai_get_drvdata(dai); + + clk_disable_unprepare(priv->sysclk); + clk_disable_unprepare(priv->pclk); + + return 0; +} + +static struct snd_soc_dai_driver axg_pdm_dai_drv = { + .name = "PDM", + .capture = { + .stream_name = "Capture", + .channels_min = 1, + .channels_max = 8, + .rates = SNDRV_PCM_RATE_CONTINUOUS, + .rate_min = 5512, + .rate_max = 48000, + .formats = (SNDRV_PCM_FMTBIT_S24_LE | + SNDRV_PCM_FMTBIT_S32_LE), + }, + .ops = &axg_pdm_dai_ops, + .probe = axg_pdm_dai_probe, + .remove = axg_pdm_dai_remove, +}; + +static const struct snd_soc_component_driver axg_pdm_component_drv = {}; + +static const struct regmap_config axg_pdm_regmap_cfg = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, + .max_register = PDM_STS, +}; + +static const unsigned int lpf1_default_tap[] = { + 0x000014, 0xffffb2, 0xfffed9, 0xfffdce, 0xfffd45, + 0xfffe32, 0x000147, 0x000645, 0x000b86, 0x000e21, + 0x000ae3, 0x000000, 0xffeece, 0xffdca8, 0xffd212, + 0xffd7d1, 0xfff2a7, 0x001f4c, 0x0050c2, 0x0072aa, + 0x006ff1, 0x003c32, 0xffdc4e, 0xff6a18, 0xff0fef, + 0xfefbaf, 0xff4c40, 0x000000, 0x00ebc8, 0x01c077, + 0x02209e, 0x01c1a4, 0x008e60, 0xfebe52, 0xfcd690, + 0xfb8fa5, 0xfba498, 0xfd9812, 0x0181ce, 0x06f5f3, + 0x0d112f, 0x12a958, 0x169686, 0x18000e, 0x169686, + 0x12a958, 0x0d112f, 0x06f5f3, 0x0181ce, 0xfd9812, + 0xfba498, 0xfb8fa5, 0xfcd690, 0xfebe52, 0x008e60, + 0x01c1a4, 0x02209e, 0x01c077, 0x00ebc8, 0x000000, + 0xff4c40, 0xfefbaf, 0xff0fef, 0xff6a18, 0xffdc4e, + 0x003c32, 0x006ff1, 0x0072aa, 0x0050c2, 0x001f4c, + 0xfff2a7, 0xffd7d1, 0xffd212, 0xffdca8, 0xffeece, + 0x000000, 0x000ae3, 0x000e21, 0x000b86, 0x000645, + 0x000147, 0xfffe32, 0xfffd45, 0xfffdce, 0xfffed9, + 0xffffb2, 0x000014, +}; + +static const unsigned int lpf2_default_tap[] = { + 0x00050a, 0xfff004, 0x0002c1, 0x003c12, 0xffa818, + 0xffc87d, 0x010aef, 0xff5223, 0xfebd93, 0x028f41, + 0xff5c0e, 0xfc63f8, 0x055f81, 0x000000, 0xf478a0, + 0x11c5e3, 0x2ea74d, 0x11c5e3, 0xf478a0, 0x000000, + 0x055f81, 0xfc63f8, 0xff5c0e, 0x028f41, 0xfebd93, + 0xff5223, 0x010aef, 0xffc87d, 0xffa818, 0x003c12, + 0x0002c1, 0xfff004, 0x00050a, +}; + +static const unsigned int lpf3_default_tap[] = { + 0x000000, 0x000081, 0x000000, 0xfffedb, 0x000000, + 0x00022d, 0x000000, 0xfffc46, 0x000000, 0x0005f7, + 0x000000, 0xfff6eb, 0x000000, 0x000d4e, 0x000000, + 0xffed1e, 0x000000, 0x001a1c, 0x000000, 0xffdcb0, + 0x000000, 0x002ede, 0x000000, 0xffc2d1, 0x000000, + 0x004ebe, 0x000000, 0xff9beb, 0x000000, 0x007dd7, + 0x000000, 0xff633a, 0x000000, 0x00c1d2, 0x000000, + 0xff11d5, 0x000000, 0x012368, 0x000000, 0xfe9c45, + 0x000000, 0x01b252, 0x000000, 0xfdebf6, 0x000000, + 0x0290b8, 0x000000, 0xfcca0d, 0x000000, 0x041d7c, + 0x000000, 0xfa8152, 0x000000, 0x07e9c6, 0x000000, + 0xf28fb5, 0x000000, 0x28b216, 0x3fffde, 0x28b216, + 0x000000, 0xf28fb5, 0x000000, 0x07e9c6, 0x000000, + 0xfa8152, 0x000000, 0x041d7c, 0x000000, 0xfcca0d, + 0x000000, 0x0290b8, 0x000000, 0xfdebf6, 0x000000, + 0x01b252, 0x000000, 0xfe9c45, 0x000000, 0x012368, + 0x000000, 0xff11d5, 0x000000, 0x00c1d2, 0x000000, + 0xff633a, 0x000000, 0x007dd7, 0x000000, 0xff9beb, + 0x000000, 0x004ebe, 0x000000, 0xffc2d1, 0x000000, + 0x002ede, 0x000000, 0xffdcb0, 0x000000, 0x001a1c, + 0x000000, 0xffed1e, 0x000000, 0x000d4e, 0x000000, + 0xfff6eb, 0x000000, 0x0005f7, 0x000000, 0xfffc46, + 0x000000, 0x00022d, 0x000000, 0xfffedb, 0x000000, + 0x000081, 0x000000, +}; + +/* + * These values are sane defaults for the axg platform: + * - OS = 64 + * - Latency = 38700 (?) + * + * TODO: There is a lot of different HCIC, LPFs and HPF configurations possible. + * the configuration may depend on the dmic used by the platform, the + * expected tradeoff between latency and quality, etc ... If/When other + * settings are required, we should add a fw interface to this driver to + * load new filter settings. + */ +static const struct axg_pdm_filters axg_default_filters = { + .hcic = { + .shift = 0x15, + .mult = 0x80, + .steps = 7, + .ds = 8, + }, + .hpf = { + .out_factor = 0x8000, + .steps = 13, + }, + .lpf = { + [0] = { + .ds = 2, + .round_mode = 1, + .tap = lpf1_default_tap, + .tap_num = ARRAY_SIZE(lpf1_default_tap), + }, + [1] = { + .ds = 2, + .round_mode = 0, + .tap = lpf2_default_tap, + .tap_num = ARRAY_SIZE(lpf2_default_tap), + }, + [2] = { + .ds = 2, + .round_mode = 1, + .tap = lpf3_default_tap, + .tap_num = ARRAY_SIZE(lpf3_default_tap) + }, + }, +}; + +static const struct axg_pdm_cfg axg_pdm_config = { + .filters = &axg_default_filters, + .sys_rate = 250000000, +}; + +static const struct of_device_id axg_pdm_of_match[] = { + { + .compatible = "amlogic,axg-pdm", + .data = &axg_pdm_config, + }, {} +}; +MODULE_DEVICE_TABLE(of, axg_pdm_of_match); + +static int axg_pdm_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct axg_pdm *priv; + struct resource *res; + void __iomem *regs; + int ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + platform_set_drvdata(pdev, priv); + + priv->cfg = of_device_get_match_data(dev); + if (!priv->cfg) { + dev_err(dev, "failed to match device\n"); + return -ENODEV; + } + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + regs = devm_ioremap_resource(dev, res); + if (IS_ERR(regs)) + return PTR_ERR(regs); + + priv->map = devm_regmap_init_mmio(dev, regs, &axg_pdm_regmap_cfg); + if (IS_ERR(priv->map)) { + dev_err(dev, "failed to init regmap: %ld\n", + PTR_ERR(priv->map)); + return PTR_ERR(priv->map); + } + + priv->pclk = devm_clk_get(dev, "pclk"); + if (IS_ERR(priv->pclk)) { + ret = PTR_ERR(priv->pclk); + if (ret != -EPROBE_DEFER) + dev_err(dev, "failed to get pclk: %d\n", ret); + return ret; + } + + priv->dclk = devm_clk_get(dev, "dclk"); + if (IS_ERR(priv->dclk)) { + ret = PTR_ERR(priv->dclk); + if (ret != -EPROBE_DEFER) + dev_err(dev, "failed to get dclk: %d\n", ret); + return ret; + } + + priv->sysclk = devm_clk_get(dev, "sysclk"); + if (IS_ERR(priv->sysclk)) { + ret = PTR_ERR(priv->sysclk); + if (ret != -EPROBE_DEFER) + dev_err(dev, "failed to get dclk: %d\n", ret); + return ret; + } + + return devm_snd_soc_register_component(dev, &axg_pdm_component_drv, + &axg_pdm_dai_drv, 1); +} + +static struct platform_driver axg_pdm_pdrv = { + .probe = axg_pdm_probe, + .driver = { + .name = "axg-pdm", + .of_match_table = axg_pdm_of_match, + }, +}; +module_platform_driver(axg_pdm_pdrv); + +MODULE_DESCRIPTION("Amlogic AXG PDM Input driver"); +MODULE_AUTHOR("Jerome Brunet "); +MODULE_LICENSE("GPL v2"); diff --git a/sound/soc/meson/axg-tdm-interface.c b/sound/soc/meson/axg-tdm-interface.c index 7b8baf46d968..585ce030b79b 100644 --- a/sound/soc/meson/axg-tdm-interface.c +++ b/sound/soc/meson/axg-tdm-interface.c @@ -42,6 +42,7 @@ int axg_tdm_set_tdm_slots(struct snd_soc_dai *dai, u32 *tx_mask, struct axg_tdm_stream *rx = (struct axg_tdm_stream *) dai->capture_dma_data; unsigned int tx_slots, rx_slots; + unsigned int fmt = 0; tx_slots = axg_tdm_slots_total(tx_mask); rx_slots = axg_tdm_slots_total(rx_mask); @@ -52,38 +53,45 @@ int axg_tdm_set_tdm_slots(struct snd_soc_dai *dai, u32 *tx_mask, return -EINVAL; } - /* - * Amend the dai driver channel number and let dpcm channel merge do - * its job - */ - if (tx) { - tx->mask = tx_mask; - dai->driver->playback.channels_max = tx_slots; - } - - if (rx) { - rx->mask = rx_mask; - dai->driver->capture.channels_max = rx_slots; - } - iface->slots = slots; switch (slot_width) { case 0: - /* defaults width to 32 if not provided */ - iface->slot_width = 32; - break; - case 8: - case 16: - case 24: + slot_width = 32; + /* Fall-through */ case 32: - iface->slot_width = slot_width; + fmt |= SNDRV_PCM_FMTBIT_S32_LE; + /* Fall-through */ + case 24: + fmt |= SNDRV_PCM_FMTBIT_S24_LE; + fmt |= SNDRV_PCM_FMTBIT_S20_LE; + /* Fall-through */ + case 16: + fmt |= SNDRV_PCM_FMTBIT_S16_LE; + /* Fall-through */ + case 8: + fmt |= SNDRV_PCM_FMTBIT_S8; break; default: dev_err(dai->dev, "unsupported slot width: %d\n", slot_width); return -EINVAL; } + iface->slot_width = slot_width; + + /* Amend the dai driver and let dpcm merge do its job */ + if (tx) { + tx->mask = tx_mask; + dai->driver->playback.channels_max = tx_slots; + dai->driver->playback.formats = fmt; + } + + if (rx) { + rx->mask = rx_mask; + dai->driver->capture.channels_max = rx_slots; + dai->driver->capture.formats = fmt; + } + return 0; } EXPORT_SYMBOL_GPL(axg_tdm_set_tdm_slots); diff --git a/sound/soc/nuc900/nuc900-ac97.c b/sound/soc/nuc900/nuc900-ac97.c index 81b09d740ed9..6384bb6dacfd 100644 --- a/sound/soc/nuc900/nuc900-ac97.c +++ b/sound/soc/nuc900/nuc900-ac97.c @@ -356,7 +356,7 @@ static int nuc900_ac97_drvprobe(struct platform_device *pdev) if (ret) goto out; - ret = snd_soc_register_component(&pdev->dev, &nuc900_ac97_component, + ret = devm_snd_soc_register_component(&pdev->dev, &nuc900_ac97_component, &nuc900_ac97_dai, 1); if (ret) goto out; @@ -373,8 +373,6 @@ out: static int nuc900_ac97_drvremove(struct platform_device *pdev) { - snd_soc_unregister_component(&pdev->dev); - nuc900_ac97_data = NULL; snd_soc_set_ac97_ops(NULL); diff --git a/sound/soc/omap/omap-hdmi-audio.c b/sound/soc/omap/omap-hdmi-audio.c index 8a99a8837dc9..673a9eb153b2 100644 --- a/sound/soc/omap/omap-hdmi-audio.c +++ b/sound/soc/omap/omap-hdmi-audio.c @@ -348,7 +348,7 @@ static int omap_hdmi_audio_probe(struct platform_device *pdev) default: return -EINVAL; } - ret = snd_soc_register_component(ad->dssdev, &omap_hdmi_component, + ret = devm_snd_soc_register_component(ad->dssdev, &omap_hdmi_component, dai_drv, 1); if (ret) return ret; @@ -383,7 +383,6 @@ static int omap_hdmi_audio_probe(struct platform_device *pdev) ret = snd_soc_register_card(card); if (ret) { dev_err(dev, "snd_soc_register_card failed (%d)\n", ret); - snd_soc_unregister_component(ad->dssdev); return ret; } @@ -400,7 +399,6 @@ static int omap_hdmi_audio_remove(struct platform_device *pdev) struct hdmi_audio_data *ad = platform_get_drvdata(pdev); snd_soc_unregister_card(ad->card); - snd_soc_unregister_component(ad->dssdev); return 0; } diff --git a/sound/soc/pxa/Kconfig b/sound/soc/pxa/Kconfig index 776e148b0aa2..943b44de1464 100644 --- a/sound/soc/pxa/Kconfig +++ b/sound/soc/pxa/Kconfig @@ -19,14 +19,13 @@ config SND_MMP_SOC config SND_PXA2XX_AC97 tristate - select SND_AC97_CODEC config SND_PXA2XX_SOC_AC97 tristate - select AC97_BUS + select AC97_BUS_NEW select SND_PXA2XX_LIB select SND_PXA2XX_LIB_AC97 - select SND_SOC_AC97_BUS + select SND_SOC_AC97_BUS_NEW config SND_PXA2XX_SOC_I2S select SND_PXA2XX_LIB @@ -80,6 +79,7 @@ config SND_PXA2XX_SOC_TOSA tristate "SoC AC97 Audio support for Tosa" depends on SND_PXA2XX_SOC && MACH_TOSA depends on MFD_TC6393XB + depends on !AC97_BUS select SND_PXA2XX_SOC_AC97 select SND_SOC_WM9712 help @@ -89,6 +89,7 @@ config SND_PXA2XX_SOC_TOSA config SND_PXA2XX_SOC_E740 tristate "SoC AC97 Audio support for e740" depends on SND_PXA2XX_SOC && MACH_E740 + depends on !AC97_BUS select SND_SOC_WM9705 select SND_PXA2XX_SOC_AC97 help @@ -98,6 +99,7 @@ config SND_PXA2XX_SOC_E740 config SND_PXA2XX_SOC_E750 tristate "SoC AC97 Audio support for e750" depends on SND_PXA2XX_SOC && MACH_E750 + depends on !AC97_BUS select SND_SOC_WM9705 select SND_PXA2XX_SOC_AC97 help @@ -107,6 +109,7 @@ config SND_PXA2XX_SOC_E750 config SND_PXA2XX_SOC_E800 tristate "SoC AC97 Audio support for e800" depends on SND_PXA2XX_SOC && MACH_E800 + depends on !AC97_BUS select SND_SOC_WM9712 select SND_PXA2XX_SOC_AC97 help @@ -117,6 +120,7 @@ config SND_PXA2XX_SOC_EM_X270 tristate "SoC Audio support for CompuLab EM-x270, eXeda and CM-X300" depends on SND_PXA2XX_SOC && (MACH_EM_X270 || MACH_EXEDA || \ MACH_CM_X300) + depends on !AC97_BUS select SND_PXA2XX_SOC_AC97 select SND_SOC_WM9712 help @@ -127,6 +131,7 @@ config SND_PXA2XX_SOC_PALM27X bool "SoC Audio support for Palm T|X, T5, E2 and LifeDrive" depends on SND_PXA2XX_SOC && (MACH_PALMLD || MACH_PALMTX || \ MACH_PALMT5 || MACH_PALMTE2) + depends on !AC97_BUS select SND_PXA2XX_SOC_AC97 select SND_SOC_WM9712 help @@ -156,6 +161,7 @@ config SND_SOC_TTC_DKB config SND_SOC_ZYLONITE tristate "SoC Audio support for Marvell Zylonite" depends on SND_PXA2XX_SOC && MACH_ZYLONITE + depends on !AC97_BUS select SND_PXA2XX_SOC_AC97 select SND_PXA_SOC_SSP select SND_SOC_WM9713 @@ -195,6 +201,7 @@ config SND_PXA2XX_SOC_MAGICIAN config SND_PXA2XX_SOC_MIOA701 tristate "SoC Audio support for MIO A701" depends on SND_PXA2XX_SOC && MACH_MIOA701 + depends on !AC97_BUS select SND_PXA2XX_SOC_AC97 select SND_SOC_WM9713 help diff --git a/sound/soc/pxa/pxa-ssp.c b/sound/soc/pxa/pxa-ssp.c index 69033e1a84e6..adcf8ba9d287 100644 --- a/sound/soc/pxa/pxa-ssp.c +++ b/sound/soc/pxa/pxa-ssp.c @@ -103,6 +103,9 @@ static int pxa_ssp_startup(struct snd_pcm_substream *substream, pxa_ssp_disable(ssp); } + if (priv->extclk) + clk_prepare_enable(priv->extclk); + dma = kzalloc(sizeof(struct snd_dmaengine_dai_dma_data), GFP_KERNEL); if (!dma) return -ENOMEM; @@ -125,6 +128,9 @@ static void pxa_ssp_shutdown(struct snd_pcm_substream *substream, clk_disable_unprepare(ssp->clk); } + if (priv->extclk) + clk_disable_unprepare(priv->extclk); + kfree(snd_soc_dai_get_dma_data(cpu_dai, substream)); snd_soc_dai_set_dma_data(cpu_dai, substream, NULL); } diff --git a/sound/soc/pxa/pxa2xx-ac97.c b/sound/soc/pxa/pxa2xx-ac97.c index 9f779657bc86..f8a3aa6c6d4e 100644 --- a/sound/soc/pxa/pxa2xx-ac97.c +++ b/sound/soc/pxa/pxa2xx-ac97.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -27,43 +28,35 @@ #include #include -static void pxa2xx_ac97_warm_reset(struct snd_ac97 *ac97) +static void pxa2xx_ac97_warm_reset(struct ac97_controller *adrv) { pxa2xx_ac97_try_warm_reset(); pxa2xx_ac97_finish_reset(); } -static void pxa2xx_ac97_cold_reset(struct snd_ac97 *ac97) +static void pxa2xx_ac97_cold_reset(struct ac97_controller *adrv) { pxa2xx_ac97_try_cold_reset(); pxa2xx_ac97_finish_reset(); } -static unsigned short pxa2xx_ac97_legacy_read(struct snd_ac97 *ac97, - unsigned short reg) +static int pxa2xx_ac97_read_actrl(struct ac97_controller *adrv, int slot, + unsigned short reg) { - int ret; - - ret = pxa2xx_ac97_read(ac97->num, reg); - if (ret < 0) - return 0; - else - return (unsigned short)(ret & 0xffff); + return pxa2xx_ac97_read(slot, reg); } -static void pxa2xx_ac97_legacy_write(struct snd_ac97 *ac97, - unsigned short reg, unsigned short val) +static int pxa2xx_ac97_write_actrl(struct ac97_controller *adrv, int slot, + unsigned short reg, unsigned short val) { - int ret; - - ret = pxa2xx_ac97_write(ac97->num, reg, val); + return pxa2xx_ac97_write(slot, reg, val); } -static struct snd_ac97_bus_ops pxa2xx_ac97_ops = { - .read = pxa2xx_ac97_legacy_read, - .write = pxa2xx_ac97_legacy_write, +static struct ac97_controller_ops pxa2xx_ac97_ops = { + .read = pxa2xx_ac97_read_actrl, + .write = pxa2xx_ac97_write_actrl, .warm_reset = pxa2xx_ac97_warm_reset, .reset = pxa2xx_ac97_cold_reset, }; @@ -233,6 +226,9 @@ MODULE_DEVICE_TABLE(of, pxa2xx_ac97_dt_ids); static int pxa2xx_ac97_dev_probe(struct platform_device *pdev) { int ret; + struct ac97_controller *ctrl; + pxa2xx_audio_ops_t *pdata = pdev->dev.platform_data; + void **codecs_pdata; if (pdev->id != -1) { dev_err(&pdev->dev, "PXA2xx has only one AC97 port.\n"); @@ -245,10 +241,14 @@ static int pxa2xx_ac97_dev_probe(struct platform_device *pdev) return ret; } - ret = snd_soc_set_ac97_ops(&pxa2xx_ac97_ops); - if (ret != 0) - return ret; + codecs_pdata = pdata ? pdata->codec_pdata : NULL; + ctrl = snd_ac97_controller_register(&pxa2xx_ac97_ops, &pdev->dev, + AC97_SLOTS_AVAILABLE_ALL, + codecs_pdata); + if (IS_ERR(ctrl)) + return PTR_ERR(ctrl); + platform_set_drvdata(pdev, ctrl); /* Punt most of the init to the SoC probe; we may need the machine * driver to do interesting things with the clocking to get us up * and running. @@ -259,8 +259,10 @@ static int pxa2xx_ac97_dev_probe(struct platform_device *pdev) static int pxa2xx_ac97_dev_remove(struct platform_device *pdev) { + struct ac97_controller *ctrl = platform_get_drvdata(pdev); + snd_soc_unregister_component(&pdev->dev); - snd_soc_set_ac97_ops(NULL); + snd_ac97_controller_unregister(ctrl); pxa2xx_ac97_hw_remove(pdev); return 0; } diff --git a/sound/soc/qcom/apq8096.c b/sound/soc/qcom/apq8096.c index 1543e85629f8..fb45f396ab4a 100644 --- a/sound/soc/qcom/apq8096.c +++ b/sound/soc/qcom/apq8096.c @@ -25,13 +25,12 @@ static int apq8096_be_hw_params_fixup(struct snd_soc_pcm_runtime *rtd, static void apq8096_add_be_ops(struct snd_soc_card *card) { - struct snd_soc_dai_link *link = card->dai_link; - int i, num_links = card->num_links; + struct snd_soc_dai_link *link; + int i; - for (i = 0; i < num_links; i++) { + for_each_card_prelinks(card, i, link) { if (link->no_pcm == 1) link->be_hw_params_fixup = apq8096_be_hw_params_fixup; - link++; } } diff --git a/sound/soc/qcom/qdsp6/q6adm.c b/sound/soc/qcom/qdsp6/q6adm.c index 932c3ebfd252..da242515e146 100644 --- a/sound/soc/qcom/qdsp6/q6adm.c +++ b/sound/soc/qcom/qdsp6/q6adm.c @@ -2,25 +2,24 @@ // Copyright (c) 2011-2017, The Linux Foundation. All rights reserved. // Copyright (c) 2018, Linaro Limited -#include -#include -#include #include -#include -#include #include +#include +#include +#include #include #include -#include -#include -#include #include +#include +#include +#include +#include #include #include "q6adm.h" #include "q6afe.h" #include "q6core.h" -#include "q6dsp-errno.h" #include "q6dsp-common.h" +#include "q6dsp-errno.h" #define ADM_CMD_DEVICE_OPEN_V5 0x00010326 #define ADM_CMDRSP_DEVICE_OPEN_V5 0x00010329 diff --git a/sound/soc/qcom/qdsp6/q6asm-dai.c b/sound/soc/qcom/qdsp6/q6asm-dai.c index 9db9a2944ef2..a16c71c03058 100644 --- a/sound/soc/qcom/qdsp6/q6asm-dai.c +++ b/sound/soc/qcom/qdsp6/q6asm-dai.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -319,10 +318,11 @@ static int q6asm_dai_open(struct snd_pcm_substream *substream) prtd->audio_client = q6asm_audio_client_alloc(dev, (q6asm_cb)event_handler, prtd, stream_id, LEGACY_PCM_MODE); - if (!prtd->audio_client) { + if (IS_ERR(prtd->audio_client)) { pr_info("%s: Could not allocate memory\n", __func__); + ret = PTR_ERR(prtd->audio_client); kfree(prtd); - return -ENOMEM; + return ret; } if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) @@ -493,7 +493,7 @@ static int q6asm_dai_pcm_new(struct snd_soc_pcm_runtime *rtd) } } - return ret; + return 0; } static void q6asm_dai_pcm_free(struct snd_pcm *pcm) diff --git a/sound/soc/qcom/qdsp6/q6asm.c b/sound/soc/qcom/qdsp6/q6asm.c index 2b2c7233bb5f..e1cfa846a1dc 100644 --- a/sound/soc/qcom/qdsp6/q6asm.c +++ b/sound/soc/qcom/qdsp6/q6asm.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/sound/soc/qcom/qdsp6/q6core.c b/sound/soc/qcom/qdsp6/q6core.c index 06f03a5fe9bd..cdfc8ab6cfc0 100644 --- a/sound/soc/qcom/qdsp6/q6core.c +++ b/sound/soc/qcom/qdsp6/q6core.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include "q6core.h" #include "q6dsp-errno.h" @@ -105,12 +104,10 @@ static int q6core_callback(struct apr_device *adev, struct apr_resp_pkt *data) bytes = sizeof(*fwk) + fwk->num_services * sizeof(fwk->svc_api_info[0]); - core->fwk_version = kzalloc(bytes, GFP_ATOMIC); + core->fwk_version = kmemdup(data->payload, bytes, GFP_ATOMIC); if (!core->fwk_version) return -ENOMEM; - memcpy(core->fwk_version, data->payload, bytes); - core->fwk_version_supported = true; core->resp_received = true; @@ -124,12 +121,10 @@ static int q6core_callback(struct apr_device *adev, struct apr_resp_pkt *data) len = sizeof(*v) + v->num_services * sizeof(v->svc_api_info[0]); - core->svc_version = kzalloc(len, GFP_ATOMIC); + core->svc_version = kmemdup(data->payload, len, GFP_ATOMIC); if (!core->svc_version) return -ENOMEM; - memcpy(core->svc_version, data->payload, len); - core->get_version_supported = true; core->resp_received = true; diff --git a/sound/soc/qcom/sdm845.c b/sound/soc/qcom/sdm845.c index 2a781d87ee65..9effbecc571f 100644 --- a/sound/soc/qcom/sdm845.c +++ b/sound/soc/qcom/sdm845.c @@ -195,15 +195,14 @@ static int sdm845_be_hw_params_fixup(struct snd_soc_pcm_runtime *rtd, static void sdm845_add_be_ops(struct snd_soc_card *card) { - struct snd_soc_dai_link *link = card->dai_link; - int i, num_links = card->num_links; + struct snd_soc_dai_link *link; + int i; - for (i = 0; i < num_links; i++) { + for_each_card_prelinks(card, i, link) { if (link->no_pcm == 1) { link->ops = &sdm845_be_ops; link->be_hw_params_fixup = sdm845_be_hw_params_fixup; } - link++; } } diff --git a/sound/soc/rockchip/rk3288_hdmi_analog.c b/sound/soc/rockchip/rk3288_hdmi_analog.c index 929b3fe289b0..a472d5eb2950 100644 --- a/sound/soc/rockchip/rk3288_hdmi_analog.c +++ b/sound/soc/rockchip/rk3288_hdmi_analog.c @@ -286,7 +286,6 @@ static struct platform_driver rockchip_sound_driver = { .probe = snd_rk_mc_probe, .driver = { .name = DRV_NAME, - .owner = THIS_MODULE, .pm = &snd_soc_pm_ops, .of_match_table = rockchip_sound_of_match, }, diff --git a/sound/soc/rockchip/rockchip_pcm.c b/sound/soc/rockchip/rockchip_pcm.c index f77538319221..9e7b5fa4cf59 100644 --- a/sound/soc/rockchip/rockchip_pcm.c +++ b/sound/soc/rockchip/rockchip_pcm.c @@ -21,7 +21,8 @@ static const struct snd_pcm_hardware snd_rockchip_hardware = { .info = SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_MMAP_VALID | SNDRV_PCM_INFO_PAUSE | - SNDRV_PCM_INFO_RESUME, + SNDRV_PCM_INFO_RESUME | + SNDRV_PCM_INFO_INTERLEAVED, .period_bytes_min = 32, .period_bytes_max = 8192, .periods_min = 1, diff --git a/sound/soc/samsung/tm2_wm5110.c b/sound/soc/samsung/tm2_wm5110.c index 43332c32d7e9..dc93941e01c3 100644 --- a/sound/soc/samsung/tm2_wm5110.c +++ b/sound/soc/samsung/tm2_wm5110.c @@ -491,6 +491,7 @@ static int tm2_probe(struct platform_device *pdev) struct snd_soc_card *card = &tm2_card; struct tm2_machine_priv *priv; struct of_phandle_args args; + struct snd_soc_dai_link *dai_link; int num_codecs, ret, i; priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); @@ -558,18 +559,18 @@ static int tm2_probe(struct platform_device *pdev) } /* Initialize WM5110 - I2S and HDMI - I2S1 DAI links */ - for (i = 0; i < card->num_links; i++) { + for_each_card_prelinks(card, i, dai_link) { unsigned int dai_index = 0; /* WM5110 */ - card->dai_link[i].cpu_name = NULL; - card->dai_link[i].platform_name = NULL; + dai_link->cpu_name = NULL; + dai_link->platform_name = NULL; if (num_codecs > 1 && i == card->num_links - 1) dai_index = 1; /* HDMI */ - card->dai_link[i].codec_of_node = codec_dai_node[dai_index]; - card->dai_link[i].cpu_of_node = cpu_dai_node[dai_index]; - card->dai_link[i].platform_of_node = cpu_dai_node[dai_index]; + dai_link->codec_of_node = codec_dai_node[dai_index]; + dai_link->cpu_of_node = cpu_dai_node[dai_index]; + dai_link->platform_of_node = cpu_dai_node[dai_index]; } if (num_codecs > 1) { diff --git a/sound/soc/sh/hac.c b/sound/soc/sh/hac.c index c2b496398e6b..17622ceb98c0 100644 --- a/sound/soc/sh/hac.c +++ b/sound/soc/sh/hac.c @@ -319,13 +319,12 @@ static int hac_soc_platform_probe(struct platform_device *pdev) if (ret != 0) return ret; - return snd_soc_register_component(&pdev->dev, &sh4_hac_component, + return devm_snd_soc_register_component(&pdev->dev, &sh4_hac_component, sh4_hac_dai, ARRAY_SIZE(sh4_hac_dai)); } static int hac_soc_platform_remove(struct platform_device *pdev) { - snd_soc_unregister_component(&pdev->dev); snd_soc_set_ac97_ops(NULL); return 0; } diff --git a/sound/soc/sh/rcar/adg.c b/sound/soc/sh/rcar/adg.c index 051f96405346..28327dd2c6cb 100644 --- a/sound/soc/sh/rcar/adg.c +++ b/sound/soc/sh/rcar/adg.c @@ -582,7 +582,7 @@ static void rsnd_adg_clk_dbg_info(struct rsnd_priv *priv, struct rsnd_adg *adg) int i; for_each_rsnd_clk(clk, adg, i) - dev_dbg(dev, "%s : %p : %ld\n", + dev_dbg(dev, "%s : %pa : %ld\n", clk_name[i], clk, clk_get_rate(clk)); dev_dbg(dev, "BRGCKR = 0x%08x, BRRA/BRRB = 0x%x/0x%x\n", @@ -595,7 +595,7 @@ static void rsnd_adg_clk_dbg_info(struct rsnd_priv *priv, struct rsnd_adg *adg) * by BRGCKR::BRGCKR_31 */ for_each_rsnd_clkout(clk, adg, i) - dev_dbg(dev, "clkout %d : %p : %ld\n", i, + dev_dbg(dev, "clkout %d : %pa : %ld\n", i, clk, clk_get_rate(clk)); } #else diff --git a/sound/soc/sh/rcar/core.c b/sound/soc/sh/rcar/core.c index d23c2bbff0cf..f930f51b686f 100644 --- a/sound/soc/sh/rcar/core.c +++ b/sound/soc/sh/rcar/core.c @@ -102,7 +102,9 @@ #include "rsnd.h" #define RSND_RATES SNDRV_PCM_RATE_8000_192000 -#define RSND_FMTS (SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_S16_LE) +#define RSND_FMTS (SNDRV_PCM_FMTBIT_S8 |\ + SNDRV_PCM_FMTBIT_S16_LE |\ + SNDRV_PCM_FMTBIT_S24_LE) static const struct of_device_id rsnd_of_match[] = { { .compatible = "renesas,rcar_sound-gen1", .data = (void *)RSND_GEN1 }, @@ -280,6 +282,8 @@ u32 rsnd_get_adinr_bit(struct rsnd_mod *mod, struct rsnd_dai_stream *io) struct device *dev = rsnd_priv_to_dev(priv); switch (snd_pcm_format_width(runtime->format)) { + case 8: + return 16 << 16; case 16: return 8 << 16; case 24: @@ -331,7 +335,7 @@ u32 rsnd_get_dalign(struct rsnd_mod *mod, struct rsnd_dai_stream *io) target = cmd ? cmd : ssiu; } - /* Non target mod or 24bit data needs normal DALIGN */ + /* Non target mod or non 16bit needs normal DALIGN */ if ((snd_pcm_format_width(runtime->format) != 16) || (mod != target)) return 0x76543210; @@ -367,7 +371,7 @@ u32 rsnd_get_busif_shift(struct rsnd_dai_stream *io, struct rsnd_mod *mod) * HW 24bit data is located as 0x******00 * */ - if (snd_pcm_format_width(runtime->format) == 16) + if (snd_pcm_format_width(runtime->format) != 24) return 0; for (i = 0; i < ARRAY_SIZE(playback_mods); i++) { @@ -540,6 +544,14 @@ int rsnd_rdai_ssi_lane_ctrl(struct rsnd_dai *rdai, return rdai->ssi_lane; } +int rsnd_rdai_width_ctrl(struct rsnd_dai *rdai, int width) +{ + if (width > 0) + rdai->chan_width = width; + + return rdai->chan_width; +} + struct rsnd_dai *rsnd_rdai_get(struct rsnd_priv *priv, int id) { if ((id < 0) || (id >= rsnd_rdai_nr(priv))) @@ -681,6 +693,7 @@ static int rsnd_soc_dai_set_fmt(struct snd_soc_dai *dai, unsigned int fmt) rdai->frm_clk_inv = 0; break; case SND_SOC_DAIFMT_LEFT_J: + case SND_SOC_DAIFMT_DSP_B: rdai->sys_delay = 1; rdai->data_alignment = 0; rdai->frm_clk_inv = 1; @@ -690,6 +703,11 @@ static int rsnd_soc_dai_set_fmt(struct snd_soc_dai *dai, unsigned int fmt) rdai->data_alignment = 1; rdai->frm_clk_inv = 1; break; + case SND_SOC_DAIFMT_DSP_A: + rdai->sys_delay = 0; + rdai->data_alignment = 0; + rdai->frm_clk_inv = 1; + break; } /* set clock inversion */ @@ -720,6 +738,16 @@ static int rsnd_soc_set_dai_tdm_slot(struct snd_soc_dai *dai, struct rsnd_dai *rdai = rsnd_dai_to_rdai(dai); struct device *dev = rsnd_priv_to_dev(priv); + switch (slot_width) { + case 16: + case 24: + case 32: + break; + default: + /* use default */ + slot_width = 32; + } + switch (slots) { case 2: case 6: @@ -727,6 +755,7 @@ static int rsnd_soc_set_dai_tdm_slot(struct snd_soc_dai *dai, /* TDM Extend Mode */ rsnd_rdai_channels_set(rdai, slots); rsnd_rdai_ssi_lane_set(rdai, 1); + rsnd_rdai_width_set(rdai, slot_width); break; default: dev_err(dev, "unsupported TDM slots (%d)\n", slots); @@ -755,7 +784,7 @@ static unsigned int rsnd_soc_hw_rate_list[] = { 192000, }; -static int rsnd_soc_hw_rule(struct rsnd_priv *priv, +static int rsnd_soc_hw_rule(struct rsnd_dai *rdai, unsigned int *list, int list_num, struct snd_interval *baseline, struct snd_interval *iv) { @@ -772,14 +801,14 @@ static int rsnd_soc_hw_rule(struct rsnd_priv *priv, if (!snd_interval_test(iv, list[i])) continue; - rate = rsnd_ssi_clk_query(priv, + rate = rsnd_ssi_clk_query(rdai, baseline->min, list[i], NULL); if (rate > 0) { p.min = min(p.min, list[i]); p.max = max(p.max, list[i]); } - rate = rsnd_ssi_clk_query(priv, + rate = rsnd_ssi_clk_query(rdai, baseline->max, list[i], NULL); if (rate > 0) { p.min = min(p.min, list[i]); @@ -790,17 +819,14 @@ static int rsnd_soc_hw_rule(struct rsnd_priv *priv, return snd_interval_refine(iv, &p); } -static int __rsnd_soc_hw_rule_rate(struct snd_pcm_hw_params *params, - struct snd_pcm_hw_rule *rule, - int is_play) +static int rsnd_soc_hw_rule_rate(struct snd_pcm_hw_params *params, + struct snd_pcm_hw_rule *rule) { struct snd_interval *ic_ = hw_param_interval(params, SNDRV_PCM_HW_PARAM_CHANNELS); struct snd_interval *ir = hw_param_interval(params, SNDRV_PCM_HW_PARAM_RATE); struct snd_interval ic; - struct snd_soc_dai *dai = rule->private; - struct rsnd_dai *rdai = rsnd_dai_to_rdai(dai); - struct rsnd_priv *priv = rsnd_rdai_to_priv(rdai); - struct rsnd_dai_stream *io = is_play ? &rdai->playback : &rdai->capture; + struct rsnd_dai_stream *io = rule->private; + struct rsnd_dai *rdai = rsnd_io_to_rdai(io); /* * possible sampling rate limitation is same as @@ -811,34 +837,19 @@ static int __rsnd_soc_hw_rule_rate(struct snd_pcm_hw_params *params, ic.min = ic.max = rsnd_runtime_channel_for_ssi_with_params(io, params); - return rsnd_soc_hw_rule(priv, rsnd_soc_hw_rate_list, + return rsnd_soc_hw_rule(rdai, rsnd_soc_hw_rate_list, ARRAY_SIZE(rsnd_soc_hw_rate_list), &ic, ir); } -static int rsnd_soc_hw_rule_rate_playback(struct snd_pcm_hw_params *params, - struct snd_pcm_hw_rule *rule) -{ - return __rsnd_soc_hw_rule_rate(params, rule, 1); -} - -static int rsnd_soc_hw_rule_rate_capture(struct snd_pcm_hw_params *params, - struct snd_pcm_hw_rule *rule) -{ - return __rsnd_soc_hw_rule_rate(params, rule, 0); -} - -static int __rsnd_soc_hw_rule_channels(struct snd_pcm_hw_params *params, - struct snd_pcm_hw_rule *rule, - int is_play) +static int rsnd_soc_hw_rule_channels(struct snd_pcm_hw_params *params, + struct snd_pcm_hw_rule *rule) { struct snd_interval *ic_ = hw_param_interval(params, SNDRV_PCM_HW_PARAM_CHANNELS); struct snd_interval *ir = hw_param_interval(params, SNDRV_PCM_HW_PARAM_RATE); struct snd_interval ic; - struct snd_soc_dai *dai = rule->private; - struct rsnd_dai *rdai = rsnd_dai_to_rdai(dai); - struct rsnd_priv *priv = rsnd_rdai_to_priv(rdai); - struct rsnd_dai_stream *io = is_play ? &rdai->playback : &rdai->capture; + struct rsnd_dai_stream *io = rule->private; + struct rsnd_dai *rdai = rsnd_io_to_rdai(io); /* * possible sampling rate limitation is same as @@ -849,23 +860,11 @@ static int __rsnd_soc_hw_rule_channels(struct snd_pcm_hw_params *params, ic.min = ic.max = rsnd_runtime_channel_for_ssi_with_params(io, params); - return rsnd_soc_hw_rule(priv, rsnd_soc_hw_channels_list, + return rsnd_soc_hw_rule(rdai, rsnd_soc_hw_channels_list, ARRAY_SIZE(rsnd_soc_hw_channels_list), ir, &ic); } -static int rsnd_soc_hw_rule_channels_playback(struct snd_pcm_hw_params *params, - struct snd_pcm_hw_rule *rule) -{ - return __rsnd_soc_hw_rule_channels(params, rule, 1); -} - -static int rsnd_soc_hw_rule_channels_capture(struct snd_pcm_hw_params *params, - struct snd_pcm_hw_rule *rule) -{ - return __rsnd_soc_hw_rule_channels(params, rule, 0); -} - static const struct snd_pcm_hardware rsnd_pcm_hardware = { .info = SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_MMAP | @@ -882,12 +881,10 @@ static int rsnd_soc_dai_startup(struct snd_pcm_substream *substream, struct snd_soc_dai *dai) { struct rsnd_dai *rdai = rsnd_dai_to_rdai(dai); - struct rsnd_priv *priv = rsnd_rdai_to_priv(rdai); struct rsnd_dai_stream *io = rsnd_rdai_to_io(rdai, substream); struct snd_pcm_hw_constraint_list *constraint = &rdai->constraint; struct snd_pcm_runtime *runtime = substream->runtime; unsigned int max_channels = rsnd_rdai_channels_get(rdai); - int ret; int i; rsnd_dai_stream_init(io, substream); @@ -922,25 +919,16 @@ static int rsnd_soc_dai_startup(struct snd_pcm_substream *substream, int is_play = substream->stream == SNDRV_PCM_STREAM_PLAYBACK; snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, - is_play ? rsnd_soc_hw_rule_rate_playback : - rsnd_soc_hw_rule_rate_capture, - dai, + rsnd_soc_hw_rule_rate, + is_play ? &rdai->playback : &rdai->capture, SNDRV_PCM_HW_PARAM_CHANNELS, -1); snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_CHANNELS, - is_play ? rsnd_soc_hw_rule_channels_playback : - rsnd_soc_hw_rule_channels_capture, - dai, + rsnd_soc_hw_rule_channels, + is_play ? &rdai->playback : &rdai->capture, SNDRV_PCM_HW_PARAM_RATE, -1); } - /* - * call rsnd_dai_call without spinlock - */ - ret = rsnd_dai_call(nolock_start, io, priv); - if (ret < 0) - rsnd_dai_call(nolock_stop, io, priv); - - return ret; + return 0; } static void rsnd_soc_dai_shutdown(struct snd_pcm_substream *substream, @@ -953,7 +941,7 @@ static void rsnd_soc_dai_shutdown(struct snd_pcm_substream *substream, /* * call rsnd_dai_call without spinlock */ - rsnd_dai_call(nolock_stop, io, priv); + rsnd_dai_call(cleanup, io, priv); rsnd_dai_stream_quit(io); } @@ -1083,6 +1071,7 @@ static void __rsnd_dai_probe(struct rsnd_priv *priv, rdai->capture.rdai = rdai; rsnd_rdai_channels_set(rdai, 2); /* default 2ch */ rsnd_rdai_ssi_lane_set(rdai, 1); /* default 1lane */ + rsnd_rdai_width_set(rdai, 32); /* default 32bit width */ for (io_i = 0;; io_i++) { playback = of_parse_phandle(dai_np, "playback", io_i); @@ -1274,8 +1263,15 @@ int rsnd_kctrl_accept_anytime(struct rsnd_dai_stream *io) int rsnd_kctrl_accept_runtime(struct rsnd_dai_stream *io) { struct snd_pcm_runtime *runtime = rsnd_io_to_runtime(io); + struct rsnd_priv *priv = rsnd_io_to_priv(io); + struct device *dev = rsnd_priv_to_dev(priv); + + if (!runtime) { + dev_warn(dev, "Can't update kctrl when idle\n"); + return 0; + } - return !!runtime; + return 1; } struct rsnd_kctrl_cfg *rsnd_kctrl_init_m(struct rsnd_kctrl_cfg_m *cfg) diff --git a/sound/soc/sh/rcar/ctu.c b/sound/soc/sh/rcar/ctu.c index 6a55aa753003..ad702377a6c3 100644 --- a/sound/soc/sh/rcar/ctu.c +++ b/sound/soc/sh/rcar/ctu.c @@ -258,7 +258,7 @@ static int rsnd_ctu_hw_params(struct rsnd_mod *mod, struct snd_pcm_hw_params *be_params; int stream = substream->stream; - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { be_params = &dpcm->hw_params; if (params_channels(fe_params) != params_channels(be_params)) ctu->channels = params_channels(be_params); diff --git a/sound/soc/sh/rcar/dma.c b/sound/soc/sh/rcar/dma.c index d65ea7bc4dac..6d1947515dc8 100644 --- a/sound/soc/sh/rcar/dma.c +++ b/sound/soc/sh/rcar/dma.c @@ -106,9 +106,9 @@ static int rsnd_dmaen_stop(struct rsnd_mod *mod, return 0; } -static int rsnd_dmaen_nolock_stop(struct rsnd_mod *mod, - struct rsnd_dai_stream *io, - struct rsnd_priv *priv) +static int rsnd_dmaen_cleanup(struct rsnd_mod *mod, + struct rsnd_dai_stream *io, + struct rsnd_priv *priv) { struct rsnd_dma *dma = rsnd_mod_to_dma(mod); struct rsnd_dmaen *dmaen = rsnd_dma_to_dmaen(dma); @@ -116,7 +116,7 @@ static int rsnd_dmaen_nolock_stop(struct rsnd_mod *mod, /* * DMAEngine release uses mutex lock. * Thus, it shouldn't be called under spinlock. - * Let's call it under nolock_start + * Let's call it under prepare */ if (dmaen->chan) dma_release_channel(dmaen->chan); @@ -126,23 +126,22 @@ static int rsnd_dmaen_nolock_stop(struct rsnd_mod *mod, return 0; } -static int rsnd_dmaen_nolock_start(struct rsnd_mod *mod, - struct rsnd_dai_stream *io, - struct rsnd_priv *priv) +static int rsnd_dmaen_prepare(struct rsnd_mod *mod, + struct rsnd_dai_stream *io, + struct rsnd_priv *priv) { struct rsnd_dma *dma = rsnd_mod_to_dma(mod); struct rsnd_dmaen *dmaen = rsnd_dma_to_dmaen(dma); struct device *dev = rsnd_priv_to_dev(priv); - if (dmaen->chan) { - dev_err(dev, "it already has dma channel\n"); - return -EIO; - } + /* maybe suspended */ + if (dmaen->chan) + return 0; /* * DMAEngine request uses mutex lock. * Thus, it shouldn't be called under spinlock. - * Let's call it under nolock_start + * Let's call it under prepare */ dmaen->chan = rsnd_dmaen_request_channel(io, dma->mod_from, @@ -291,8 +290,8 @@ static int rsnd_dmaen_pointer(struct rsnd_mod *mod, static struct rsnd_mod_ops rsnd_dmaen_ops = { .name = "audmac", - .nolock_start = rsnd_dmaen_nolock_start, - .nolock_stop = rsnd_dmaen_nolock_stop, + .prepare = rsnd_dmaen_prepare, + .cleanup = rsnd_dmaen_cleanup, .start = rsnd_dmaen_start, .stop = rsnd_dmaen_stop, .pointer= rsnd_dmaen_pointer, @@ -302,16 +301,26 @@ static struct rsnd_mod_ops rsnd_dmaen_ops = { * Audio DMAC peri peri */ static const u8 gen2_id_table_ssiu[] = { - 0x00, /* SSI00 */ - 0x04, /* SSI10 */ - 0x08, /* SSI20 */ - 0x0c, /* SSI3 */ - 0x0d, /* SSI4 */ - 0x0e, /* SSI5 */ - 0x0f, /* SSI6 */ - 0x10, /* SSI7 */ - 0x11, /* SSI8 */ - 0x12, /* SSI90 */ + /* SSI00 ~ SSI07 */ + 0x00, 0x01, 0x02, 0x03, 0x39, 0x3a, 0x3b, 0x3c, + /* SSI10 ~ SSI17 */ + 0x04, 0x05, 0x06, 0x07, 0x3d, 0x3e, 0x3f, 0x40, + /* SSI20 ~ SSI27 */ + 0x08, 0x09, 0x0a, 0x0b, 0x41, 0x42, 0x43, 0x44, + /* SSI30 ~ SSI37 */ + 0x0c, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, + /* SSI40 ~ SSI47 */ + 0x0d, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, + /* SSI5 */ + 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* SSI6 */ + 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* SSI7 */ + 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* SSI8 */ + 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* SSI90 ~ SSI97 */ + 0x12, 0x13, 0x14, 0x15, 0x53, 0x54, 0x55, 0x56, }; static const u8 gen2_id_table_scu[] = { 0x2d, /* SCU_SRCI0 */ @@ -337,18 +346,23 @@ static u32 rsnd_dmapp_get_id(struct rsnd_dai_stream *io, struct rsnd_mod *src = rsnd_io_to_mod_src(io); struct rsnd_mod *dvc = rsnd_io_to_mod_dvc(io); const u8 *entry = NULL; - int id = rsnd_mod_id(mod); + int id = 255; int size = 0; if (mod == ssi) { + int busif = rsnd_ssi_get_busif(io); + entry = gen2_id_table_ssiu; size = ARRAY_SIZE(gen2_id_table_ssiu); + id = (rsnd_mod_id(mod) * 8) + busif; } else if (mod == src) { entry = gen2_id_table_scu; size = ARRAY_SIZE(gen2_id_table_scu); + id = rsnd_mod_id(mod); } else if (mod == dvc) { entry = gen2_id_table_cmd; size = ARRAY_SIZE(gen2_id_table_cmd); + id = rsnd_mod_id(mod); } if ((!entry) || (size <= id)) { @@ -382,7 +396,7 @@ static void rsnd_dmapp_write(struct rsnd_dma *dma, u32 data, u32 reg) struct rsnd_dma_ctrl *dmac = rsnd_priv_to_dmac(priv); struct device *dev = rsnd_priv_to_dev(priv); - dev_dbg(dev, "w %p : %08x\n", rsnd_dmapp_addr(dmac, dma, reg), data); + dev_dbg(dev, "w 0x%px : %08x\n", rsnd_dmapp_addr(dmac, dma, reg), data); iowrite32(data, rsnd_dmapp_addr(dmac, dma, reg)); } @@ -491,11 +505,11 @@ static struct rsnd_mod_ops rsnd_dmapp_ops = { #define RDMA_SSI_I_N(addr, i) (addr ##_reg - 0x00300000 + (0x40 * i) + 0x8) #define RDMA_SSI_O_N(addr, i) (addr ##_reg - 0x00300000 + (0x40 * i) + 0xc) -#define RDMA_SSIU_I_N(addr, i) (addr ##_reg - 0x00441000 + (0x1000 * i)) -#define RDMA_SSIU_O_N(addr, i) (addr ##_reg - 0x00441000 + (0x1000 * i)) +#define RDMA_SSIU_I_N(addr, i, j) (addr ##_reg - 0x00441000 + (0x1000 * (i)) + (((j) / 4) * 0xA000) + (((j) % 4) * 0x400)) +#define RDMA_SSIU_O_N(addr, i, j) RDMA_SSIU_I_N(addr, i, j) -#define RDMA_SSIU_I_P(addr, i) (addr ##_reg - 0x00141000 + (0x1000 * i)) -#define RDMA_SSIU_O_P(addr, i) (addr ##_reg - 0x00141000 + (0x1000 * i)) +#define RDMA_SSIU_I_P(addr, i, j) (addr ##_reg - 0x00141000 + (0x1000 * (i)) + (((j) / 4) * 0xA000) + (((j) % 4) * 0x400)) +#define RDMA_SSIU_O_P(addr, i, j) RDMA_SSIU_I_P(addr, i, j) #define RDMA_SRC_I_N(addr, i) (addr ##_reg - 0x00500000 + (0x400 * i)) #define RDMA_SRC_O_N(addr, i) (addr ##_reg - 0x004fc000 + (0x400 * i)) @@ -521,6 +535,7 @@ rsnd_gen2_dma_addr(struct rsnd_dai_stream *io, !!rsnd_io_to_mod_mix(io) || !!rsnd_io_to_mod_ctu(io); int id = rsnd_mod_id(mod); + int busif = rsnd_ssi_get_busif(io); struct dma_addr { dma_addr_t out_addr; dma_addr_t in_addr; @@ -537,25 +552,35 @@ rsnd_gen2_dma_addr(struct rsnd_dai_stream *io, }, /* SSI */ /* Capture */ - {{{ RDMA_SSI_O_N(ssi, id), 0 }, - { RDMA_SSIU_O_P(ssi, id), 0 }, - { RDMA_SSIU_O_P(ssi, id), 0 } }, + {{{ RDMA_SSI_O_N(ssi, id), 0 }, + { RDMA_SSIU_O_P(ssi, id, busif), 0 }, + { RDMA_SSIU_O_P(ssi, id, busif), 0 } }, /* Playback */ - {{ 0, RDMA_SSI_I_N(ssi, id) }, - { 0, RDMA_SSIU_I_P(ssi, id) }, - { 0, RDMA_SSIU_I_P(ssi, id) } } + {{ 0, RDMA_SSI_I_N(ssi, id) }, + { 0, RDMA_SSIU_I_P(ssi, id, busif) }, + { 0, RDMA_SSIU_I_P(ssi, id, busif) } } }, /* SSIU */ /* Capture */ - {{{ RDMA_SSIU_O_N(ssi, id), 0 }, - { RDMA_SSIU_O_P(ssi, id), 0 }, - { RDMA_SSIU_O_P(ssi, id), 0 } }, + {{{ RDMA_SSIU_O_N(ssi, id, busif), 0 }, + { RDMA_SSIU_O_P(ssi, id, busif), 0 }, + { RDMA_SSIU_O_P(ssi, id, busif), 0 } }, /* Playback */ - {{ 0, RDMA_SSIU_I_N(ssi, id) }, - { 0, RDMA_SSIU_I_P(ssi, id) }, - { 0, RDMA_SSIU_I_P(ssi, id) } } }, + {{ 0, RDMA_SSIU_I_N(ssi, id, busif) }, + { 0, RDMA_SSIU_I_P(ssi, id, busif) }, + { 0, RDMA_SSIU_I_P(ssi, id, busif) } } }, }; + /* + * FIXME + * + * We can't support SSI9-4/5/6/7, because its address is + * out of calculation rule + */ + if ((id == 9) && (busif >= 4)) + dev_err(dev, "This driver doesn't support SSI%d-%d, so far", + id, busif); + /* it shouldn't happen */ if (use_cmd && !use_src) dev_err(dev, "DVC is selected without SRC\n"); diff --git a/sound/soc/sh/rcar/gen.c b/sound/soc/sh/rcar/gen.c index 0230301fe078..1f7881cc16b2 100644 --- a/sound/soc/sh/rcar/gen.c +++ b/sound/soc/sh/rcar/gen.c @@ -219,12 +219,33 @@ static int rsnd_gen2_probe(struct rsnd_priv *priv) RSND_GEN_S_REG(HDMI1_SEL, 0x9e4), /* FIXME: it needs SSI_MODE2/3 in the future */ - RSND_GEN_M_REG(SSI_BUSIF_MODE, 0x0, 0x80), - RSND_GEN_M_REG(SSI_BUSIF_ADINR, 0x4, 0x80), - RSND_GEN_M_REG(SSI_BUSIF_DALIGN,0x8, 0x80), - RSND_GEN_M_REG(SSI_MODE, 0xc, 0x80), - RSND_GEN_M_REG(SSI_CTRL, 0x10, 0x80), - RSND_GEN_M_REG(SSI_INT_ENABLE, 0x18, 0x80), + RSND_GEN_M_REG(SSI_BUSIF0_MODE, 0x0, 0x80), + RSND_GEN_M_REG(SSI_BUSIF0_ADINR, 0x4, 0x80), + RSND_GEN_M_REG(SSI_BUSIF0_DALIGN, 0x8, 0x80), + RSND_GEN_M_REG(SSI_BUSIF1_MODE, 0x20, 0x80), + RSND_GEN_M_REG(SSI_BUSIF1_ADINR, 0x24, 0x80), + RSND_GEN_M_REG(SSI_BUSIF1_DALIGN, 0x28, 0x80), + RSND_GEN_M_REG(SSI_BUSIF2_MODE, 0x40, 0x80), + RSND_GEN_M_REG(SSI_BUSIF2_ADINR, 0x44, 0x80), + RSND_GEN_M_REG(SSI_BUSIF2_DALIGN, 0x48, 0x80), + RSND_GEN_M_REG(SSI_BUSIF3_MODE, 0x60, 0x80), + RSND_GEN_M_REG(SSI_BUSIF3_ADINR, 0x64, 0x80), + RSND_GEN_M_REG(SSI_BUSIF3_DALIGN, 0x68, 0x80), + RSND_GEN_M_REG(SSI_BUSIF4_MODE, 0x500, 0x80), + RSND_GEN_M_REG(SSI_BUSIF4_ADINR, 0x504, 0x80), + RSND_GEN_M_REG(SSI_BUSIF4_DALIGN, 0x508, 0x80), + RSND_GEN_M_REG(SSI_BUSIF5_MODE, 0x520, 0x80), + RSND_GEN_M_REG(SSI_BUSIF5_ADINR, 0x524, 0x80), + RSND_GEN_M_REG(SSI_BUSIF5_DALIGN, 0x528, 0x80), + RSND_GEN_M_REG(SSI_BUSIF6_MODE, 0x540, 0x80), + RSND_GEN_M_REG(SSI_BUSIF6_ADINR, 0x544, 0x80), + RSND_GEN_M_REG(SSI_BUSIF6_DALIGN, 0x548, 0x80), + RSND_GEN_M_REG(SSI_BUSIF7_MODE, 0x560, 0x80), + RSND_GEN_M_REG(SSI_BUSIF7_ADINR, 0x564, 0x80), + RSND_GEN_M_REG(SSI_BUSIF7_DALIGN, 0x568, 0x80), + RSND_GEN_M_REG(SSI_MODE, 0xc, 0x80), + RSND_GEN_M_REG(SSI_CTRL, 0x10, 0x80), + RSND_GEN_M_REG(SSI_INT_ENABLE, 0x18, 0x80), }; static const struct rsnd_regmap_field_conf conf_scu[] = { diff --git a/sound/soc/sh/rcar/rsnd.h b/sound/soc/sh/rcar/rsnd.h index 8f7a0abfa751..4464d1d0a042 100644 --- a/sound/soc/sh/rcar/rsnd.h +++ b/sound/soc/sh/rcar/rsnd.h @@ -156,9 +156,30 @@ enum rsnd_reg { RSND_REG_SSI_MODE2, RSND_REG_SSI_CONTROL, RSND_REG_SSI_CTRL, - RSND_REG_SSI_BUSIF_MODE, - RSND_REG_SSI_BUSIF_ADINR, - RSND_REG_SSI_BUSIF_DALIGN, + RSND_REG_SSI_BUSIF0_MODE, + RSND_REG_SSI_BUSIF0_ADINR, + RSND_REG_SSI_BUSIF0_DALIGN, + RSND_REG_SSI_BUSIF1_MODE, + RSND_REG_SSI_BUSIF1_ADINR, + RSND_REG_SSI_BUSIF1_DALIGN, + RSND_REG_SSI_BUSIF2_MODE, + RSND_REG_SSI_BUSIF2_ADINR, + RSND_REG_SSI_BUSIF2_DALIGN, + RSND_REG_SSI_BUSIF3_MODE, + RSND_REG_SSI_BUSIF3_ADINR, + RSND_REG_SSI_BUSIF3_DALIGN, + RSND_REG_SSI_BUSIF4_MODE, + RSND_REG_SSI_BUSIF4_ADINR, + RSND_REG_SSI_BUSIF4_DALIGN, + RSND_REG_SSI_BUSIF5_MODE, + RSND_REG_SSI_BUSIF5_ADINR, + RSND_REG_SSI_BUSIF5_DALIGN, + RSND_REG_SSI_BUSIF6_MODE, + RSND_REG_SSI_BUSIF6_ADINR, + RSND_REG_SSI_BUSIF6_DALIGN, + RSND_REG_SSI_BUSIF7_MODE, + RSND_REG_SSI_BUSIF7_ADINR, + RSND_REG_SSI_BUSIF7_DALIGN, RSND_REG_SSI_INT_ENABLE, RSND_REG_SSI_SYS_STATUS0, RSND_REG_SSI_SYS_STATUS1, @@ -274,15 +295,12 @@ struct rsnd_mod_ops { int (*fallback)(struct rsnd_mod *mod, struct rsnd_dai_stream *io, struct rsnd_priv *priv); - int (*nolock_start)(struct rsnd_mod *mod, - struct rsnd_dai_stream *io, - struct rsnd_priv *priv); - int (*nolock_stop)(struct rsnd_mod *mod, - struct rsnd_dai_stream *io, - struct rsnd_priv *priv); int (*prepare)(struct rsnd_mod *mod, struct rsnd_dai_stream *io, struct rsnd_priv *priv); + int (*cleanup)(struct rsnd_mod *mod, + struct rsnd_dai_stream *io, + struct rsnd_priv *priv); }; struct rsnd_dai_stream; @@ -300,9 +318,8 @@ struct rsnd_mod { /* * status * - * 0xH0000CBA + * 0xH0000CB0 * - * A 0: nolock_start 1: nolock_stop * B 0: init 1: quit * C 0: start 1: stop * @@ -313,9 +330,8 @@ struct rsnd_mod { * H 0: hw_params * H 0: pointer * H 0: prepare + * H 0: cleanup */ -#define __rsnd_mod_shift_nolock_start 0 -#define __rsnd_mod_shift_nolock_stop 0 #define __rsnd_mod_shift_init 4 #define __rsnd_mod_shift_quit 4 #define __rsnd_mod_shift_start 8 @@ -328,11 +344,12 @@ struct rsnd_mod { #define __rsnd_mod_shift_hw_params 28 /* always called */ #define __rsnd_mod_shift_pointer 28 /* always called */ #define __rsnd_mod_shift_prepare 28 /* always called */ +#define __rsnd_mod_shift_cleanup 28 /* always called */ #define __rsnd_mod_add_probe 0 #define __rsnd_mod_add_remove 0 -#define __rsnd_mod_add_nolock_start 1 -#define __rsnd_mod_add_nolock_stop -1 +#define __rsnd_mod_add_prepare 0 +#define __rsnd_mod_add_cleanup 0 #define __rsnd_mod_add_init 1 #define __rsnd_mod_add_quit -1 #define __rsnd_mod_add_start 1 @@ -342,10 +359,11 @@ struct rsnd_mod { #define __rsnd_mod_add_fallback 0 #define __rsnd_mod_add_hw_params 0 #define __rsnd_mod_add_pointer 0 -#define __rsnd_mod_add_prepare 0 #define __rsnd_mod_call_probe 0 #define __rsnd_mod_call_remove 0 +#define __rsnd_mod_call_prepare 0 +#define __rsnd_mod_call_cleanup 0 #define __rsnd_mod_call_init 0 #define __rsnd_mod_call_quit 1 #define __rsnd_mod_call_start 0 @@ -355,9 +373,6 @@ struct rsnd_mod { #define __rsnd_mod_call_fallback 0 #define __rsnd_mod_call_hw_params 0 #define __rsnd_mod_call_pointer 0 -#define __rsnd_mod_call_nolock_start 0 -#define __rsnd_mod_call_nolock_stop 1 -#define __rsnd_mod_call_prepare 0 #define rsnd_mod_to_priv(mod) ((mod)->priv) #define rsnd_mod_name(mod) ((mod)->ops->name) @@ -438,6 +453,7 @@ struct rsnd_dai_stream { char name[RSND_DAI_NAME_SIZE]; struct snd_pcm_substream *substream; struct rsnd_mod *mod[RSND_MOD_MAX]; + struct rsnd_mod *dma; struct rsnd_dai *rdai; struct device *dmac_dev; /* for IPMMU */ u32 parent_ssi_status; @@ -467,6 +483,7 @@ struct rsnd_dai { int max_channels; /* 2ch - 16ch */ int ssi_lane; /* 1lane - 4lane */ + int chan_width; /* 16/24/32 bit width */ unsigned int clk_master:1; unsigned int bit_clk_inv:1; @@ -500,6 +517,11 @@ int rsnd_rdai_channels_ctrl(struct rsnd_dai *rdai, int rsnd_rdai_ssi_lane_ctrl(struct rsnd_dai *rdai, int ssi_lane); +#define rsnd_rdai_width_set(rdai, width) \ + rsnd_rdai_width_ctrl(rdai, width) +#define rsnd_rdai_width_get(rdai) \ + rsnd_rdai_width_ctrl(rdai, 0) +int rsnd_rdai_width_ctrl(struct rsnd_dai *rdai, int width); void rsnd_dai_period_elapsed(struct rsnd_dai_stream *io); int rsnd_dai_connect(struct rsnd_mod *mod, struct rsnd_dai_stream *io, @@ -692,6 +714,7 @@ void rsnd_ssi_remove(struct rsnd_priv *priv); struct rsnd_mod *rsnd_ssi_mod_get(struct rsnd_priv *priv, int id); int rsnd_ssi_is_dma_mode(struct rsnd_mod *mod); int rsnd_ssi_use_busif(struct rsnd_dai_stream *io); +int rsnd_ssi_get_busif(struct rsnd_dai_stream *io); u32 rsnd_ssi_multi_slaves_runtime(struct rsnd_dai_stream *io); #define RSND_SSI_HDMI_PORT0 0xf0 @@ -709,7 +732,7 @@ int __rsnd_ssi_is_pin_sharing(struct rsnd_mod *mod); void rsnd_parse_connect_ssi(struct rsnd_dai *rdai, struct device_node *playback, struct device_node *capture); -unsigned int rsnd_ssi_clk_query(struct rsnd_priv *priv, +unsigned int rsnd_ssi_clk_query(struct rsnd_dai *rdai, int param1, int param2, int *idx); /* diff --git a/sound/soc/sh/rcar/src.c b/sound/soc/sh/rcar/src.c index beccfbac7581..cd38a43b976f 100644 --- a/sound/soc/sh/rcar/src.c +++ b/sound/soc/sh/rcar/src.c @@ -158,7 +158,7 @@ static int rsnd_src_hw_params(struct rsnd_mod *mod, struct snd_soc_dpcm *dpcm; struct snd_pcm_hw_params *be_params; - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { be_params = &dpcm->hw_params; if (params_rate(fe_params) != params_rate(be_params)) diff --git a/sound/soc/sh/rcar/ssi.c b/sound/soc/sh/rcar/ssi.c index 3f880ec66459..fcb4df23248c 100644 --- a/sound/soc/sh/rcar/ssi.c +++ b/sound/soc/sh/rcar/ssi.c @@ -42,7 +42,13 @@ #define DWL_24 (5 << 19) /* Data Word Length */ #define DWL_32 (6 << 19) /* Data Word Length */ +/* + * System word length + */ +#define SWL_16 (1 << 16) /* R/W System Word Length */ +#define SWL_24 (2 << 16) /* R/W System Word Length */ #define SWL_32 (3 << 16) /* R/W System Word Length */ + #define SCKD (1 << 15) /* Serial Bit Clock Direction */ #define SWSD (1 << 14) /* Serial WS Direction */ #define SCKP (1 << 13) /* Serial Bit Clock Polarity */ @@ -72,7 +78,6 @@ struct rsnd_ssi { struct rsnd_mod mod; - struct rsnd_mod *dma; u32 flags; u32 cr_own; @@ -145,6 +150,11 @@ int rsnd_ssi_use_busif(struct rsnd_dai_stream *io) return use_busif; } +int rsnd_ssi_get_busif(struct rsnd_dai_stream *io) +{ + return 0; /* BUSIF0 only for now */ +} + static void rsnd_ssi_status_clear(struct rsnd_mod *mod) { rsnd_mod_write(mod, SSISR, 0); @@ -220,14 +230,32 @@ u32 rsnd_ssi_multi_slaves_runtime(struct rsnd_dai_stream *io) return 0; } -unsigned int rsnd_ssi_clk_query(struct rsnd_priv *priv, +static u32 rsnd_rdai_width_to_swl(struct rsnd_dai *rdai) +{ + struct rsnd_priv *priv = rsnd_rdai_to_priv(rdai); + struct device *dev = rsnd_priv_to_dev(priv); + int width = rsnd_rdai_width_get(rdai); + + switch (width) { + case 32: return SWL_32; + case 24: return SWL_24; + case 16: return SWL_16; + } + + dev_err(dev, "unsupported slot width value: %d\n", width); + return 0; +} + +unsigned int rsnd_ssi_clk_query(struct rsnd_dai *rdai, int param1, int param2, int *idx) { + struct rsnd_priv *priv = rsnd_rdai_to_priv(rdai); int ssi_clk_mul_table[] = { 1, 2, 4, 8, 16, 6, 12, }; int j, ret; unsigned int main_rate; + int width = rsnd_rdai_width_get(rdai); for (j = 0; j < ARRAY_SIZE(ssi_clk_mul_table); j++) { @@ -240,12 +268,7 @@ unsigned int rsnd_ssi_clk_query(struct rsnd_priv *priv, if (j == 0) continue; - /* - * this driver is assuming that - * system word is 32bit x chan - * see rsnd_ssi_init() - */ - main_rate = 32 * param1 * param2 * ssi_clk_mul_table[j]; + main_rate = width * param1 * param2 * ssi_clk_mul_table[j]; ret = rsnd_adg_clk_query(priv, main_rate); if (ret < 0) @@ -289,10 +312,15 @@ static int rsnd_ssi_master_clk_start(struct rsnd_mod *mod, return -EINVAL; } + if (ssi->chan != chan) { + dev_err(dev, "SSI parent/child should use same chan\n"); + return -EINVAL; + } + return 0; } - main_rate = rsnd_ssi_clk_query(priv, rate, chan, &idx); + main_rate = rsnd_ssi_clk_query(rdai, rate, chan, &idx); if (!main_rate) { dev_err(dev, "unsupported clock rate\n"); return -EIO; @@ -312,9 +340,11 @@ static int rsnd_ssi_master_clk_start(struct rsnd_mod *mod, * SSICR : FORCE, SCKD, SWSD * SSIWSR : CONT */ - ssi->cr_clk = FORCE | SWL_32 | SCKD | SWSD | CKDV(idx); + ssi->cr_clk = FORCE | rsnd_rdai_width_to_swl(rdai) | + SCKD | SWSD | CKDV(idx); ssi->wsr = CONT; ssi->rate = rate; + ssi->chan = chan; dev_dbg(dev, "%s[%d] outputs %u Hz\n", rsnd_mod_name(mod), @@ -340,6 +370,7 @@ static void rsnd_ssi_master_clk_stop(struct rsnd_mod *mod, ssi->cr_clk = 0; ssi->rate = 0; + ssi->chan = 0; rsnd_adg_ssi_clk_stop(mod); } @@ -357,21 +388,28 @@ static void rsnd_ssi_config_init(struct rsnd_mod *mod, is_tdm = rsnd_runtime_is_ssi_tdm(io); - /* - * always use 32bit system word. - * see also rsnd_ssi_master_clk_enable() - */ - cr_own |= FORCE | SWL_32; + cr_own |= FORCE | rsnd_rdai_width_to_swl(rdai); if (rdai->bit_clk_inv) cr_own |= SCKP; - if (rdai->frm_clk_inv ^ is_tdm) + if (rdai->frm_clk_inv && !is_tdm) cr_own |= SWSP; if (rdai->data_alignment) cr_own |= SDTA; if (rdai->sys_delay) cr_own |= DEL; + /* + * TDM Mode + * see + * rsnd_ssiu_init_gen2() + */ + wsr = ssi->wsr; + if (is_tdm) { + wsr |= WS_MODE; + cr_own |= CHNL_8; + } + /* * We shouldn't exchange SWSP after running. * This means, parent needs to care it. @@ -384,6 +422,9 @@ static void rsnd_ssi_config_init(struct rsnd_mod *mod, cr_own &= ~DWL_MASK; switch (snd_pcm_format_width(runtime->format)) { + case 8: + cr_own |= DWL_8; + break; case 16: cr_own |= DWL_16; break; @@ -399,16 +440,6 @@ static void rsnd_ssi_config_init(struct rsnd_mod *mod, cr_mode = DIEN; /* PIO : enable Data interrupt */ } - /* - * TDM Extend Mode - * see - * rsnd_ssiu_init_gen2() - */ - wsr = ssi->wsr; - if (is_tdm) { - wsr |= WS_MODE; - cr_own |= CHNL_8; - } init_end: ssi->cr_own = cr_own; ssi->cr_mode = cr_mode; @@ -488,26 +519,16 @@ static int rsnd_ssi_hw_params(struct rsnd_mod *mod, struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { - struct rsnd_ssi *ssi = rsnd_mod_to_ssi(mod); - int chan = params_channels(params); + struct rsnd_dai *rdai = rsnd_io_to_rdai(io); + unsigned int fmt_width = snd_pcm_format_width(params_format(params)); - /* - * snd_pcm_ops::hw_params will be called *before* - * snd_soc_dai_ops::trigger. Thus, ssi->usrcnt is 0 - * in 1st call. - */ - if (ssi->usrcnt) { - /* - * Already working. - * It will happen if SSI has parent/child connection. - * it is error if child <-> parent SSI uses - * different channels. - */ - if (ssi->chan != chan) - return -EIO; - } + if (fmt_width > rdai->chan_width) { + struct rsnd_priv *priv = rsnd_io_to_priv(io); + struct device *dev = rsnd_priv_to_dev(priv); - ssi->chan = chan; + dev_err(dev, "invalid combination of slot-width and format-data-width\n"); + return -EINVAL; + } return 0; } @@ -873,7 +894,6 @@ static int rsnd_ssi_dma_probe(struct rsnd_mod *mod, struct rsnd_dai_stream *io, struct rsnd_priv *priv) { - struct rsnd_ssi *ssi = rsnd_mod_to_ssi(mod); int ret; /* @@ -888,7 +908,7 @@ static int rsnd_ssi_dma_probe(struct rsnd_mod *mod, return ret; /* SSI probe might be called many times in MUX multi path */ - ret = rsnd_dma_attach(io, mod, &ssi->dma); + ret = rsnd_dma_attach(io, mod, &io->dma); return ret; } diff --git a/sound/soc/sh/rcar/ssiu.c b/sound/soc/sh/rcar/ssiu.c index 016fbf5ac242..39b67643b5dc 100644 --- a/sound/soc/sh/rcar/ssiu.c +++ b/sound/soc/sh/rcar/ssiu.c @@ -10,9 +10,12 @@ struct rsnd_ssiu { struct rsnd_mod mod; + u32 busif_status[8]; /* for BUSIF0 - BUSIF7 */ + unsigned int usrcnt; }; #define rsnd_ssiu_nr(priv) ((priv)->ssiu_nr) +#define rsnd_mod_to_ssiu(_mod) container_of((_mod), struct rsnd_ssiu, mod) #define for_each_rsnd_ssiu(pos, priv, i) \ for (i = 0; \ (i < rsnd_ssiu_nr(priv)) && \ @@ -120,6 +123,7 @@ static int rsnd_ssiu_init_gen2(struct rsnd_mod *mod, struct rsnd_dai_stream *io, struct rsnd_priv *priv) { + struct rsnd_ssiu *ssiu = rsnd_mod_to_ssiu(mod); int hdmi = rsnd_ssi_hdmi_port(io); int ret; u32 mode = 0; @@ -128,6 +132,8 @@ static int rsnd_ssiu_init_gen2(struct rsnd_mod *mod, if (ret < 0) return ret; + ssiu->usrcnt++; + if (rsnd_runtime_is_ssi_tdm(io)) { /* * TDM Extend Mode @@ -140,15 +146,59 @@ static int rsnd_ssiu_init_gen2(struct rsnd_mod *mod, rsnd_mod_write(mod, SSI_MODE, mode); if (rsnd_ssi_use_busif(io)) { - rsnd_mod_write(mod, SSI_BUSIF_ADINR, - rsnd_get_adinr_bit(mod, io) | - (rsnd_io_is_play(io) ? - rsnd_runtime_channel_after_ctu(io) : - rsnd_runtime_channel_original(io))); - rsnd_mod_write(mod, SSI_BUSIF_MODE, - rsnd_get_busif_shift(io, mod) | 1); - rsnd_mod_write(mod, SSI_BUSIF_DALIGN, - rsnd_get_dalign(mod, io)); + int id = rsnd_mod_id(mod); + int busif = rsnd_ssi_get_busif(io); + + /* + * FIXME + * + * We can't support SSI9-4/5/6/7, because its address is + * out of calculation rule + */ + if ((id == 9) && (busif >= 4)) { + struct device *dev = rsnd_priv_to_dev(priv); + + dev_err(dev, "This driver doesn't support SSI%d-%d, so far", + id, busif); + } + +#define RSND_WRITE_BUSIF(i) \ + rsnd_mod_write(mod, SSI_BUSIF##i##_ADINR, \ + rsnd_get_adinr_bit(mod, io) | \ + (rsnd_io_is_play(io) ? \ + rsnd_runtime_channel_after_ctu(io) : \ + rsnd_runtime_channel_original(io))); \ + rsnd_mod_write(mod, SSI_BUSIF##i##_MODE, \ + rsnd_get_busif_shift(io, mod) | 1); \ + rsnd_mod_write(mod, SSI_BUSIF##i##_DALIGN, \ + rsnd_get_dalign(mod, io)) + + switch (busif) { + case 0: + RSND_WRITE_BUSIF(0); + break; + case 1: + RSND_WRITE_BUSIF(1); + break; + case 2: + RSND_WRITE_BUSIF(2); + break; + case 3: + RSND_WRITE_BUSIF(3); + break; + case 4: + RSND_WRITE_BUSIF(4); + break; + case 5: + RSND_WRITE_BUSIF(5); + break; + case 6: + RSND_WRITE_BUSIF(6); + break; + case 7: + RSND_WRITE_BUSIF(7); + break; + } } if (hdmi) { @@ -194,10 +244,12 @@ static int rsnd_ssiu_start_gen2(struct rsnd_mod *mod, struct rsnd_dai_stream *io, struct rsnd_priv *priv) { + int busif = rsnd_ssi_get_busif(io); + if (!rsnd_ssi_use_busif(io)) return 0; - rsnd_mod_write(mod, SSI_CTRL, 0x1); + rsnd_mod_bset(mod, SSI_CTRL, 1 << (busif * 4), 1 << (busif * 4)); if (rsnd_ssi_multi_slaves_runtime(io)) rsnd_mod_write(mod, SSI_CONTROL, 0x1); @@ -209,10 +261,16 @@ static int rsnd_ssiu_stop_gen2(struct rsnd_mod *mod, struct rsnd_dai_stream *io, struct rsnd_priv *priv) { + struct rsnd_ssiu *ssiu = rsnd_mod_to_ssiu(mod); + int busif = rsnd_ssi_get_busif(io); + if (!rsnd_ssi_use_busif(io)) return 0; - rsnd_mod_write(mod, SSI_CTRL, 0); + rsnd_mod_bset(mod, SSI_CTRL, 1 << (busif * 4), 0); + + if (--ssiu->usrcnt) + return 0; if (rsnd_ssi_multi_slaves_runtime(io)) rsnd_mod_write(mod, SSI_CONTROL, 0); @@ -246,6 +304,16 @@ int rsnd_ssiu_attach(struct rsnd_dai_stream *io, return rsnd_dai_connect(mod, io, mod->type); } +static u32 *rsnd_ssiu_get_status(struct rsnd_dai_stream *io, + struct rsnd_mod *mod, + enum rsnd_mod_type type) +{ + struct rsnd_ssiu *ssiu = rsnd_mod_to_ssiu(mod); + int busif = rsnd_ssi_get_busif(io); + + return &ssiu->busif_status[busif]; +} + int rsnd_ssiu_probe(struct rsnd_priv *priv) { struct device *dev = rsnd_priv_to_dev(priv); @@ -269,7 +337,7 @@ int rsnd_ssiu_probe(struct rsnd_priv *priv) for_each_rsnd_ssiu(ssiu, priv, i) { ret = rsnd_mod_init(priv, rsnd_mod_get(ssiu), - ops, NULL, rsnd_mod_get_status, + ops, NULL, rsnd_ssiu_get_status, RSND_MOD_SSIU, i); if (ret) return ret; diff --git a/sound/soc/soc-compress.c b/sound/soc/soc-compress.c index 409d082e80d1..699397a09167 100644 --- a/sound/soc/soc-compress.c +++ b/sound/soc/soc-compress.c @@ -157,7 +157,7 @@ static int soc_compr_open_fe(struct snd_compr_stream *cstream) ret = dpcm_be_dai_startup(fe, stream); if (ret < 0) { /* clean up all links */ - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) + for_each_dpcm_be(fe, stream, dpcm) dpcm->state = SND_SOC_DPCM_LINK_STATE_FREE; dpcm_be_disconnect(fe, stream); @@ -321,7 +321,7 @@ static int soc_compr_free_fe(struct snd_compr_stream *cstream) ret = dpcm_be_dai_shutdown(fe, stream); /* mark FE's links ready to prune */ - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) + for_each_dpcm_be(fe, stream, dpcm) dpcm->state = SND_SOC_DPCM_LINK_STATE_FREE; dpcm_dapm_stream_event(fe, stream, SND_SOC_DAPM_STREAM_STOP); diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 473eefe8658e..6ddcf12bc030 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -52,6 +52,10 @@ EXPORT_SYMBOL_GPL(snd_soc_debugfs_root); static DEFINE_MUTEX(client_mutex); static LIST_HEAD(component_list); +static LIST_HEAD(unbind_card_list); + +#define for_each_component(component) \ + list_for_each_entry(component, &component_list, list) /* * This is a timeout to do a DAPM powerdown after a stream is closed(). @@ -62,8 +66,9 @@ static int pmdown_time = 5000; module_param(pmdown_time, int, 0); MODULE_PARM_DESC(pmdown_time, "DAPM stream powerdown time (msecs)"); -/* If a DMI filed contain strings in this blacklist (e.g. - * "Type2 - Board Manufacturer" or "Type1 - TBD by OEM"), it will be taken +/* + * If a DMI filed contain strings in this blacklist (e.g. + * "Type2 - Board Manufacturer" or "Type1 - TBD by OEM"), it will be taken * as invalid and dropped when setting the card long name from DMI info. */ static const char * const dmi_blacklist[] = { @@ -175,8 +180,8 @@ static int dai_list_show(struct seq_file *m, void *v) mutex_lock(&client_mutex); - list_for_each_entry(component, &component_list, list) - list_for_each_entry(dai, &component->dai_list, list) + for_each_component(component) + for_each_component_dais(component, dai) seq_printf(m, "%s\n", dai->name); mutex_unlock(&client_mutex); @@ -191,7 +196,7 @@ static int component_list_show(struct seq_file *m, void *v) mutex_lock(&client_mutex); - list_for_each_entry(component, &component_list, list) + for_each_component(component) seq_printf(m, "%s\n", component->name); mutex_unlock(&client_mutex); @@ -218,7 +223,7 @@ static void soc_init_card_debugfs(struct snd_soc_card *card) &card->pop_time); if (!card->debugfs_pop_time) dev_warn(card->dev, - "ASoC: Failed to create pop time debugfs file\n"); + "ASoC: Failed to create pop time debugfs file\n"); } static void soc_cleanup_card_debugfs(struct snd_soc_card *card) @@ -341,7 +346,7 @@ struct snd_pcm_substream *snd_soc_get_dai_substream(struct snd_soc_card *card, { struct snd_soc_pcm_runtime *rtd; - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { if (rtd->dai_link->no_pcm && !strcmp(rtd->dai_link->name, dai_link)) return rtd->pcm->streams[stream].substream; @@ -398,7 +403,7 @@ static void soc_remove_pcm_runtimes(struct snd_soc_card *card) { struct snd_soc_pcm_runtime *rtd, *_rtd; - list_for_each_entry_safe(rtd, _rtd, &card->rtd_list, list) { + for_each_card_rtds_safe(card, rtd, _rtd) { list_del(&rtd->list); soc_free_pcm_runtime(rtd); } @@ -411,7 +416,7 @@ struct snd_soc_pcm_runtime *snd_soc_get_pcm_runtime(struct snd_soc_card *card, { struct snd_soc_pcm_runtime *rtd; - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { if (!strcmp(rtd->dai_link->name, dai_link)) return rtd; } @@ -422,7 +427,8 @@ EXPORT_SYMBOL_GPL(snd_soc_get_pcm_runtime); static void codec2codec_close_delayed_work(struct work_struct *work) { - /* Currently nothing to do for c2c links + /* + * Currently nothing to do for c2c links * Since c2c links are internal nodes in the DAPM graph and * don't interface with the outside world or application layer * we don't have to do any special handling on close. @@ -442,8 +448,9 @@ int snd_soc_suspend(struct device *dev) if (!card->instantiated) return 0; - /* Due to the resume being scheduled into a workqueue we could - * suspend before that's finished - wait for it to complete. + /* + * Due to the resume being scheduled into a workqueue we could + * suspend before that's finished - wait for it to complete. */ snd_power_wait(card->snd_card, SNDRV_CTL_POWER_D0); @@ -451,13 +458,13 @@ int snd_soc_suspend(struct device *dev) snd_power_change_state(card->snd_card, SNDRV_CTL_POWER_D3hot); /* mute any active DACs */ - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { + struct snd_soc_dai *dai; if (rtd->dai_link->ignore_suspend) continue; - for (i = 0; i < rtd->num_codecs; i++) { - struct snd_soc_dai *dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, dai) { struct snd_soc_dai_driver *drv = dai->driver; if (drv->ops->digital_mute && dai->playback_active) @@ -466,7 +473,7 @@ int snd_soc_suspend(struct device *dev) } /* suspend all pcms */ - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { if (rtd->dai_link->ignore_suspend) continue; @@ -476,7 +483,7 @@ int snd_soc_suspend(struct device *dev) if (card->suspend_pre) card->suspend_pre(card); - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { struct snd_soc_dai *cpu_dai = rtd->cpu_dai; if (rtd->dai_link->ignore_suspend) @@ -487,10 +494,10 @@ int snd_soc_suspend(struct device *dev) } /* close any waiting streams */ - list_for_each_entry(rtd, &card->rtd_list, list) + for_each_card_rtds(card, rtd) flush_delayed_work(&rtd->delayed_work); - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { if (rtd->dai_link->ignore_suspend) continue; @@ -509,11 +516,14 @@ int snd_soc_suspend(struct device *dev) snd_soc_dapm_sync(&card->dapm); /* suspend all COMPONENTs */ - list_for_each_entry(component, &card->component_dev_list, card_list) { - struct snd_soc_dapm_context *dapm = snd_soc_component_get_dapm(component); + for_each_card_components(card, component) { + struct snd_soc_dapm_context *dapm = + snd_soc_component_get_dapm(component); - /* If there are paths active then the COMPONENT will be held with - * bias _ON and should not be suspended. */ + /* + * If there are paths active then the COMPONENT will be held + * with bias _ON and should not be suspended. + */ if (!component->suspended) { switch (snd_soc_dapm_get_bias_level(dapm)) { case SND_SOC_BIAS_STANDBY: @@ -547,7 +557,7 @@ int snd_soc_suspend(struct device *dev) } } - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { struct snd_soc_dai *cpu_dai = rtd->cpu_dai; if (rtd->dai_link->ignore_suspend) @@ -567,18 +577,21 @@ int snd_soc_suspend(struct device *dev) } EXPORT_SYMBOL_GPL(snd_soc_suspend); -/* deferred resume work, so resume can complete before we finished +/* + * deferred resume work, so resume can complete before we finished * setting our codec back up, which can be very slow on I2C */ static void soc_resume_deferred(struct work_struct *work) { struct snd_soc_card *card = - container_of(work, struct snd_soc_card, deferred_resume_work); + container_of(work, struct snd_soc_card, + deferred_resume_work); struct snd_soc_pcm_runtime *rtd; struct snd_soc_component *component; int i; - /* our power state is still SNDRV_CTL_POWER_D3hot from suspend time, + /* + * our power state is still SNDRV_CTL_POWER_D3hot from suspend time, * so userspace apps are blocked from touching us */ @@ -591,7 +604,7 @@ static void soc_resume_deferred(struct work_struct *work) card->resume_pre(card); /* resume control bus DAIs */ - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { struct snd_soc_dai *cpu_dai = rtd->cpu_dai; if (rtd->dai_link->ignore_suspend) @@ -601,7 +614,7 @@ static void soc_resume_deferred(struct work_struct *work) cpu_dai->driver->resume(cpu_dai); } - list_for_each_entry(component, &card->component_dev_list, card_list) { + for_each_card_components(card, component) { if (component->suspended) { if (component->driver->resume) component->driver->resume(component); @@ -609,7 +622,7 @@ static void soc_resume_deferred(struct work_struct *work) } } - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { if (rtd->dai_link->ignore_suspend) continue; @@ -624,13 +637,13 @@ static void soc_resume_deferred(struct work_struct *work) } /* unmute any active DACs */ - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { + struct snd_soc_dai *dai; if (rtd->dai_link->ignore_suspend) continue; - for (i = 0; i < rtd->num_codecs; i++) { - struct snd_soc_dai *dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, dai) { struct snd_soc_dai_driver *drv = dai->driver; if (drv->ops->digital_mute && dai->playback_active) @@ -638,7 +651,7 @@ static void soc_resume_deferred(struct work_struct *work) } } - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { struct snd_soc_dai *cpu_dai = rtd->cpu_dai; if (rtd->dai_link->ignore_suspend) @@ -673,16 +686,15 @@ int snd_soc_resume(struct device *dev) return 0; /* activate pins from sleep state */ - list_for_each_entry(rtd, &card->rtd_list, list) { - struct snd_soc_dai **codec_dais = rtd->codec_dais; + for_each_card_rtds(card, rtd) { + struct snd_soc_dai *codec_dai; struct snd_soc_dai *cpu_dai = rtd->cpu_dai; int j; if (cpu_dai->active) pinctrl_pm_select_default_state(cpu_dai->dev); - for (j = 0; j < rtd->num_codecs; j++) { - struct snd_soc_dai *codec_dai = codec_dais[j]; + for_each_rtd_codec_dai(rtd, j, codec_dai) { if (codec_dai->active) pinctrl_pm_select_default_state(codec_dai->dev); } @@ -694,8 +706,9 @@ int snd_soc_resume(struct device *dev) * have that problem and may take a substantial amount of time to resume * due to I/O costs and anti-pop so handle them out of line. */ - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { struct snd_soc_dai *cpu_dai = rtd->cpu_dai; + bus_control |= cpu_dai->driver->bus_control; } if (bus_control) { @@ -725,7 +738,7 @@ static struct snd_soc_component *soc_find_component( lockdep_assert_held(&client_mutex); - list_for_each_entry(component, &component_list, list) { + for_each_component(component) { if (of_node) { if (component->dev->of_node == of_node) return component; @@ -737,6 +750,24 @@ static struct snd_soc_component *soc_find_component( return NULL; } +static int snd_soc_is_matching_component( + const struct snd_soc_dai_link_component *dlc, + struct snd_soc_component *component) +{ + struct device_node *component_of_node; + + component_of_node = component->dev->of_node; + if (!component_of_node && component->dev->parent) + component_of_node = component->dev->parent->of_node; + + if (dlc->of_node && component_of_node != dlc->of_node) + return 0; + if (dlc->name && strcmp(component->name, dlc->name)) + return 0; + + return 1; +} + /** * snd_soc_find_dai - Find a registered DAI * @@ -753,21 +784,14 @@ struct snd_soc_dai *snd_soc_find_dai( { struct snd_soc_component *component; struct snd_soc_dai *dai; - struct device_node *component_of_node; lockdep_assert_held(&client_mutex); - /* Find CPU DAI from registered DAIs*/ - list_for_each_entry(component, &component_list, list) { - component_of_node = component->dev->of_node; - if (!component_of_node && component->dev->parent) - component_of_node = component->dev->parent->of_node; - - if (dlc->of_node && component_of_node != dlc->of_node) - continue; - if (dlc->name && strcmp(component->name, dlc->name)) + /* Find CPU DAI from registered DAIs */ + for_each_component(component) { + if (!snd_soc_is_matching_component(dlc, component)) continue; - list_for_each_entry(dai, &component->dai_list, list) { + for_each_component_dais(component, dai) { if (dlc->dai_name && strcmp(dai->name, dlc->dai_name) && (!dai->driver->name || strcmp(dai->driver->name, dlc->dai_name))) @@ -781,7 +805,6 @@ struct snd_soc_dai *snd_soc_find_dai( } EXPORT_SYMBOL_GPL(snd_soc_find_dai); - /** * snd_soc_find_dai_link - Find a DAI link * @@ -805,7 +828,7 @@ struct snd_soc_dai_link *snd_soc_find_dai_link(struct snd_soc_card *card, lockdep_assert_held(&client_mutex); - list_for_each_entry_safe(link, _link, &card->dai_link_list, list) { + for_each_card_links_safe(card, link, _link) { if (link->id != id) continue; @@ -828,7 +851,7 @@ static bool soc_is_dai_link_bound(struct snd_soc_card *card, { struct snd_soc_pcm_runtime *rtd; - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { if (rtd->dai_link == dai_link) return true; } @@ -844,8 +867,6 @@ static int soc_bind_dai_link(struct snd_soc_card *card, struct snd_soc_dai_link_component cpu_dai_component; struct snd_soc_component *component; struct snd_soc_dai **codec_dais; - struct device_node *platform_of_node; - const char *platform_name; int i; if (dai_link->ignore) @@ -877,6 +898,7 @@ static int soc_bind_dai_link(struct snd_soc_card *card, rtd->num_codecs = dai_link->num_codecs; /* Find CODEC from registered CODECs */ + /* we can use for_each_rtd_codec_dai() after this */ codec_dais = rtd->codec_dais; for (i = 0; i < rtd->num_codecs; i++) { codec_dais[i] = snd_soc_find_dai(&codecs[i]); @@ -891,24 +913,11 @@ static int soc_bind_dai_link(struct snd_soc_card *card, /* Single codec links expect codec and codec_dai in runtime data */ rtd->codec_dai = codec_dais[0]; - /* if there's no platform we match on the empty platform */ - platform_name = dai_link->platform_name; - if (!platform_name && !dai_link->platform_of_node) - platform_name = "snd-soc-dummy"; - /* find one from the set of registered platforms */ - list_for_each_entry(component, &component_list, list) { - platform_of_node = component->dev->of_node; - if (!platform_of_node && component->dev->parent->of_node) - platform_of_node = component->dev->parent->of_node; - - if (dai_link->platform_of_node) { - if (platform_of_node != dai_link->platform_of_node) - continue; - } else { - if (strcmp(component->name, platform_name)) - continue; - } + for_each_component(component) { + if (!snd_soc_is_matching_component(dai_link->platform, + component)) + continue; snd_soc_rtdcom_add(rtd, component); } @@ -918,7 +927,7 @@ static int soc_bind_dai_link(struct snd_soc_card *card, _err_defer: soc_free_pcm_runtime(rtd); - return -EPROBE_DEFER; + return -EPROBE_DEFER; } static void soc_remove_component(struct snd_soc_component *component) @@ -942,23 +951,25 @@ static void soc_remove_dai(struct snd_soc_dai *dai, int order) { int err; - if (dai && dai->probed && - dai->driver->remove_order == order) { - if (dai->driver->remove) { - err = dai->driver->remove(dai); - if (err < 0) - dev_err(dai->dev, - "ASoC: failed to remove %s: %d\n", - dai->name, err); - } - dai->probed = 0; + if (!dai || !dai->probed || + dai->driver->remove_order != order) + return; + + if (dai->driver->remove) { + err = dai->driver->remove(dai); + if (err < 0) + dev_err(dai->dev, + "ASoC: failed to remove %s: %d\n", + dai->name, err); } + dai->probed = 0; } static void soc_remove_link_dais(struct snd_soc_card *card, struct snd_soc_pcm_runtime *rtd, int order) { int i; + struct snd_soc_dai *codec_dai; /* unregister the rtd device */ if (rtd->dev_registered) { @@ -967,8 +978,8 @@ static void soc_remove_link_dais(struct snd_soc_card *card, } /* remove the CODEC DAI */ - for (i = 0; i < rtd->num_codecs; i++) - soc_remove_dai(rtd->codec_dais[i], order); + for_each_rtd_codec_dai(rtd, i, codec_dai) + soc_remove_dai(codec_dai, order); soc_remove_dai(rtd->cpu_dai, order); } @@ -993,28 +1004,57 @@ static void soc_remove_dai_links(struct snd_soc_card *card) struct snd_soc_pcm_runtime *rtd; struct snd_soc_dai_link *link, *_link; - for (order = SND_SOC_COMP_ORDER_FIRST; order <= SND_SOC_COMP_ORDER_LAST; - order++) { - list_for_each_entry(rtd, &card->rtd_list, list) + for_each_comp_order(order) { + for_each_card_rtds(card, rtd) soc_remove_link_dais(card, rtd, order); } - for (order = SND_SOC_COMP_ORDER_FIRST; order <= SND_SOC_COMP_ORDER_LAST; - order++) { - list_for_each_entry(rtd, &card->rtd_list, list) + for_each_comp_order(order) { + for_each_card_rtds(card, rtd) soc_remove_link_components(card, rtd, order); } - list_for_each_entry_safe(link, _link, &card->dai_link_list, list) { + for_each_card_links_safe(card, link, _link) { if (link->dobj.type == SND_SOC_DOBJ_DAI_LINK) dev_warn(card->dev, "Topology forgot to remove link %s?\n", link->name); list_del(&link->list); - card->num_dai_links--; } } +static int snd_soc_init_platform(struct snd_soc_card *card, + struct snd_soc_dai_link *dai_link) +{ + struct snd_soc_dai_link_component *platform = dai_link->platform; + + /* + * FIXME + * + * this function should be removed in the future + */ + /* convert Legacy platform link */ + if (!platform) { + platform = devm_kzalloc(card->dev, + sizeof(struct snd_soc_dai_link_component), + GFP_KERNEL); + if (!platform) + return -ENOMEM; + + dai_link->platform = platform; + platform->name = dai_link->platform_name; + platform->of_node = dai_link->platform_of_node; + platform->dai_name = NULL; + } + + /* if there's no platform we match on the empty platform */ + if (!platform->name && + !platform->of_node) + platform->name = "snd-soc-dummy"; + + return 0; +} + static int snd_soc_init_multicodec(struct snd_soc_card *card, struct snd_soc_dai_link *dai_link) { @@ -1043,9 +1083,16 @@ static int snd_soc_init_multicodec(struct snd_soc_card *card, } static int soc_init_dai_link(struct snd_soc_card *card, - struct snd_soc_dai_link *link) + struct snd_soc_dai_link *link) { int i, ret; + struct snd_soc_dai_link_component *codec; + + ret = snd_soc_init_platform(card, link); + if (ret) { + dev_err(card->dev, "ASoC: failed to init multiplatform\n"); + return ret; + } ret = snd_soc_init_multicodec(card, link); if (ret) { @@ -1053,19 +1100,19 @@ static int soc_init_dai_link(struct snd_soc_card *card, return ret; } - for (i = 0; i < link->num_codecs; i++) { + for_each_link_codecs(link, i, codec) { /* * Codec must be specified by 1 of name or OF node, * not both or neither. */ - if (!!link->codecs[i].name == - !!link->codecs[i].of_node) { + if (!!codec->name == + !!codec->of_node) { dev_err(card->dev, "ASoC: Neither/both codec name/of_node are set for %s\n", link->name); return -EINVAL; } /* Codec DAI name must be specified */ - if (!link->codecs[i].dai_name) { + if (!codec->dai_name) { dev_err(card->dev, "ASoC: codec_dai_name not set for %s\n", link->name); return -EINVAL; @@ -1076,13 +1123,12 @@ static int soc_init_dai_link(struct snd_soc_card *card, * Platform may be specified by either name or OF node, but * can be left unspecified, and a dummy platform will be used. */ - if (link->platform_name && link->platform_of_node) { + if (link->platform->name && link->platform->of_node) { dev_err(card->dev, "ASoC: Both platform name/of_node are set for %s\n", link->name); return -EINVAL; } - /* * CPU device may be specified by either name or OF node, but * can be left unspecified, and will be matched based on DAI @@ -1111,7 +1157,8 @@ static int soc_init_dai_link(struct snd_soc_card *card, void snd_soc_disconnect_sync(struct device *dev) { - struct snd_soc_component *component = snd_soc_lookup_component(dev, NULL); + struct snd_soc_component *component = + snd_soc_lookup_component(dev, NULL); if (!component || !component->card) return; @@ -1142,14 +1189,14 @@ int snd_soc_add_dai_link(struct snd_soc_card *card, } lockdep_assert_held(&client_mutex); - /* Notify the machine driver for extra initialization + /* + * Notify the machine driver for extra initialization * on the link created by topology. */ if (dai_link->dobj.type && card->add_dai_link) card->add_dai_link(card, dai_link); list_add_tail(&dai_link->list, &card->dai_link_list); - card->num_dai_links++; return 0; } @@ -1178,16 +1225,16 @@ void snd_soc_remove_dai_link(struct snd_soc_card *card, } lockdep_assert_held(&client_mutex); - /* Notify the machine driver for extra destruction + /* + * Notify the machine driver for extra destruction * on the link created by topology. */ if (dai_link->dobj.type && card->remove_dai_link) card->remove_dai_link(card, dai_link); - list_for_each_entry_safe(link, _link, &card->dai_link_list, list) { + for_each_card_links_safe(card, link, _link) { if (link == dai_link) { list_del(&link->list); - card->num_dai_links--; return; } } @@ -1239,7 +1286,8 @@ static void soc_set_name_prefix(struct snd_soc_card *card, static int soc_probe_component(struct snd_soc_card *card, struct snd_soc_component *component) { - struct snd_soc_dapm_context *dapm = snd_soc_component_get_dapm(component); + struct snd_soc_dapm_context *dapm = + snd_soc_component_get_dapm(component); struct snd_soc_dai *dai; int ret; @@ -1277,7 +1325,7 @@ static int soc_probe_component(struct snd_soc_card *card, } } - list_for_each_entry(dai, &component->dai_list, list) { + for_each_component_dais(component, dai) { ret = snd_soc_dapm_new_dai_widgets(dapm, dai); if (ret != 0) { dev_err(component->dev, @@ -1320,6 +1368,7 @@ static int soc_probe_component(struct snd_soc_card *card, component->driver->num_dapm_routes); list_add(&dapm->list, &card->dapm_list); + /* see for_each_card_components */ list_add(&component->card_list, &card->component_dev_list); return 0; @@ -1370,8 +1419,7 @@ static int soc_post_component_init(struct snd_soc_pcm_runtime *rtd, } static int soc_probe_link_components(struct snd_soc_card *card, - struct snd_soc_pcm_runtime *rtd, - int order) + struct snd_soc_pcm_runtime *rtd, int order) { struct snd_soc_component *component; struct snd_soc_rtdcom_list *rtdcom; @@ -1398,6 +1446,7 @@ static int soc_probe_dai(struct snd_soc_dai *dai, int order) if (dai->driver->probe) { int ret = dai->driver->probe(dai); + if (ret < 0) { dev_err(dai->dev, "ASoC: failed to probe DAI %s: %d\n", dai->name, ret); @@ -1431,48 +1480,6 @@ static int soc_link_dai_pcm_new(struct snd_soc_dai **dais, int num_dais, return 0; } -static int soc_link_dai_widgets(struct snd_soc_card *card, - struct snd_soc_dai_link *dai_link, - struct snd_soc_pcm_runtime *rtd) -{ - struct snd_soc_dai *cpu_dai = rtd->cpu_dai; - struct snd_soc_dai *codec_dai = rtd->codec_dai; - struct snd_soc_dapm_widget *sink, *source; - int ret; - - if (rtd->num_codecs > 1) - dev_warn(card->dev, "ASoC: Multiple codecs not supported yet\n"); - - /* link the DAI widgets */ - sink = codec_dai->playback_widget; - source = cpu_dai->capture_widget; - if (sink && source) { - ret = snd_soc_dapm_new_pcm(card, rtd, dai_link->params, - dai_link->num_params, - source, sink); - if (ret != 0) { - dev_err(card->dev, "ASoC: Can't link %s to %s: %d\n", - sink->name, source->name, ret); - return ret; - } - } - - sink = cpu_dai->playback_widget; - source = codec_dai->capture_widget; - if (sink && source) { - ret = snd_soc_dapm_new_pcm(card, rtd, dai_link->params, - dai_link->num_params, - source, sink); - if (ret != 0) { - dev_err(card->dev, "ASoC: Can't link %s to %s: %d\n", - sink->name, source->name, ret); - return ret; - } - } - - return 0; -} - static int soc_probe_link_dais(struct snd_soc_card *card, struct snd_soc_pcm_runtime *rtd, int order) { @@ -1480,6 +1487,7 @@ static int soc_probe_link_dais(struct snd_soc_card *card, struct snd_soc_dai *cpu_dai = rtd->cpu_dai; struct snd_soc_rtdcom_list *rtdcom; struct snd_soc_component *component; + struct snd_soc_dai *codec_dai; int i, ret, num; dev_dbg(card->dev, "ASoC: probe %s dai link %d late %d\n", @@ -1493,8 +1501,8 @@ static int soc_probe_link_dais(struct snd_soc_card *card, return ret; /* probe the CODEC DAI */ - for (i = 0; i < rtd->num_codecs; i++) { - ret = soc_probe_dai(rtd->codec_dais[i], order); + for_each_rtd_codec_dai(rtd, i, codec_dai) { + ret = soc_probe_dai(codec_dai, order); if (ret) return ret; } @@ -1546,7 +1554,7 @@ static int soc_probe_link_dais(struct snd_soc_card *card, } if (cpu_dai->driver->compress_new) { - /*create compress_device"*/ + /* create compress_device" */ ret = cpu_dai->driver->compress_new(rtd, num); if (ret < 0) { dev_err(card->dev, "ASoC: can't create compress %s\n", @@ -1560,7 +1568,7 @@ static int soc_probe_link_dais(struct snd_soc_card *card, ret = soc_new_pcm(rtd, num); if (ret < 0) { dev_err(card->dev, "ASoC: can't create pcm %s :%d\n", - dai_link->stream_name, ret); + dai_link->stream_name, ret); return ret; } ret = soc_link_dai_pcm_new(&cpu_dai, 1, rtd); @@ -1573,11 +1581,6 @@ static int soc_probe_link_dais(struct snd_soc_card *card, } else { INIT_DELAYED_WORK(&rtd->delayed_work, codec2codec_close_delayed_work); - - /* link the DAI widgets */ - ret = soc_link_dai_widgets(card, dai_link, rtd); - if (ret) - return ret; } } @@ -1628,8 +1631,7 @@ static int soc_probe_aux_devices(struct snd_soc_card *card) int order; int ret; - for (order = SND_SOC_COMP_ORDER_FIRST; order <= SND_SOC_COMP_ORDER_LAST; - order++) { + for_each_comp_order(order) { list_for_each_entry(comp, &card->aux_comp_list, card_aux_list) { if (comp->driver->probe_order == order) { ret = soc_probe_component(card, comp); @@ -1651,8 +1653,7 @@ static void soc_remove_aux_devices(struct snd_soc_card *card) struct snd_soc_component *comp, *_comp; int order; - for (order = SND_SOC_COMP_ORDER_FIRST; order <= SND_SOC_COMP_ORDER_LAST; - order++) { + for_each_comp_order(order) { list_for_each_entry_safe(comp, _comp, &card->aux_comp_list, card_aux_list) { @@ -1681,14 +1682,12 @@ static void soc_remove_aux_devices(struct snd_soc_card *card) int snd_soc_runtime_set_dai_fmt(struct snd_soc_pcm_runtime *rtd, unsigned int dai_fmt) { - struct snd_soc_dai **codec_dais = rtd->codec_dais; struct snd_soc_dai *cpu_dai = rtd->cpu_dai; + struct snd_soc_dai *codec_dai; unsigned int i; int ret; - for (i = 0; i < rtd->num_codecs; i++) { - struct snd_soc_dai *codec_dai = codec_dais[i]; - + for_each_rtd_codec_dai(rtd, i, codec_dai) { ret = snd_soc_dai_set_fmt(codec_dai, dai_fmt); if (ret != 0 && ret != -ENOTSUPP) { dev_warn(codec_dai->dev, @@ -1697,8 +1696,10 @@ int snd_soc_runtime_set_dai_fmt(struct snd_soc_pcm_runtime *rtd, } } - /* Flip the polarity for the "CPU" end of a CODEC<->CODEC link */ - /* the component which has non_legacy_dai_naming is Codec */ + /* + * Flip the polarity for the "CPU" end of a CODEC<->CODEC link + * the component which has non_legacy_dai_naming is Codec + */ if (cpu_dai->component->driver->non_legacy_dai_naming) { unsigned int inv_dai_fmt; @@ -1732,9 +1733,9 @@ int snd_soc_runtime_set_dai_fmt(struct snd_soc_pcm_runtime *rtd, } EXPORT_SYMBOL_GPL(snd_soc_runtime_set_dai_fmt); - #ifdef CONFIG_DMI -/* Trim special characters, and replace '-' with '_' since '-' is used to +/* + * Trim special characters, and replace '-' with '_' since '-' is used to * separate different DMI fields in the card long name. Only number and * alphabet characters and a few separator characters are kept. */ @@ -1753,7 +1754,8 @@ static void cleanup_dmi_name(char *name) name[j] = '\0'; } -/* Check if a DMI field is valid, i.e. not containing any string +/* + * Check if a DMI field is valid, i.e. not containing any string * in the black list. */ static int is_dmi_valid(const char *field) @@ -1816,7 +1818,6 @@ int snd_soc_set_dmi_name(struct snd_soc_card *card, const char *flavour) return 0; } - snprintf(card->dmi_longname, sizeof(card->snd_card->longname), "%s", vendor); cleanup_dmi_name(card->dmi_longname); @@ -1832,7 +1833,8 @@ int snd_soc_set_dmi_name(struct snd_soc_card *card, const char *flavour) if (len < longname_buf_size) cleanup_dmi_name(card->dmi_longname + len); - /* some vendors like Lenovo may only put a self-explanatory + /* + * some vendors like Lenovo may only put a self-explanatory * name in the product version field */ product_version = dmi_get_system_info(DMI_PRODUCT_VERSION); @@ -1891,7 +1893,7 @@ static void soc_check_tplg_fes(struct snd_soc_card *card) struct snd_soc_dai_link *dai_link; int i; - list_for_each_entry(component, &component_list, list) { + for_each_component(component) { /* does this component override FEs ? */ if (!component->driver->ignore_machine) @@ -1903,9 +1905,7 @@ static void soc_check_tplg_fes(struct snd_soc_card *card) continue; /* machine matches, so override the rtd data */ - for (i = 0; i < card->num_links; i++) { - - dai_link = &card->dai_link[i]; + for_each_card_prelinks(card, i, dai_link) { /* ignore this FE */ if (dai_link->dynamic) { @@ -1917,7 +1917,11 @@ static void soc_check_tplg_fes(struct snd_soc_card *card) card->dai_link[i].name); /* override platform component */ - dai_link->platform_name = component->name; + if (snd_soc_init_platform(card, dai_link) < 0) { + dev_err(card->dev, "init platform error"); + continue; + } + dai_link->platform->name = component->name; /* convert non BE into BE */ dai_link->no_pcm = 1; @@ -1926,7 +1930,8 @@ static void soc_check_tplg_fes(struct snd_soc_card *card) dai_link->be_hw_params_fixup = component->driver->be_hw_params_fixup; - /* most BE links don't set stream name, so set it to + /* + * most BE links don't set stream name, so set it to * dai link name if it's NULL to help bind widgets. */ if (!dai_link->stream_name) @@ -1936,7 +1941,7 @@ static void soc_check_tplg_fes(struct snd_soc_card *card) /* Inform userspace we are using alternate topology */ if (component->driver->topology_name_prefix) { - /* topology shortname created ? */ + /* topology shortname created? */ if (!card->topology_shortname_created) { comp_drv = component->driver; @@ -1965,8 +1970,8 @@ static int snd_soc_instantiate_card(struct snd_soc_card *card) soc_check_tplg_fes(card); /* bind DAIs */ - for (i = 0; i < card->num_links; i++) { - ret = soc_bind_dai_link(card, &card->dai_link[i]); + for_each_card_prelinks(card, i, dai_link) { + ret = soc_bind_dai_link(card, dai_link); if (ret != 0) goto base_error; } @@ -1979,8 +1984,8 @@ static int snd_soc_instantiate_card(struct snd_soc_card *card) } /* add predefined DAI links to the list */ - for (i = 0; i < card->num_links; i++) - snd_soc_add_dai_link(card, card->dai_link+i); + for_each_card_prelinks(card, i, dai_link) + snd_soc_add_dai_link(card, dai_link); /* card bind complete so register a sound card */ ret = snd_card_new(card->dev, SNDRV_DEFAULT_IDX1, SNDRV_DEFAULT_STR1, @@ -2024,9 +2029,8 @@ static int snd_soc_instantiate_card(struct snd_soc_card *card) } /* probe all components used by DAI links on this card */ - for (order = SND_SOC_COMP_ORDER_FIRST; order <= SND_SOC_COMP_ORDER_LAST; - order++) { - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_comp_order(order) { + for_each_card_rtds(card, rtd) { ret = soc_probe_link_components(card, rtd, order); if (ret < 0) { dev_err(card->dev, @@ -2042,10 +2046,11 @@ static int snd_soc_instantiate_card(struct snd_soc_card *card) if (ret < 0) goto probe_dai_err; - /* Find new DAI links added during probing components and bind them. + /* + * Find new DAI links added during probing components and bind them. * Components with topology may bring new DAIs and DAI links. */ - list_for_each_entry(dai_link, &card->dai_link_list, list) { + for_each_card_links(card, dai_link) { if (soc_is_dai_link_bound(card, dai_link)) continue; @@ -2058,9 +2063,8 @@ static int snd_soc_instantiate_card(struct snd_soc_card *card) } /* probe all DAI links on this card */ - for (order = SND_SOC_COMP_ORDER_FIRST; order <= SND_SOC_COMP_ORDER_LAST; - order++) { - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_comp_order(order) { + for_each_card_rtds(card, rtd) { ret = soc_probe_link_dais(card, rtd, order); if (ret < 0) { dev_err(card->dev, @@ -2075,7 +2079,8 @@ static int snd_soc_instantiate_card(struct snd_soc_card *card) snd_soc_dapm_connect_dai_link_widgets(card); if (card->controls) - snd_soc_add_card_controls(card, card->controls, card->num_controls); + snd_soc_add_card_controls(card, card->controls, + card->num_controls); if (card->dapm_routes) snd_soc_dapm_add_routes(&card->dapm, card->dapm_routes, @@ -2181,7 +2186,7 @@ static int soc_cleanup_card_resources(struct snd_soc_card *card) struct snd_soc_pcm_runtime *rtd; /* make sure any delayed work runs */ - list_for_each_entry(rtd, &card->rtd_list, list) + for_each_card_rtds(card, rtd) flush_delayed_work(&rtd->delayed_work); /* free the ALSA card at first; this syncs with pending operations */ @@ -2221,21 +2226,23 @@ int snd_soc_poweroff(struct device *dev) if (!card->instantiated) return 0; - /* Flush out pmdown_time work - we actually do want to run it - * now, we're shutting down so no imminent restart. */ - list_for_each_entry(rtd, &card->rtd_list, list) + /* + * Flush out pmdown_time work - we actually do want to run it + * now, we're shutting down so no imminent restart. + */ + for_each_card_rtds(card, rtd) flush_delayed_work(&rtd->delayed_work); snd_soc_dapm_shutdown(card); /* deactivate pins to sleep state */ - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { struct snd_soc_dai *cpu_dai = rtd->cpu_dai; + struct snd_soc_dai *codec_dai; int i; pinctrl_pm_select_sleep_state(cpu_dai->dev); - for (i = 0; i < rtd->num_codecs; i++) { - struct snd_soc_dai *codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { pinctrl_pm_select_sleep_state(codec_dai->dev); } } @@ -2315,6 +2322,7 @@ static int snd_soc_add_controls(struct snd_card *card, struct device *dev, for (i = 0; i < num_controls; i++) { const struct snd_kcontrol_new *control = &controls[i]; + err = snd_ctl_add(card, snd_soc_cnew(control, data, control->name, prefix)); if (err < 0) { @@ -2432,8 +2440,9 @@ EXPORT_SYMBOL_GPL(snd_soc_dai_set_sysclk); * * Configures the CODEC master (MCLK) or system (SYSCLK) clocking. */ -int snd_soc_component_set_sysclk(struct snd_soc_component *component, int clk_id, - int source, unsigned int freq, int dir) +int snd_soc_component_set_sysclk(struct snd_soc_component *component, + int clk_id, int source, unsigned int freq, + int dir) { if (component->driver->set_sysclk) return component->driver->set_sysclk(component, clk_id, source, @@ -2501,7 +2510,7 @@ int snd_soc_component_set_pll(struct snd_soc_component *component, int pll_id, { if (component->driver->set_pll) return component->driver->set_pll(component, pll_id, source, - freq_in, freq_out); + freq_in, freq_out); return -EINVAL; } @@ -2532,8 +2541,6 @@ EXPORT_SYMBOL_GPL(snd_soc_dai_set_bclk_ratio); */ int snd_soc_dai_set_fmt(struct snd_soc_dai *dai, unsigned int fmt) { - if (dai->driver == NULL) - return -EINVAL; if (dai->driver->ops->set_fmt == NULL) return -ENOTSUPP; return dai->driver->ops->set_fmt(dai, fmt); @@ -2549,8 +2556,8 @@ EXPORT_SYMBOL_GPL(snd_soc_dai_set_fmt); * Generates the TDM tx and rx slot default masks for DAI. */ static int snd_soc_xlate_tdm_slot_mask(unsigned int slots, - unsigned int *tx_mask, - unsigned int *rx_mask) + unsigned int *tx_mask, + unsigned int *rx_mask) { if (*tx_mask || *rx_mask) return 0; @@ -2680,9 +2687,6 @@ EXPORT_SYMBOL_GPL(snd_soc_dai_set_tristate); int snd_soc_dai_digital_mute(struct snd_soc_dai *dai, int mute, int direction) { - if (!dai->driver) - return -ENOTSUPP; - if (dai->driver->ops->mute_stream) return dai->driver->ops->mute_stream(dai, mute, direction); else if (direction == SNDRV_PCM_STREAM_PLAYBACK && @@ -2693,6 +2697,33 @@ int snd_soc_dai_digital_mute(struct snd_soc_dai *dai, int mute, } EXPORT_SYMBOL_GPL(snd_soc_dai_digital_mute); +static int snd_soc_bind_card(struct snd_soc_card *card) +{ + struct snd_soc_pcm_runtime *rtd; + int ret; + + ret = snd_soc_instantiate_card(card); + if (ret != 0) + return ret; + + /* deactivate pins to sleep state */ + for_each_card_rtds(card, rtd) { + struct snd_soc_dai *cpu_dai = rtd->cpu_dai; + struct snd_soc_dai *codec_dai; + int j; + + for_each_rtd_codec_dai(rtd, j, codec_dai) { + if (!codec_dai->active) + pinctrl_pm_select_sleep_state(codec_dai->dev); + } + + if (!cpu_dai->active) + pinctrl_pm_select_sleep_state(cpu_dai->dev); + } + + return ret; +} + /** * snd_soc_register_card - Register a card with the ASoC core * @@ -2702,13 +2733,12 @@ EXPORT_SYMBOL_GPL(snd_soc_dai_digital_mute); int snd_soc_register_card(struct snd_soc_card *card) { int i, ret; - struct snd_soc_pcm_runtime *rtd; + struct snd_soc_dai_link *link; if (!card->name || !card->dev) return -EINVAL; - for (i = 0; i < card->num_links; i++) { - struct snd_soc_dai_link *link = &card->dai_link[i]; + for_each_card_prelinks(card, i, link) { ret = soc_init_dai_link(card, link); if (ret) { @@ -2723,7 +2753,6 @@ int snd_soc_register_card(struct snd_soc_card *card) snd_soc_initialize_card_lists(card); INIT_LIST_HEAD(&card->dai_link_list); - card->num_dai_links = 0; INIT_LIST_HEAD(&card->rtd_list); card->num_rtd = 0; @@ -2734,28 +2763,23 @@ int snd_soc_register_card(struct snd_soc_card *card) mutex_init(&card->mutex); mutex_init(&card->dapm_mutex); - ret = snd_soc_instantiate_card(card); - if (ret != 0) - return ret; - - /* deactivate pins to sleep state */ - list_for_each_entry(rtd, &card->rtd_list, list) { - struct snd_soc_dai *cpu_dai = rtd->cpu_dai; - int j; - - for (j = 0; j < rtd->num_codecs; j++) { - struct snd_soc_dai *codec_dai = rtd->codec_dais[j]; - if (!codec_dai->active) - pinctrl_pm_select_sleep_state(codec_dai->dev); - } + return snd_soc_bind_card(card); +} +EXPORT_SYMBOL_GPL(snd_soc_register_card); - if (!cpu_dai->active) - pinctrl_pm_select_sleep_state(cpu_dai->dev); +static void snd_soc_unbind_card(struct snd_soc_card *card, bool unregister) +{ + if (card->instantiated) { + card->instantiated = false; + snd_soc_dapm_shutdown(card); + soc_cleanup_card_resources(card); + if (!unregister) + list_add(&card->list, &unbind_card_list); + } else { + if (unregister) + list_del(&card->list); } - - return ret; } -EXPORT_SYMBOL_GPL(snd_soc_register_card); /** * snd_soc_unregister_card - Unregister a card with the ASoC core @@ -2765,12 +2789,8 @@ EXPORT_SYMBOL_GPL(snd_soc_register_card); */ int snd_soc_unregister_card(struct snd_soc_card *card) { - if (card->instantiated) { - card->instantiated = false; - snd_soc_dapm_shutdown(card); - soc_cleanup_card_resources(card); - dev_dbg(card->dev, "ASoC: Unregistered card '%s'\n", card->name); - } + snd_soc_unbind_card(card, true); + dev_dbg(card->dev, "ASoC: Unregistered card '%s'\n", card->name); return 0; } @@ -2802,7 +2822,7 @@ static char *fmt_single_name(struct device *dev, int *id) } } else { - /* I2C component devices are named "bus-addr" */ + /* I2C component devices are named "bus-addr" */ if (sscanf(name, "%x-%x", &id1, &id2) == 2) { char tmp[NAME_SIZE]; @@ -2810,7 +2830,8 @@ static char *fmt_single_name(struct device *dev, int *id) *id = ((id1 & 0xffff) << 16) + id2; /* sanitize component name for DAI link creation */ - snprintf(tmp, NAME_SIZE, "%s.%s", dev->driver->name, name); + snprintf(tmp, NAME_SIZE, "%s.%s", dev->driver->name, + name); strlcpy(name, tmp, NAME_SIZE); } else *id = 0; @@ -2845,7 +2866,7 @@ static void snd_soc_unregister_dais(struct snd_soc_component *component) { struct snd_soc_dai *dai, *_dai; - list_for_each_entry_safe(dai, _dai, &component->dai_list, list) { + for_each_component_dais_safe(component, dai, _dai) { dev_dbg(component->dev, "ASoC: Unregistered DAI '%s'\n", dai->name); list_del(&dai->list); @@ -2877,7 +2898,7 @@ static struct snd_soc_dai *soc_add_dai(struct snd_soc_component *component, * component-less anymore. */ if (legacy_dai_naming && - (dai_drv->id == 0 || dai_drv->name == NULL)) { + (dai_drv->id == 0 || dai_drv->name == NULL)) { dai->name = fmt_single_name(dev, &dai->id); } else { dai->name = fmt_multiple_name(dev, dai_drv); @@ -2897,6 +2918,7 @@ static struct snd_soc_dai *soc_add_dai(struct snd_soc_component *component, if (!dai->driver->ops) dai->driver->ops = &null_dai_ops; + /* see for_each_component_dais */ list_add_tail(&dai->list, &component->dai_list); component->num_dai++; @@ -2910,11 +2932,10 @@ static struct snd_soc_dai *soc_add_dai(struct snd_soc_component *component, * @component: The component the DAIs are registered for * @dai_drv: DAI driver to use for the DAIs * @count: Number of DAIs - * @legacy_dai_naming: Use the legacy naming scheme and let the DAI inherit the - * parent's name. */ static int snd_soc_register_dais(struct snd_soc_component *component, - struct snd_soc_dai_driver *dai_drv, size_t count) + struct snd_soc_dai_driver *dai_drv, + size_t count) { struct device *dev = component->dev; struct snd_soc_dai *dai; @@ -2925,8 +2946,8 @@ static int snd_soc_register_dais(struct snd_soc_component *component, for (i = 0; i < count; i++) { - dai = soc_add_dai(component, dai_drv + i, - count == 1 && !component->driver->non_legacy_dai_naming); + dai = soc_add_dai(component, dai_drv + i, count == 1 && + !component->driver->non_legacy_dai_naming); if (dai == NULL) { ret = -ENOMEM; goto err; @@ -2970,7 +2991,8 @@ int snd_soc_register_dai(struct snd_soc_component *component, if (!dai) return -ENOMEM; - /* Create the DAI widgets here. After adding DAIs, topology may + /* + * Create the DAI widgets here. After adding DAIs, topology may * also add routes that need these widgets as source or sink. */ ret = snd_soc_dapm_new_dai_widgets(dapm, dai); @@ -3052,7 +3074,8 @@ static void snd_soc_component_setup_regmap(struct snd_soc_component *component) #ifdef CONFIG_REGMAP /** - * snd_soc_component_init_regmap() - Initialize regmap instance for the component + * snd_soc_component_init_regmap() - Initialize regmap instance for the + * component * @component: The component for which to initialize the regmap instance * @regmap: The regmap instance that should be used by the component * @@ -3070,7 +3093,8 @@ void snd_soc_component_init_regmap(struct snd_soc_component *component, EXPORT_SYMBOL_GPL(snd_soc_component_init_regmap); /** - * snd_soc_component_exit_regmap() - De-initialize regmap instance for the component + * snd_soc_component_exit_regmap() - De-initialize regmap instance for the + * component * @component: The component for which to de-initialize the regmap instance * * Calls regmap_exit() on the regmap instance associated to the component and @@ -3094,11 +3118,13 @@ static void snd_soc_component_add(struct snd_soc_component *component) if (!component->driver->write && !component->driver->read) { if (!component->regmap) - component->regmap = dev_get_regmap(component->dev, NULL); + component->regmap = dev_get_regmap(component->dev, + NULL); if (component->regmap) snd_soc_component_setup_regmap(component); } + /* see for_each_component */ list_add(&component->list, &component_list); INIT_LIST_HEAD(&component->dobj_list); @@ -3116,7 +3142,7 @@ static void snd_soc_component_del_unlocked(struct snd_soc_component *component) struct snd_soc_card *card = component->card; if (card) - snd_soc_unregister_card(card); + snd_soc_unbind_card(card, false); list_del(&component->list); } @@ -3156,6 +3182,18 @@ static void convert_endianness_formats(struct snd_soc_pcm_stream *stream) stream->formats |= endianness_format_map[i]; } +static void snd_soc_try_rebind_card(void) +{ + struct snd_soc_card *card, *c; + + if (!list_empty(&unbind_card_list)) { + list_for_each_entry_safe(card, c, &unbind_card_list, list) { + if (!snd_soc_bind_card(card)) + list_del(&card->list); + } + } +} + int snd_soc_add_component(struct device *dev, struct snd_soc_component *component, const struct snd_soc_component_driver *component_driver, @@ -3183,6 +3221,7 @@ int snd_soc_add_component(struct device *dev, } snd_soc_component_add(component); + snd_soc_try_rebind_card(); return 0; @@ -3221,27 +3260,28 @@ static int __snd_soc_unregister_component(struct device *dev) int found = 0; mutex_lock(&client_mutex); - list_for_each_entry(component, &component_list, list) { + for_each_component(component) { if (dev != component->dev) continue; - snd_soc_tplg_component_remove(component, SND_SOC_TPLG_INDEX_ALL); + snd_soc_tplg_component_remove(component, + SND_SOC_TPLG_INDEX_ALL); snd_soc_component_del_unlocked(component); found = 1; break; } mutex_unlock(&client_mutex); - if (found) { + if (found) snd_soc_component_cleanup(component); - } return found; } void snd_soc_unregister_component(struct device *dev) { - while (__snd_soc_unregister_component(dev)); + while (__snd_soc_unregister_component(dev)) + ; } EXPORT_SYMBOL_GPL(snd_soc_unregister_component); @@ -3253,7 +3293,7 @@ struct snd_soc_component *snd_soc_lookup_component(struct device *dev, ret = NULL; mutex_lock(&client_mutex); - list_for_each_entry(component, &component_list, list) { + for_each_component(component) { if (dev != component->dev) continue; @@ -3653,7 +3693,7 @@ int snd_soc_get_dai_id(struct device_node *ep) */ ret = -ENOTSUPP; mutex_lock(&client_mutex); - list_for_each_entry(pos, &component_list, list) { + for_each_component(pos) { struct device_node *component_of_node = pos->dev->of_node; if (!component_of_node && pos->dev->parent) @@ -3683,7 +3723,7 @@ int snd_soc_get_dai_name(struct of_phandle_args *args, int ret = -EPROBE_DEFER; mutex_lock(&client_mutex); - list_for_each_entry(pos, &component_list, list) { + for_each_component(pos) { component_of_node = pos->dev->of_node; if (!component_of_node && pos->dev->parent) component_of_node = pos->dev->parent->of_node; @@ -3719,7 +3759,7 @@ int snd_soc_get_dai_name(struct of_phandle_args *args, ret = 0; /* find target DAI */ - list_for_each_entry(dai, &pos->dai_list, list) { + for_each_component_dais(pos, dai) { if (id == 0) break; id--; @@ -3764,10 +3804,10 @@ EXPORT_SYMBOL_GPL(snd_soc_of_get_dai_name); */ void snd_soc_of_put_dai_link_codecs(struct snd_soc_dai_link *dai_link) { - struct snd_soc_dai_link_component *component = dai_link->codecs; + struct snd_soc_dai_link_component *component; int index; - for (index = 0; index < dai_link->num_codecs; index++, component++) { + for_each_link_codecs(dai_link, index, component) { if (!component->of_node) break; of_node_put(component->of_node); @@ -3819,12 +3859,10 @@ int snd_soc_of_get_dai_link_codecs(struct device *dev, dai_link->num_codecs = num_codecs; /* Parse the list */ - for (index = 0, component = dai_link->codecs; - index < dai_link->num_codecs; - index++, component++) { + for_each_link_codecs(dai_link, index, component) { ret = of_parse_phandle_with_args(of_node, name, "#sound-dai-cells", - index, &args); + index, &args); if (ret) goto err; component->of_node = args.np; diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 461d951917c0..a5178845065b 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -18,7 +18,6 @@ // device reopen. #include -#include #include #include #include @@ -364,10 +363,6 @@ static int dapm_kcontrol_data_alloc(struct snd_soc_dapm_widget *widget, ret = PTR_ERR(data->widget); goto err_data; } - if (!data->widget) { - ret = -ENOMEM; - goto err_data; - } } break; case snd_soc_dapm_demux: @@ -402,10 +397,6 @@ static int dapm_kcontrol_data_alloc(struct snd_soc_dapm_widget *widget, ret = PTR_ERR(data->widget); goto err_data; } - if (!data->widget) { - ret = -ENOMEM; - goto err_data; - } snd_soc_dapm_add_path(widget->dapm, data->widget, widget, NULL, NULL); @@ -1026,9 +1017,10 @@ static int dapm_new_dai_link(struct snd_soc_dapm_widget *w) struct snd_kcontrol *kcontrol; struct snd_soc_dapm_context *dapm = w->dapm; struct snd_card *card = dapm->card->snd_card; + struct snd_soc_pcm_runtime *rtd = w->priv; /* create control for links with > 1 config */ - if (w->num_params <= 1) + if (rtd->dai_link->num_params <= 1) return 0; /* add kcontrol */ @@ -1320,14 +1312,13 @@ int dapm_clock_event(struct snd_soc_dapm_widget *w, soc_dapm_async_complete(w->dapm); -#ifdef CONFIG_HAVE_CLK if (SND_SOC_DAPM_EVENT_ON(event)) { return clk_prepare_enable(w->clk); } else { clk_disable_unprepare(w->clk); return 0; } -#endif + return 0; } EXPORT_SYMBOL_GPL(dapm_clock_event); @@ -1953,7 +1944,7 @@ static int dapm_power_widgets(struct snd_soc_card *card, int event) dapm_pre_sequence_async(&card->dapm, 0); /* Run other bias changes in parallel */ list_for_each_entry(d, &card->dapm_list, list) { - if (d != &card->dapm) + if (d != &card->dapm && d->bias_level != d->target_bias_level) async_schedule_domain(dapm_pre_sequence_async, d, &async_domain); } @@ -1977,7 +1968,7 @@ static int dapm_power_widgets(struct snd_soc_card *card, int event) /* Run all the bias changes in parallel */ list_for_each_entry(d, &card->dapm_list, list) { - if (d != &card->dapm) + if (d != &card->dapm && d->bias_level != d->target_bias_level) async_schedule_domain(dapm_post_sequence_async, d, &async_domain); } @@ -2371,12 +2362,13 @@ static ssize_t dapm_widget_show(struct device *dev, struct device_attribute *attr, char *buf) { struct snd_soc_pcm_runtime *rtd = dev_get_drvdata(dev); + struct snd_soc_dai *codec_dai; int i, count = 0; mutex_lock(&rtd->card->dapm_mutex); - for (i = 0; i < rtd->num_codecs; i++) { - struct snd_soc_component *cmpnt = rtd->codec_dais[i]->component; + for_each_rtd_codec_dai(rtd, i, codec_dai) { + struct snd_soc_component *cmpnt = codec_dai->component; count += dapm_widget_show_component(cmpnt, buf + count); } @@ -3425,35 +3417,6 @@ int snd_soc_dapm_put_pin_switch(struct snd_kcontrol *kcontrol, } EXPORT_SYMBOL_GPL(snd_soc_dapm_put_pin_switch); -struct snd_soc_dapm_widget * -snd_soc_dapm_new_control(struct snd_soc_dapm_context *dapm, - const struct snd_soc_dapm_widget *widget) -{ - struct snd_soc_dapm_widget *w; - - mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME); - w = snd_soc_dapm_new_control_unlocked(dapm, widget); - /* Do not nag about probe deferrals */ - if (IS_ERR(w)) { - int ret = PTR_ERR(w); - - if (ret != -EPROBE_DEFER) - dev_err(dapm->dev, - "ASoC: Failed to create DAPM control %s (%d)\n", - widget->name, ret); - goto out_unlock; - } - if (!w) - dev_err(dapm->dev, - "ASoC: Failed to create DAPM control %s\n", - widget->name); - -out_unlock: - mutex_unlock(&dapm->card->dapm_mutex); - return w; -} -EXPORT_SYMBOL_GPL(snd_soc_dapm_new_control); - struct snd_soc_dapm_widget * snd_soc_dapm_new_control_unlocked(struct snd_soc_dapm_context *dapm, const struct snd_soc_dapm_widget *widget) @@ -3464,53 +3427,37 @@ snd_soc_dapm_new_control_unlocked(struct snd_soc_dapm_context *dapm, int ret; if ((w = dapm_cnew_widget(widget)) == NULL) - return NULL; + return ERR_PTR(-ENOMEM); switch (w->id) { case snd_soc_dapm_regulator_supply: w->regulator = devm_regulator_get(dapm->dev, w->name); if (IS_ERR(w->regulator)) { ret = PTR_ERR(w->regulator); - if (ret == -EPROBE_DEFER) - return ERR_PTR(ret); - dev_err(dapm->dev, "ASoC: Failed to request %s: %d\n", - w->name, ret); - return NULL; + goto request_failed; } if (w->on_val & SND_SOC_DAPM_REGULATOR_BYPASS) { ret = regulator_allow_bypass(w->regulator, true); if (ret != 0) - dev_warn(w->dapm->dev, + dev_warn(dapm->dev, "ASoC: Failed to bypass %s: %d\n", w->name, ret); } break; case snd_soc_dapm_pinctrl: w->pinctrl = devm_pinctrl_get(dapm->dev); - if (IS_ERR_OR_NULL(w->pinctrl)) { + if (IS_ERR(w->pinctrl)) { ret = PTR_ERR(w->pinctrl); - if (ret == -EPROBE_DEFER) - return ERR_PTR(ret); - dev_err(dapm->dev, "ASoC: Failed to request %s: %d\n", - w->name, ret); - return NULL; + goto request_failed; } break; case snd_soc_dapm_clock_supply: -#ifdef CONFIG_CLKDEV_LOOKUP w->clk = devm_clk_get(dapm->dev, w->name); if (IS_ERR(w->clk)) { ret = PTR_ERR(w->clk); - if (ret == -EPROBE_DEFER) - return ERR_PTR(ret); - dev_err(dapm->dev, "ASoC: Failed to request %s: %d\n", - w->name, ret); - return NULL; + goto request_failed; } -#else - return NULL; -#endif break; default: break; @@ -3523,7 +3470,7 @@ snd_soc_dapm_new_control_unlocked(struct snd_soc_dapm_context *dapm, w->name = kstrdup_const(widget->name, GFP_KERNEL); if (w->name == NULL) { kfree(w); - return NULL; + return ERR_PTR(-ENOMEM); } switch (w->id) { @@ -3600,7 +3547,37 @@ snd_soc_dapm_new_control_unlocked(struct snd_soc_dapm_context *dapm, /* machine layer sets up unconnected pins and insertions */ w->connected = 1; return w; + +request_failed: + if (ret != -EPROBE_DEFER) + dev_err(dapm->dev, "ASoC: Failed to request %s: %d\n", + w->name, ret); + + return ERR_PTR(ret); +} + +/** + * snd_soc_dapm_new_control - create new dapm control + * @dapm: DAPM context + * @widget: widget template + * + * Creates new DAPM control based upon a template. + * + * Returns a widget pointer on success or an error pointer on failure + */ +struct snd_soc_dapm_widget * +snd_soc_dapm_new_control(struct snd_soc_dapm_context *dapm, + const struct snd_soc_dapm_widget *widget) +{ + struct snd_soc_dapm_widget *w; + + mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME); + w = snd_soc_dapm_new_control_unlocked(dapm, widget); + mutex_unlock(&dapm->card->dapm_mutex); + + return w; } +EXPORT_SYMBOL_GPL(snd_soc_dapm_new_control); /** * snd_soc_dapm_new_controls - create new dapm controls @@ -3625,19 +3602,6 @@ int snd_soc_dapm_new_controls(struct snd_soc_dapm_context *dapm, w = snd_soc_dapm_new_control_unlocked(dapm, widget); if (IS_ERR(w)) { ret = PTR_ERR(w); - /* Do not nag about probe deferrals */ - if (ret == -EPROBE_DEFER) - break; - dev_err(dapm->dev, - "ASoC: Failed to create DAPM control %s (%d)\n", - widget->name, ret); - break; - } - if (!w) { - dev_err(dapm->dev, - "ASoC: Failed to create DAPM control %s\n", - widget->name); - ret = -ENOMEM; break; } widget++; @@ -3650,32 +3614,23 @@ EXPORT_SYMBOL_GPL(snd_soc_dapm_new_controls); static int snd_soc_dai_link_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event) { - struct snd_soc_dapm_path *source_p, *sink_p; + struct snd_soc_dapm_path *path; struct snd_soc_dai *source, *sink; struct snd_soc_pcm_runtime *rtd = w->priv; - const struct snd_soc_pcm_stream *config = w->params + w->params_select; + const struct snd_soc_pcm_stream *config; struct snd_pcm_substream substream; struct snd_pcm_hw_params *params = NULL; struct snd_pcm_runtime *runtime = NULL; unsigned int fmt; - int ret; + int ret = 0; + + config = rtd->dai_link->params + rtd->params_select; if (WARN_ON(!config) || WARN_ON(list_empty(&w->edges[SND_SOC_DAPM_DIR_OUT]) || list_empty(&w->edges[SND_SOC_DAPM_DIR_IN]))) return -EINVAL; - /* We only support a single source and sink, pick the first */ - source_p = list_first_entry(&w->edges[SND_SOC_DAPM_DIR_OUT], - struct snd_soc_dapm_path, - list_node[SND_SOC_DAPM_DIR_OUT]); - sink_p = list_first_entry(&w->edges[SND_SOC_DAPM_DIR_IN], - struct snd_soc_dapm_path, - list_node[SND_SOC_DAPM_DIR_IN]); - - source = source_p->source->priv; - sink = sink_p->sink->priv; - /* Be a little careful as we don't want to overflow the mask array */ if (config->formats) { fmt = ffs(config->formats) - 1; @@ -3717,59 +3672,95 @@ static int snd_soc_dai_link_event(struct snd_soc_dapm_widget *w, switch (event) { case SND_SOC_DAPM_PRE_PMU: substream.stream = SNDRV_PCM_STREAM_CAPTURE; - if (source->driver->ops->startup) { - ret = source->driver->ops->startup(&substream, source); - if (ret < 0) { - dev_err(source->dev, - "ASoC: startup() failed: %d\n", ret); - goto out; + snd_soc_dapm_widget_for_each_source_path(w, path) { + source = path->source->priv; + + if (source->driver->ops->startup) { + ret = source->driver->ops->startup(&substream, + source); + if (ret < 0) { + dev_err(source->dev, + "ASoC: startup() failed: %d\n", + ret); + goto out; + } + source->active++; } - source->active++; + ret = soc_dai_hw_params(&substream, params, source); + if (ret < 0) + goto out; } - ret = soc_dai_hw_params(&substream, params, source); - if (ret < 0) - goto out; substream.stream = SNDRV_PCM_STREAM_PLAYBACK; - if (sink->driver->ops->startup) { - ret = sink->driver->ops->startup(&substream, sink); - if (ret < 0) { - dev_err(sink->dev, - "ASoC: startup() failed: %d\n", ret); - goto out; + snd_soc_dapm_widget_for_each_sink_path(w, path) { + sink = path->sink->priv; + + if (sink->driver->ops->startup) { + ret = sink->driver->ops->startup(&substream, + sink); + if (ret < 0) { + dev_err(sink->dev, + "ASoC: startup() failed: %d\n", + ret); + goto out; + } + sink->active++; } - sink->active++; + ret = soc_dai_hw_params(&substream, params, sink); + if (ret < 0) + goto out; } - ret = soc_dai_hw_params(&substream, params, sink); - if (ret < 0) - goto out; break; case SND_SOC_DAPM_POST_PMU: - ret = snd_soc_dai_digital_mute(sink, 0, - SNDRV_PCM_STREAM_PLAYBACK); - if (ret != 0 && ret != -ENOTSUPP) - dev_warn(sink->dev, "ASoC: Failed to unmute: %d\n", ret); - ret = 0; + snd_soc_dapm_widget_for_each_sink_path(w, path) { + sink = path->sink->priv; + + ret = snd_soc_dai_digital_mute(sink, 0, + SNDRV_PCM_STREAM_PLAYBACK); + if (ret != 0 && ret != -ENOTSUPP) + dev_warn(sink->dev, + "ASoC: Failed to unmute: %d\n", ret); + ret = 0; + } break; case SND_SOC_DAPM_PRE_PMD: - ret = snd_soc_dai_digital_mute(sink, 1, - SNDRV_PCM_STREAM_PLAYBACK); - if (ret != 0 && ret != -ENOTSUPP) - dev_warn(sink->dev, "ASoC: Failed to mute: %d\n", ret); - ret = 0; + snd_soc_dapm_widget_for_each_sink_path(w, path) { + sink = path->sink->priv; + + ret = snd_soc_dai_digital_mute(sink, 1, + SNDRV_PCM_STREAM_PLAYBACK); + if (ret != 0 && ret != -ENOTSUPP) + dev_warn(sink->dev, + "ASoC: Failed to mute: %d\n", ret); + ret = 0; + } - source->active--; - if (source->driver->ops->shutdown) { - substream.stream = SNDRV_PCM_STREAM_CAPTURE; - source->driver->ops->shutdown(&substream, source); + substream.stream = SNDRV_PCM_STREAM_CAPTURE; + snd_soc_dapm_widget_for_each_source_path(w, path) { + source = path->source->priv; + + if (source->driver->ops->hw_free) + source->driver->ops->hw_free(&substream, + source); + + source->active--; + if (source->driver->ops->shutdown) + source->driver->ops->shutdown(&substream, + source); } - sink->active--; - if (sink->driver->ops->shutdown) { - substream.stream = SNDRV_PCM_STREAM_PLAYBACK; - sink->driver->ops->shutdown(&substream, sink); + substream.stream = SNDRV_PCM_STREAM_PLAYBACK; + snd_soc_dapm_widget_for_each_sink_path(w, path) { + sink = path->sink->priv; + + if (sink->driver->ops->hw_free) + sink->driver->ops->hw_free(&substream, sink); + + sink->active--; + if (sink->driver->ops->shutdown) + sink->driver->ops->shutdown(&substream, sink); } break; @@ -3788,8 +3779,9 @@ static int snd_soc_dapm_dai_link_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_soc_dapm_widget *w = snd_kcontrol_chip(kcontrol); + struct snd_soc_pcm_runtime *rtd = w->priv; - ucontrol->value.enumerated.item[0] = w->params_select; + ucontrol->value.enumerated.item[0] = rtd->params_select; return 0; } @@ -3798,18 +3790,19 @@ static int snd_soc_dapm_dai_link_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_soc_dapm_widget *w = snd_kcontrol_chip(kcontrol); + struct snd_soc_pcm_runtime *rtd = w->priv; /* Can't change the config when widget is already powered */ if (w->power) return -EBUSY; - if (ucontrol->value.enumerated.item[0] == w->params_select) + if (ucontrol->value.enumerated.item[0] == rtd->params_select) return 0; - if (ucontrol->value.enumerated.item[0] >= w->num_params) + if (ucontrol->value.enumerated.item[0] >= rtd->dai_link->num_params) return -EINVAL; - w->params_select = ucontrol->value.enumerated.item[0]; + rtd->params_select = ucontrol->value.enumerated.item[0]; return 0; } @@ -3896,12 +3889,10 @@ outfree_w_param: return NULL; } -int snd_soc_dapm_new_pcm(struct snd_soc_card *card, - struct snd_soc_pcm_runtime *rtd, - const struct snd_soc_pcm_stream *params, - unsigned int num_params, - struct snd_soc_dapm_widget *source, - struct snd_soc_dapm_widget *sink) +static struct snd_soc_dapm_widget * +snd_soc_dapm_new_dai(struct snd_soc_card *card, struct snd_soc_pcm_runtime *rtd, + struct snd_soc_dapm_widget *source, + struct snd_soc_dapm_widget *sink) { struct snd_soc_dapm_widget template; struct snd_soc_dapm_widget *w; @@ -3913,7 +3904,7 @@ int snd_soc_dapm_new_pcm(struct snd_soc_card *card, link_name = devm_kasprintf(card->dev, GFP_KERNEL, "%s-%s", source->name, sink->name); if (!link_name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); memset(&template, 0, sizeof(template)); template.reg = SND_SOC_NOPM; @@ -3925,9 +3916,10 @@ int snd_soc_dapm_new_pcm(struct snd_soc_card *card, template.kcontrol_news = NULL; /* allocate memory for control, only in case of multiple configs */ - if (num_params > 1) { - w_param_text = devm_kcalloc(card->dev, num_params, - sizeof(char *), GFP_KERNEL); + if (rtd->dai_link->num_params > 1) { + w_param_text = devm_kcalloc(card->dev, + rtd->dai_link->num_params, + sizeof(char *), GFP_KERNEL); if (!w_param_text) { ret = -ENOMEM; goto param_fail; @@ -3936,7 +3928,9 @@ int snd_soc_dapm_new_pcm(struct snd_soc_card *card, template.num_kcontrols = 1; template.kcontrol_news = snd_soc_dapm_alloc_kcontrol(card, - link_name, params, num_params, + link_name, + rtd->dai_link->params, + rtd->dai_link->num_params, w_param_text, &private_value); if (!template.kcontrol_news) { ret = -ENOMEM; @@ -3950,37 +3944,20 @@ int snd_soc_dapm_new_pcm(struct snd_soc_card *card, w = snd_soc_dapm_new_control_unlocked(&card->dapm, &template); if (IS_ERR(w)) { ret = PTR_ERR(w); - /* Do not nag about probe deferrals */ - if (ret != -EPROBE_DEFER) - dev_err(card->dev, - "ASoC: Failed to create %s widget (%d)\n", - link_name, ret); - goto outfree_kcontrol_news; - } - if (!w) { - dev_err(card->dev, "ASoC: Failed to create %s widget\n", - link_name); - ret = -ENOMEM; goto outfree_kcontrol_news; } - w->params = params; - w->num_params = num_params; w->priv = rtd; - ret = snd_soc_dapm_add_path(&card->dapm, source, w, NULL, NULL); - if (ret) - goto outfree_w; - return snd_soc_dapm_add_path(&card->dapm, w, sink, NULL, NULL); + return w; -outfree_w: - devm_kfree(card->dev, w); outfree_kcontrol_news: devm_kfree(card->dev, (void *)template.kcontrol_news); - snd_soc_dapm_free_kcontrol(card, &private_value, num_params, w_param_text); + snd_soc_dapm_free_kcontrol(card, &private_value, + rtd->dai_link->num_params, w_param_text); param_fail: devm_kfree(card->dev, link_name); - return ret; + return ERR_PTR(ret); } int snd_soc_dapm_new_dai_widgets(struct snd_soc_dapm_context *dapm, @@ -4003,21 +3980,8 @@ int snd_soc_dapm_new_dai_widgets(struct snd_soc_dapm_context *dapm, template.name); w = snd_soc_dapm_new_control_unlocked(dapm, &template); - if (IS_ERR(w)) { - int ret = PTR_ERR(w); - - /* Do not nag about probe deferrals */ - if (ret != -EPROBE_DEFER) - dev_err(dapm->dev, - "ASoC: Failed to create %s widget (%d)\n", - dai->driver->playback.stream_name, ret); - return ret; - } - if (!w) { - dev_err(dapm->dev, "ASoC: Failed to create %s widget\n", - dai->driver->playback.stream_name); - return -ENOMEM; - } + if (IS_ERR(w)) + return PTR_ERR(w); w->priv = dai; dai->playback_widget = w; @@ -4032,21 +3996,8 @@ int snd_soc_dapm_new_dai_widgets(struct snd_soc_dapm_context *dapm, template.name); w = snd_soc_dapm_new_control_unlocked(dapm, &template); - if (IS_ERR(w)) { - int ret = PTR_ERR(w); - - /* Do not nag about probe deferrals */ - if (ret != -EPROBE_DEFER) - dev_err(dapm->dev, - "ASoC: Failed to create %s widget (%d)\n", - dai->driver->playback.stream_name, ret); - return ret; - } - if (!w) { - dev_err(dapm->dev, "ASoC: Failed to create %s widget\n", - dai->driver->capture.stream_name); - return -ENOMEM; - } + if (IS_ERR(w)) + return PTR_ERR(w); w->priv = dai; dai->capture_widget = w; @@ -4115,34 +4066,79 @@ static void dapm_connect_dai_link_widgets(struct snd_soc_card *card, struct snd_soc_pcm_runtime *rtd) { struct snd_soc_dai *cpu_dai = rtd->cpu_dai; - struct snd_soc_dapm_widget *sink, *source; + struct snd_soc_dai *codec_dai; + struct snd_soc_dapm_widget *playback = NULL, *capture = NULL; + struct snd_soc_dapm_widget *codec, *playback_cpu, *capture_cpu; int i; - for (i = 0; i < rtd->num_codecs; i++) { - struct snd_soc_dai *codec_dai = rtd->codec_dais[i]; + if (rtd->dai_link->params) { + playback_cpu = cpu_dai->capture_widget; + capture_cpu = cpu_dai->playback_widget; + } else { + playback = cpu_dai->playback_widget; + capture = cpu_dai->capture_widget; + playback_cpu = playback; + capture_cpu = capture; + } + + for_each_rtd_codec_dai(rtd, i, codec_dai) { /* connect BE DAI playback if widgets are valid */ - if (codec_dai->playback_widget && cpu_dai->playback_widget) { - source = cpu_dai->playback_widget; - sink = codec_dai->playback_widget; + codec = codec_dai->playback_widget; + + if (playback_cpu && codec) { + if (!playback) { + playback = snd_soc_dapm_new_dai(card, rtd, + playback_cpu, + codec); + if (IS_ERR(playback)) { + dev_err(rtd->dev, + "ASoC: Failed to create DAI %s: %ld\n", + codec_dai->name, + PTR_ERR(playback)); + continue; + } + + snd_soc_dapm_add_path(&card->dapm, playback_cpu, + playback, NULL, NULL); + } + dev_dbg(rtd->dev, "connected DAI link %s:%s -> %s:%s\n", - cpu_dai->component->name, source->name, - codec_dai->component->name, sink->name); + cpu_dai->component->name, playback_cpu->name, + codec_dai->component->name, codec->name); - snd_soc_dapm_add_path(&card->dapm, source, sink, - NULL, NULL); + snd_soc_dapm_add_path(&card->dapm, playback, codec, + NULL, NULL); } + } + for_each_rtd_codec_dai(rtd, i, codec_dai) { /* connect BE DAI capture if widgets are valid */ - if (codec_dai->capture_widget && cpu_dai->capture_widget) { - source = codec_dai->capture_widget; - sink = cpu_dai->capture_widget; + codec = codec_dai->capture_widget; + + if (codec && capture_cpu) { + if (!capture) { + capture = snd_soc_dapm_new_dai(card, rtd, + codec, + capture_cpu); + if (IS_ERR(capture)) { + dev_err(rtd->dev, + "ASoC: Failed to create DAI %s: %ld\n", + codec_dai->name, + PTR_ERR(capture)); + continue; + } + + snd_soc_dapm_add_path(&card->dapm, capture, + capture_cpu, NULL, NULL); + } + dev_dbg(rtd->dev, "connected DAI link %s:%s -> %s:%s\n", - codec_dai->component->name, source->name, - cpu_dai->component->name, sink->name); + codec_dai->component->name, codec->name, + cpu_dai->component->name, capture_cpu->name); - snd_soc_dapm_add_path(&card->dapm, source, sink, - NULL, NULL); + snd_soc_dapm_add_path(&card->dapm, codec, capture, + NULL, NULL); } } } @@ -4192,12 +4188,12 @@ void snd_soc_dapm_connect_dai_link_widgets(struct snd_soc_card *card) struct snd_soc_pcm_runtime *rtd; /* for each BE DAI link... */ - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { /* * dynamic FE links have no fixed DAI mapping. * CODEC<->CODEC links have no direct connection. */ - if (rtd->dai_link->dynamic || rtd->dai_link->params) + if (rtd->dai_link->dynamic) continue; dapm_connect_dai_link_widgets(card, rtd); @@ -4207,11 +4203,12 @@ void snd_soc_dapm_connect_dai_link_widgets(struct snd_soc_card *card) static void soc_dapm_stream_event(struct snd_soc_pcm_runtime *rtd, int stream, int event) { + struct snd_soc_dai *codec_dai; int i; soc_dapm_dai_stream_event(rtd->cpu_dai, stream, event); - for (i = 0; i < rtd->num_codecs; i++) - soc_dapm_dai_stream_event(rtd->codec_dais[i], stream, event); + for_each_rtd_codec_dai(rtd, i, codec_dai) + soc_dapm_dai_stream_event(codec_dai, stream, event); dapm_power_widgets(rtd->card, event); } diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index 592efb370c44..f4dc3d445aae 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -373,7 +373,7 @@ int snd_soc_get_volsw_sx(struct snd_kcontrol *kcontrol, unsigned int rshift = mc->rshift; int max = mc->max; int min = mc->min; - unsigned int mask = (1 << (fls(min + max) - 1)) - 1; + unsigned int mask = (1U << (fls(min + max) - 1)) - 1; unsigned int val; int ret; @@ -418,7 +418,7 @@ int snd_soc_put_volsw_sx(struct snd_kcontrol *kcontrol, unsigned int rshift = mc->rshift; int max = mc->max; int min = mc->min; - unsigned int mask = (1 << (fls(min + max) - 1)) - 1; + unsigned int mask = (1U << (fls(min + max) - 1)) - 1; int err = 0; unsigned int val, val_mask, val2 = 0; diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index e8b98bfd4cf1..03f36e534050 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -59,25 +59,26 @@ static bool snd_soc_dai_stream_valid(struct snd_soc_dai *dai, int stream) void snd_soc_runtime_activate(struct snd_soc_pcm_runtime *rtd, int stream) { struct snd_soc_dai *cpu_dai = rtd->cpu_dai; + struct snd_soc_dai *codec_dai; int i; lockdep_assert_held(&rtd->pcm_mutex); if (stream == SNDRV_PCM_STREAM_PLAYBACK) { cpu_dai->playback_active++; - for (i = 0; i < rtd->num_codecs; i++) - rtd->codec_dais[i]->playback_active++; + for_each_rtd_codec_dai(rtd, i, codec_dai) + codec_dai->playback_active++; } else { cpu_dai->capture_active++; - for (i = 0; i < rtd->num_codecs; i++) - rtd->codec_dais[i]->capture_active++; + for_each_rtd_codec_dai(rtd, i, codec_dai) + codec_dai->capture_active++; } cpu_dai->active++; cpu_dai->component->active++; - for (i = 0; i < rtd->num_codecs; i++) { - rtd->codec_dais[i]->active++; - rtd->codec_dais[i]->component->active++; + for_each_rtd_codec_dai(rtd, i, codec_dai) { + codec_dai->active++; + codec_dai->component->active++; } } @@ -94,25 +95,26 @@ void snd_soc_runtime_activate(struct snd_soc_pcm_runtime *rtd, int stream) void snd_soc_runtime_deactivate(struct snd_soc_pcm_runtime *rtd, int stream) { struct snd_soc_dai *cpu_dai = rtd->cpu_dai; + struct snd_soc_dai *codec_dai; int i; lockdep_assert_held(&rtd->pcm_mutex); if (stream == SNDRV_PCM_STREAM_PLAYBACK) { cpu_dai->playback_active--; - for (i = 0; i < rtd->num_codecs; i++) - rtd->codec_dais[i]->playback_active--; + for_each_rtd_codec_dai(rtd, i, codec_dai) + codec_dai->playback_active--; } else { cpu_dai->capture_active--; - for (i = 0; i < rtd->num_codecs; i++) - rtd->codec_dais[i]->capture_active--; + for_each_rtd_codec_dai(rtd, i, codec_dai) + codec_dai->capture_active--; } cpu_dai->active--; cpu_dai->component->active--; - for (i = 0; i < rtd->num_codecs; i++) { - rtd->codec_dais[i]->component->active--; - rtd->codec_dais[i]->active--; + for_each_rtd_codec_dai(rtd, i, codec_dai) { + codec_dai->component->active--; + codec_dai->active--; } } @@ -172,7 +174,7 @@ int dpcm_dapm_stream_event(struct snd_soc_pcm_runtime *fe, int dir, { struct snd_soc_dpcm *dpcm; - list_for_each_entry(dpcm, &fe->dpcm[dir].be_clients, list_be) { + for_each_dpcm_be(fe, dir, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; @@ -253,6 +255,7 @@ static int soc_pcm_params_symmetry(struct snd_pcm_substream *substream, { struct snd_soc_pcm_runtime *rtd = substream->private_data; struct snd_soc_dai *cpu_dai = rtd->cpu_dai; + struct snd_soc_dai *codec_dai; unsigned int rate, channels, sample_bits, symmetry, i; rate = params_rate(params); @@ -263,8 +266,8 @@ static int soc_pcm_params_symmetry(struct snd_pcm_substream *substream, symmetry = cpu_dai->driver->symmetric_rates || rtd->dai_link->symmetric_rates; - for (i = 0; i < rtd->num_codecs; i++) - symmetry |= rtd->codec_dais[i]->driver->symmetric_rates; + for_each_rtd_codec_dai(rtd, i, codec_dai) + symmetry |= codec_dai->driver->symmetric_rates; if (symmetry && cpu_dai->rate && cpu_dai->rate != rate) { dev_err(rtd->dev, "ASoC: unmatched rate symmetry: %d - %d\n", @@ -275,8 +278,8 @@ static int soc_pcm_params_symmetry(struct snd_pcm_substream *substream, symmetry = cpu_dai->driver->symmetric_channels || rtd->dai_link->symmetric_channels; - for (i = 0; i < rtd->num_codecs; i++) - symmetry |= rtd->codec_dais[i]->driver->symmetric_channels; + for_each_rtd_codec_dai(rtd, i, codec_dai) + symmetry |= codec_dai->driver->symmetric_channels; if (symmetry && cpu_dai->channels && cpu_dai->channels != channels) { dev_err(rtd->dev, "ASoC: unmatched channel symmetry: %d - %d\n", @@ -287,8 +290,8 @@ static int soc_pcm_params_symmetry(struct snd_pcm_substream *substream, symmetry = cpu_dai->driver->symmetric_samplebits || rtd->dai_link->symmetric_samplebits; - for (i = 0; i < rtd->num_codecs; i++) - symmetry |= rtd->codec_dais[i]->driver->symmetric_samplebits; + for_each_rtd_codec_dai(rtd, i, codec_dai) + symmetry |= codec_dai->driver->symmetric_samplebits; if (symmetry && cpu_dai->sample_bits && cpu_dai->sample_bits != sample_bits) { dev_err(rtd->dev, "ASoC: unmatched sample bits symmetry: %d - %d\n", @@ -304,17 +307,18 @@ static bool soc_pcm_has_symmetry(struct snd_pcm_substream *substream) struct snd_soc_pcm_runtime *rtd = substream->private_data; struct snd_soc_dai_driver *cpu_driver = rtd->cpu_dai->driver; struct snd_soc_dai_link *link = rtd->dai_link; + struct snd_soc_dai *codec_dai; unsigned int symmetry, i; symmetry = cpu_driver->symmetric_rates || link->symmetric_rates || cpu_driver->symmetric_channels || link->symmetric_channels || cpu_driver->symmetric_samplebits || link->symmetric_samplebits; - for (i = 0; i < rtd->num_codecs; i++) + for_each_rtd_codec_dai(rtd, i, codec_dai) symmetry = symmetry || - rtd->codec_dais[i]->driver->symmetric_rates || - rtd->codec_dais[i]->driver->symmetric_channels || - rtd->codec_dais[i]->driver->symmetric_samplebits; + codec_dai->driver->symmetric_rates || + codec_dai->driver->symmetric_channels || + codec_dai->driver->symmetric_samplebits; return symmetry; } @@ -342,8 +346,7 @@ static void soc_pcm_apply_msb(struct snd_pcm_substream *substream) unsigned int bits = 0, cpu_bits; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { - for (i = 0; i < rtd->num_codecs; i++) { - codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { if (codec_dai->driver->playback.sig_bits == 0) { bits = 0; break; @@ -352,8 +355,7 @@ static void soc_pcm_apply_msb(struct snd_pcm_substream *substream) } cpu_bits = cpu_dai->driver->playback.sig_bits; } else { - for (i = 0; i < rtd->num_codecs; i++) { - codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { if (codec_dai->driver->capture.sig_bits == 0) { bits = 0; break; @@ -372,6 +374,7 @@ static void soc_pcm_init_runtime_hw(struct snd_pcm_substream *substream) struct snd_pcm_runtime *runtime = substream->runtime; struct snd_pcm_hardware *hw = &runtime->hw; struct snd_soc_pcm_runtime *rtd = substream->private_data; + struct snd_soc_dai *codec_dai; struct snd_soc_dai_driver *cpu_dai_drv = rtd->cpu_dai->driver; struct snd_soc_dai_driver *codec_dai_drv; struct snd_soc_pcm_stream *codec_stream; @@ -388,7 +391,7 @@ static void soc_pcm_init_runtime_hw(struct snd_pcm_substream *substream) cpu_stream = &cpu_dai_drv->capture; /* first calculate min/max only for CODECs in the DAI link */ - for (i = 0; i < rtd->num_codecs; i++) { + for_each_rtd_codec_dai(rtd, i, codec_dai) { /* * Skip CODECs which don't support the current stream type. @@ -399,11 +402,11 @@ static void soc_pcm_init_runtime_hw(struct snd_pcm_substream *substream) * bailed out on a higher level, since there would be no * CODEC to support the transfer direction in that case. */ - if (!snd_soc_dai_stream_valid(rtd->codec_dais[i], + if (!snd_soc_dai_stream_valid(codec_dai, substream->stream)) continue; - codec_dai_drv = rtd->codec_dais[i]->driver; + codec_dai_drv = codec_dai->driver; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) codec_stream = &codec_dai_drv->playback; else @@ -482,8 +485,8 @@ static int soc_pcm_open(struct snd_pcm_substream *substream) int i, ret = 0; pinctrl_pm_select_default_state(cpu_dai->dev); - for (i = 0; i < rtd->num_codecs; i++) - pinctrl_pm_select_default_state(rtd->codec_dais[i]->dev); + for_each_rtd_codec_dai(rtd, i, codec_dai) + pinctrl_pm_select_default_state(codec_dai->dev); for_each_rtdcom(rtd, rtdcom) { component = rtdcom->component; @@ -520,8 +523,7 @@ static int soc_pcm_open(struct snd_pcm_substream *substream) } component = NULL; - for (i = 0; i < rtd->num_codecs; i++) { - codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { if (codec_dai->driver->ops->startup) { ret = codec_dai->driver->ops->startup(substream, codec_dai); @@ -588,10 +590,9 @@ static int soc_pcm_open(struct snd_pcm_substream *substream) goto config_err; } - for (i = 0; i < rtd->num_codecs; i++) { - if (rtd->codec_dais[i]->active) { - ret = soc_pcm_apply_symmetry(substream, - rtd->codec_dais[i]); + for_each_rtd_codec_dai(rtd, i, codec_dai) { + if (codec_dai->active) { + ret = soc_pcm_apply_symmetry(substream, codec_dai); if (ret != 0) goto config_err; } @@ -620,8 +621,7 @@ machine_err: i = rtd->num_codecs; codec_dai_err: - while (--i >= 0) { - codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai_rollback(rtd, i, codec_dai) { if (codec_dai->driver->ops->shutdown) codec_dai->driver->ops->shutdown(substream, codec_dai); } @@ -641,9 +641,9 @@ out: pm_runtime_put_autosuspend(component->dev); } - for (i = 0; i < rtd->num_codecs; i++) { - if (!rtd->codec_dais[i]->active) - pinctrl_pm_select_sleep_state(rtd->codec_dais[i]->dev); + for_each_rtd_codec_dai(rtd, i, codec_dai) { + if (!codec_dai->active) + pinctrl_pm_select_sleep_state(codec_dai->dev); } if (!cpu_dai->active) pinctrl_pm_select_sleep_state(cpu_dai->dev); @@ -701,8 +701,7 @@ static int soc_pcm_close(struct snd_pcm_substream *substream) if (!cpu_dai->active) cpu_dai->rate = 0; - for (i = 0; i < rtd->num_codecs; i++) { - codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { if (!codec_dai->active) codec_dai->rate = 0; } @@ -712,8 +711,7 @@ static int soc_pcm_close(struct snd_pcm_substream *substream) if (cpu_dai->driver->ops->shutdown) cpu_dai->driver->ops->shutdown(substream, cpu_dai); - for (i = 0; i < rtd->num_codecs; i++) { - codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { if (codec_dai->driver->ops->shutdown) codec_dai->driver->ops->shutdown(substream, codec_dai); } @@ -751,9 +749,9 @@ static int soc_pcm_close(struct snd_pcm_substream *substream) pm_runtime_put_autosuspend(component->dev); } - for (i = 0; i < rtd->num_codecs; i++) { - if (!rtd->codec_dais[i]->active) - pinctrl_pm_select_sleep_state(rtd->codec_dais[i]->dev); + for_each_rtd_codec_dai(rtd, i, codec_dai) { + if (!codec_dai->active) + pinctrl_pm_select_sleep_state(codec_dai->dev); } if (!cpu_dai->active) pinctrl_pm_select_sleep_state(cpu_dai->dev); @@ -801,8 +799,7 @@ static int soc_pcm_prepare(struct snd_pcm_substream *substream) } } - for (i = 0; i < rtd->num_codecs; i++) { - codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { if (codec_dai->driver->ops->prepare) { ret = codec_dai->driver->ops->prepare(substream, codec_dai); @@ -834,8 +831,8 @@ static int soc_pcm_prepare(struct snd_pcm_substream *substream) snd_soc_dapm_stream_event(rtd, substream->stream, SND_SOC_DAPM_STREAM_START); - for (i = 0; i < rtd->num_codecs; i++) - snd_soc_dai_digital_mute(rtd->codec_dais[i], 0, + for_each_rtd_codec_dai(rtd, i, codec_dai) + snd_soc_dai_digital_mute(codec_dai, 0, substream->stream); snd_soc_dai_digital_mute(cpu_dai, 0, substream->stream); @@ -920,6 +917,7 @@ static int soc_pcm_hw_params(struct snd_pcm_substream *substream, struct snd_soc_component *component; struct snd_soc_rtdcom_list *rtdcom; struct snd_soc_dai *cpu_dai = rtd->cpu_dai; + struct snd_soc_dai *codec_dai; int i, ret = 0; mutex_lock_nested(&rtd->pcm_mutex, rtd->pcm_subclass); @@ -932,8 +930,7 @@ static int soc_pcm_hw_params(struct snd_pcm_substream *substream, } } - for (i = 0; i < rtd->num_codecs; i++) { - struct snd_soc_dai *codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { struct snd_pcm_hw_params codec_params; /* @@ -1018,8 +1015,7 @@ interface_err: i = rtd->num_codecs; codec_err: - while (--i >= 0) { - struct snd_soc_dai *codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai_rollback(rtd, i, codec_dai) { if (codec_dai->driver->ops->hw_free) codec_dai->driver->ops->hw_free(substream, codec_dai); codec_dai->rate = 0; @@ -1052,8 +1048,7 @@ static int soc_pcm_hw_free(struct snd_pcm_substream *substream) cpu_dai->sample_bits = 0; } - for (i = 0; i < rtd->num_codecs; i++) { - codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { if (codec_dai->active == 1) { codec_dai->rate = 0; codec_dai->channels = 0; @@ -1062,10 +1057,10 @@ static int soc_pcm_hw_free(struct snd_pcm_substream *substream) } /* apply codec digital mute */ - for (i = 0; i < rtd->num_codecs; i++) { - if ((playback && rtd->codec_dais[i]->playback_active == 1) || - (!playback && rtd->codec_dais[i]->capture_active == 1)) - snd_soc_dai_digital_mute(rtd->codec_dais[i], 1, + for_each_rtd_codec_dai(rtd, i, codec_dai) { + if ((playback && codec_dai->playback_active == 1) || + (!playback && codec_dai->capture_active == 1)) + snd_soc_dai_digital_mute(codec_dai, 1, substream->stream); } @@ -1077,8 +1072,7 @@ static int soc_pcm_hw_free(struct snd_pcm_substream *substream) soc_pcm_components_hw_free(substream, NULL); /* now free hw params for the DAIs */ - for (i = 0; i < rtd->num_codecs; i++) { - codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { if (codec_dai->driver->ops->hw_free) codec_dai->driver->ops->hw_free(substream, codec_dai); } @@ -1099,8 +1093,7 @@ static int soc_pcm_trigger(struct snd_pcm_substream *substream, int cmd) struct snd_soc_dai *codec_dai; int i, ret; - for (i = 0; i < rtd->num_codecs; i++) { - codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { if (codec_dai->driver->ops->trigger) { ret = codec_dai->driver->ops->trigger(substream, cmd, codec_dai); @@ -1144,8 +1137,7 @@ static int soc_pcm_bespoke_trigger(struct snd_pcm_substream *substream, struct snd_soc_dai *codec_dai; int i, ret; - for (i = 0; i < rtd->num_codecs; i++) { - codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { if (codec_dai->driver->ops->bespoke_trigger) { ret = codec_dai->driver->ops->bespoke_trigger(substream, cmd, codec_dai); @@ -1199,8 +1191,7 @@ static snd_pcm_uframes_t soc_pcm_pointer(struct snd_pcm_substream *substream) if (cpu_dai->driver->ops->delay) delay += cpu_dai->driver->ops->delay(substream, cpu_dai); - for (i = 0; i < rtd->num_codecs; i++) { - codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { if (codec_dai->driver->ops->delay) codec_delay = max(codec_delay, codec_dai->driver->ops->delay(substream, @@ -1220,7 +1211,7 @@ static int dpcm_be_connect(struct snd_soc_pcm_runtime *fe, struct snd_soc_dpcm *dpcm; /* only add new dpcms */ - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { if (dpcm->be == be && dpcm->fe == fe) return 0; } @@ -1261,7 +1252,7 @@ static void dpcm_be_reparent(struct snd_soc_pcm_runtime *fe, be_substream = snd_soc_dpcm_get_substream(be, stream); - list_for_each_entry(dpcm, &be->dpcm[stream].fe_clients, list_fe) { + for_each_dpcm_fe(be, stream, dpcm) { if (dpcm->fe == fe) continue; @@ -1281,7 +1272,7 @@ void dpcm_be_disconnect(struct snd_soc_pcm_runtime *fe, int stream) { struct snd_soc_dpcm *dpcm, *d; - list_for_each_entry_safe(dpcm, d, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be_safe(fe, stream, dpcm, d) { dev_dbg(fe->dev, "ASoC: BE %s disconnect check for %s\n", stream ? "capture" : "playback", dpcm->be->dai_link->name); @@ -1310,12 +1301,13 @@ static struct snd_soc_pcm_runtime *dpcm_get_be(struct snd_soc_card *card, struct snd_soc_dapm_widget *widget, int stream) { struct snd_soc_pcm_runtime *be; + struct snd_soc_dai *dai; int i; dev_dbg(card->dev, "ASoC: find BE for widget %s\n", widget->name); if (stream == SNDRV_PCM_STREAM_PLAYBACK) { - list_for_each_entry(be, &card->rtd_list, list) { + for_each_card_rtds(card, be) { if (!be->dai_link->no_pcm) continue; @@ -1327,15 +1319,14 @@ static struct snd_soc_pcm_runtime *dpcm_get_be(struct snd_soc_card *card, if (be->cpu_dai->playback_widget == widget) return be; - for (i = 0; i < be->num_codecs; i++) { - struct snd_soc_dai *dai = be->codec_dais[i]; + for_each_rtd_codec_dai(be, i, dai) { if (dai->playback_widget == widget) return be; } } } else { - list_for_each_entry(be, &card->rtd_list, list) { + for_each_card_rtds(card, be) { if (!be->dai_link->no_pcm) continue; @@ -1347,8 +1338,7 @@ static struct snd_soc_pcm_runtime *dpcm_get_be(struct snd_soc_card *card, if (be->cpu_dai->capture_widget == widget) return be; - for (i = 0; i < be->num_codecs; i++) { - struct snd_soc_dai *dai = be->codec_dais[i]; + for_each_rtd_codec_dai(be, i, dai) { if (dai->capture_widget == widget) return be; } @@ -1388,32 +1378,31 @@ static bool dpcm_end_walk_at_be(struct snd_soc_dapm_widget *widget, { struct snd_soc_card *card = widget->dapm->card; struct snd_soc_pcm_runtime *rtd; + struct snd_soc_dai *dai; int i; if (dir == SND_SOC_DAPM_DIR_OUT) { - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { if (!rtd->dai_link->no_pcm) continue; if (rtd->cpu_dai->playback_widget == widget) return true; - for (i = 0; i < rtd->num_codecs; ++i) { - struct snd_soc_dai *dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, dai) { if (dai->playback_widget == widget) return true; } } } else { /* SND_SOC_DAPM_DIR_IN */ - list_for_each_entry(rtd, &card->rtd_list, list) { + for_each_card_rtds(card, rtd) { if (!rtd->dai_link->no_pcm) continue; if (rtd->cpu_dai->capture_widget == widget) return true; - for (i = 0; i < rtd->num_codecs; ++i) { - struct snd_soc_dai *dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, dai) { if (dai->capture_widget == widget) return true; } @@ -1445,10 +1434,11 @@ static int dpcm_prune_paths(struct snd_soc_pcm_runtime *fe, int stream, struct snd_soc_dpcm *dpcm; struct snd_soc_dapm_widget_list *list = *list_; struct snd_soc_dapm_widget *widget; + struct snd_soc_dai *dai; int prune = 0; /* Destroy any old FE <--> BE connections */ - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { unsigned int i; /* is there a valid CPU DAI widget for this BE */ @@ -1459,8 +1449,7 @@ static int dpcm_prune_paths(struct snd_soc_pcm_runtime *fe, int stream, continue; /* is there a valid CODEC DAI widget for this BE */ - for (i = 0; i < dpcm->be->num_codecs; i++) { - struct snd_soc_dai *dai = dpcm->be->codec_dais[i]; + for_each_rtd_codec_dai(dpcm->be, i, dai) { widget = dai_get_widget(dai, stream); /* prune the BE if it's no longer in our active list */ @@ -1555,7 +1544,7 @@ void dpcm_clear_pending_state(struct snd_soc_pcm_runtime *fe, int stream) { struct snd_soc_dpcm *dpcm; - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) + for_each_dpcm_be(fe, stream, dpcm) dpcm->be->dpcm[stream].runtime_update = SND_SOC_DPCM_UPDATE_NO; } @@ -1566,7 +1555,7 @@ static void dpcm_be_dai_startup_unwind(struct snd_soc_pcm_runtime *fe, struct snd_soc_dpcm *dpcm; /* disable any enabled and non active backends */ - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; struct snd_pcm_substream *be_substream = @@ -1595,7 +1584,7 @@ int dpcm_be_dai_startup(struct snd_soc_pcm_runtime *fe, int stream) int err, count = 0; /* only startup BE DAIs that are either sinks or sources to this FE DAI */ - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; struct snd_pcm_substream *be_substream = @@ -1649,7 +1638,7 @@ int dpcm_be_dai_startup(struct snd_soc_pcm_runtime *fe, int stream) unwind: /* disable any enabled and non active backends */ - list_for_each_entry_continue_reverse(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be_rollback(fe, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; struct snd_pcm_substream *be_substream = snd_soc_dpcm_get_substream(be, stream); @@ -1680,7 +1669,7 @@ static void dpcm_init_runtime_hw(struct snd_pcm_runtime *runtime, struct snd_soc_pcm_stream *stream) { runtime->hw.rate_min = stream->rate_min; - runtime->hw.rate_max = stream->rate_max; + runtime->hw.rate_max = min_not_zero(stream->rate_max, UINT_MAX); runtime->hw.channels_min = stream->channels_min; runtime->hw.channels_max = stream->channels_max; if (runtime->hw.formats) @@ -1695,6 +1684,7 @@ static void dpcm_runtime_merge_format(struct snd_pcm_substream *substream, { struct snd_soc_pcm_runtime *fe = substream->private_data; struct snd_soc_dpcm *dpcm; + struct snd_soc_dai *dai; int stream = substream->stream; if (!fe->dai_link->dpcm_merged_format) @@ -1705,22 +1695,21 @@ static void dpcm_runtime_merge_format(struct snd_pcm_substream *substream, * if FE want to use it (= dpcm_merged_format) */ - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; struct snd_soc_dai_driver *codec_dai_drv; struct snd_soc_pcm_stream *codec_stream; int i; - for (i = 0; i < be->num_codecs; i++) { + for_each_rtd_codec_dai(be, i, dai) { /* * Skip CODECs which don't support the current stream * type. See soc_pcm_init_runtime_hw() for more details */ - if (!snd_soc_dai_stream_valid(be->codec_dais[i], - stream)) + if (!snd_soc_dai_stream_valid(dai, stream)) continue; - codec_dai_drv = be->codec_dais[i]->driver; + codec_dai_drv = dai->driver; if (stream == SNDRV_PCM_STREAM_PLAYBACK) codec_stream = &codec_dai_drv->playback; else @@ -1747,7 +1736,7 @@ static void dpcm_runtime_merge_chan(struct snd_pcm_substream *substream, * if FE want to use it (= dpcm_merged_chan) */ - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; struct snd_soc_dai_driver *cpu_dai_drv = be->cpu_dai->driver; struct snd_soc_dai_driver *codec_dai_drv; @@ -1799,12 +1788,13 @@ static void dpcm_runtime_merge_rate(struct snd_pcm_substream *substream, * if FE want to use it (= dpcm_merged_chan) */ - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; struct snd_soc_dai_driver *cpu_dai_drv = be->cpu_dai->driver; struct snd_soc_dai_driver *codec_dai_drv; struct snd_soc_pcm_stream *codec_stream; struct snd_soc_pcm_stream *cpu_stream; + struct snd_soc_dai *dai; int i; if (stream == SNDRV_PCM_STREAM_PLAYBACK) @@ -1816,16 +1806,15 @@ static void dpcm_runtime_merge_rate(struct snd_pcm_substream *substream, *rate_max = min_not_zero(*rate_max, cpu_stream->rate_max); *rates = snd_pcm_rate_mask_intersect(*rates, cpu_stream->rates); - for (i = 0; i < be->num_codecs; i++) { + for_each_rtd_codec_dai(be, i, dai) { /* * Skip CODECs which don't support the current stream * type. See soc_pcm_init_runtime_hw() for more details */ - if (!snd_soc_dai_stream_valid(be->codec_dais[i], - stream)) + if (!snd_soc_dai_stream_valid(dai, stream)) continue; - codec_dai_drv = be->codec_dais[i]->driver; + codec_dai_drv = dai->driver; if (stream == SNDRV_PCM_STREAM_PLAYBACK) codec_stream = &codec_dai_drv->playback; else @@ -1902,11 +1891,12 @@ static int dpcm_apply_symmetry(struct snd_pcm_substream *fe_substream, } /* apply symmetry for BE */ - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; struct snd_pcm_substream *be_substream = snd_soc_dpcm_get_substream(be, stream); struct snd_soc_pcm_runtime *rtd = be_substream->private_data; + struct snd_soc_dai *codec_dai; int i; if (rtd->dai_link->be_hw_params_fixup) @@ -1923,10 +1913,10 @@ static int dpcm_apply_symmetry(struct snd_pcm_substream *fe_substream, return err; } - for (i = 0; i < rtd->num_codecs; i++) { - if (rtd->codec_dais[i]->active) { + for_each_rtd_codec_dai(rtd, i, codec_dai) { + if (codec_dai->active) { err = soc_pcm_apply_symmetry(fe_substream, - rtd->codec_dais[i]); + codec_dai); if (err < 0) return err; } @@ -1986,7 +1976,7 @@ int dpcm_be_dai_shutdown(struct snd_soc_pcm_runtime *fe, int stream) struct snd_soc_dpcm *dpcm; /* only shutdown BEs that are either sinks or sources to this FE DAI */ - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; struct snd_pcm_substream *be_substream = @@ -2050,7 +2040,7 @@ int dpcm_be_dai_hw_free(struct snd_soc_pcm_runtime *fe, int stream) /* only hw_params backends that are either sinks or sources * to this frontend DAI */ - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; struct snd_pcm_substream *be_substream = @@ -2119,7 +2109,7 @@ int dpcm_be_dai_hw_params(struct snd_soc_pcm_runtime *fe, int stream) struct snd_soc_dpcm *dpcm; int ret; - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; struct snd_pcm_substream *be_substream = @@ -2170,7 +2160,7 @@ int dpcm_be_dai_hw_params(struct snd_soc_pcm_runtime *fe, int stream) unwind: /* disable any enabled and non active backends */ - list_for_each_entry_continue_reverse(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be_rollback(fe, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; struct snd_pcm_substream *be_substream = snd_soc_dpcm_get_substream(be, stream); @@ -2250,7 +2240,7 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream, struct snd_soc_dpcm *dpcm; int ret = 0; - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; struct snd_pcm_substream *be_substream = @@ -2436,7 +2426,7 @@ int dpcm_be_dai_prepare(struct snd_soc_pcm_runtime *fe, int stream) struct snd_soc_dpcm *dpcm; int ret = 0; - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; struct snd_pcm_substream *be_substream = @@ -2646,7 +2636,7 @@ close: dpcm_be_dai_shutdown(fe, stream); disconnect: /* disconnect any non started BEs */ - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; if (be->dpcm[stream].state != SND_SOC_DPCM_STATE_START) dpcm->state = SND_SOC_DPCM_LINK_STATE_FREE; @@ -2771,14 +2761,14 @@ int soc_dpcm_runtime_update(struct snd_soc_card *card) mutex_lock_nested(&card->mutex, SND_SOC_CARD_CLASS_RUNTIME); /* shutdown all old paths first */ - list_for_each_entry(fe, &card->rtd_list, list) { + for_each_card_rtds(card, fe) { ret = soc_dpcm_fe_runtime_update(fe, 0); if (ret) goto out; } /* bring new paths up */ - list_for_each_entry(fe, &card->rtd_list, list) { + for_each_card_rtds(card, fe) { ret = soc_dpcm_fe_runtime_update(fe, 1); if (ret) goto out; @@ -2791,10 +2781,9 @@ out: int soc_dpcm_be_digital_mute(struct snd_soc_pcm_runtime *fe, int mute) { struct snd_soc_dpcm *dpcm; - struct list_head *clients = - &fe->dpcm[SNDRV_PCM_STREAM_PLAYBACK].be_clients; + struct snd_soc_dai *dai; - list_for_each_entry(dpcm, clients, list_be) { + for_each_dpcm_be(fe, SNDRV_PCM_STREAM_PLAYBACK, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; int i; @@ -2802,8 +2791,7 @@ int soc_dpcm_be_digital_mute(struct snd_soc_pcm_runtime *fe, int mute) if (be->dai_link->ignore_suspend) continue; - for (i = 0; i < be->num_codecs; i++) { - struct snd_soc_dai *dai = be->codec_dais[i]; + for_each_rtd_codec_dai(be, i, dai) { struct snd_soc_dai_driver *drv = dai->driver; dev_dbg(be->dev, "ASoC: BE digital mute %s\n", @@ -2844,7 +2832,7 @@ static int dpcm_fe_dai_open(struct snd_pcm_substream *fe_substream) ret = dpcm_fe_dai_startup(fe_substream); if (ret < 0) { /* clean up all links */ - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) + for_each_dpcm_be(fe, stream, dpcm) dpcm->state = SND_SOC_DPCM_LINK_STATE_FREE; dpcm_be_disconnect(fe, stream); @@ -2867,7 +2855,7 @@ static int dpcm_fe_dai_close(struct snd_pcm_substream *fe_substream) ret = dpcm_fe_dai_shutdown(fe_substream); /* mark FE's links ready to prune */ - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) + for_each_dpcm_be(fe, stream, dpcm) dpcm->state = SND_SOC_DPCM_LINK_STATE_FREE; dpcm_be_disconnect(fe, stream); @@ -3041,8 +3029,7 @@ int soc_new_pcm(struct snd_soc_pcm_runtime *rtd, int num) playback = rtd->dai_link->dpcm_playback; capture = rtd->dai_link->dpcm_capture; } else { - for (i = 0; i < rtd->num_codecs; i++) { - codec_dai = rtd->codec_dais[i]; + for_each_rtd_codec_dai(rtd, i, codec_dai) { if (codec_dai->driver->playback.channels_min) playback = 1; if (codec_dai->driver->capture.channels_min) @@ -3230,7 +3217,7 @@ int snd_soc_dpcm_can_be_free_stop(struct snd_soc_pcm_runtime *fe, struct snd_soc_dpcm *dpcm; int state; - list_for_each_entry(dpcm, &be->dpcm[stream].fe_clients, list_fe) { + for_each_dpcm_fe(be, stream, dpcm) { if (dpcm->fe == fe) continue; @@ -3257,7 +3244,7 @@ int snd_soc_dpcm_can_be_params(struct snd_soc_pcm_runtime *fe, struct snd_soc_dpcm *dpcm; int state; - list_for_each_entry(dpcm, &be->dpcm[stream].fe_clients, list_fe) { + for_each_dpcm_fe(be, stream, dpcm) { if (dpcm->fe == fe) continue; @@ -3337,7 +3324,7 @@ static ssize_t dpcm_show_state(struct snd_soc_pcm_runtime *fe, goto out; } - list_for_each_entry(dpcm, &fe->dpcm[stream].be_clients, list_be) { + for_each_dpcm_be(fe, stream, dpcm) { struct snd_soc_pcm_runtime *be = dpcm->be; params = &dpcm->hw_params; diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 66e77e020745..045ef136903d 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -993,7 +993,7 @@ static int soc_tplg_denum_create(struct soc_tplg *tplg, unsigned int count, kfree(se); continue; } - /* fall through and create texts */ + /* fall through */ case SND_SOC_TPLG_CTL_ENUM: case SND_SOC_TPLG_DAPM_CTL_ENUM_DOUBLE: case SND_SOC_TPLG_DAPM_CTL_ENUM_VIRT: @@ -1310,7 +1310,7 @@ static struct snd_kcontrol_new *soc_tplg_dapm_widget_denum_create( ec->hdr.name); goto err_se; } - /* fall through to create texts */ + /* fall through */ case SND_SOC_TPLG_CTL_ENUM: case SND_SOC_TPLG_DAPM_CTL_ENUM_DOUBLE: case SND_SOC_TPLG_DAPM_CTL_ENUM_VIRT: @@ -1565,17 +1565,6 @@ widget: widget = snd_soc_dapm_new_control_unlocked(dapm, &template); if (IS_ERR(widget)) { ret = PTR_ERR(widget); - /* Do not nag about probe deferrals */ - if (ret != -EPROBE_DEFER) - dev_err(tplg->dev, - "ASoC: failed to create widget %s controls (%d)\n", - w->name, ret); - goto hdr_err; - } - if (widget == NULL) { - dev_err(tplg->dev, "ASoC: failed to create widget %s controls\n", - w->name); - ret = -ENOMEM; goto hdr_err; } diff --git a/sound/soc/soc-utils.c b/sound/soc/soc-utils.c index e0c93496c0cd..e3b9dd634c6d 100644 --- a/sound/soc/soc-utils.c +++ b/sound/soc/soc-utils.c @@ -273,13 +273,13 @@ static int dummy_dma_open(struct snd_pcm_substream *substream) return 0; } -static const struct snd_pcm_ops dummy_dma_ops = { +static const struct snd_pcm_ops snd_dummy_dma_ops = { .open = dummy_dma_open, .ioctl = snd_pcm_lib_ioctl, }; static const struct snd_soc_component_driver dummy_platform = { - .ops = &dummy_dma_ops, + .ops = &snd_dummy_dma_ops, }; static const struct snd_soc_component_driver dummy_codec = { diff --git a/sound/soc/stm/Kconfig b/sound/soc/stm/Kconfig index 9b2681397dba..c66ffa72057e 100644 --- a/sound/soc/stm/Kconfig +++ b/sound/soc/stm/Kconfig @@ -3,6 +3,7 @@ menu "STMicroelectronics STM32 SOC audio support" config SND_SOC_STM32_SAI tristate "STM32 SAI interface (Serial Audio Interface) support" depends on (ARCH_STM32 && OF) || COMPILE_TEST + depends on COMMON_CLK depends on SND_SOC select SND_SOC_GENERIC_DMAENGINE_PCM select REGMAP_MMIO diff --git a/sound/soc/stm/stm32_sai.c b/sound/soc/stm/stm32_sai.c index f22654253c43..d597eba61992 100644 --- a/sound/soc/stm/stm32_sai.c +++ b/sound/soc/stm/stm32_sai.c @@ -104,7 +104,7 @@ static int stm32_sai_set_sync(struct stm32_sai_data *sai_client, if (!pdev) { dev_err(&sai_client->pdev->dev, - "Device not found for node %s\n", np_provider->name); + "Device not found for node %pOFn\n", np_provider); return -ENODEV; } diff --git a/sound/soc/stm/stm32_sai.h b/sound/soc/stm/stm32_sai.h index f25422174909..08de899c766b 100644 --- a/sound/soc/stm/stm32_sai.h +++ b/sound/soc/stm/stm32_sai.h @@ -91,6 +91,9 @@ #define SAI_XCR1_OSR_SHIFT 26 #define SAI_XCR1_OSR BIT(SAI_XCR1_OSR_SHIFT) +#define SAI_XCR1_MCKEN_SHIFT 27 +#define SAI_XCR1_MCKEN BIT(SAI_XCR1_MCKEN_SHIFT) + /******************* Bit definition for SAI_XCR2 register *******************/ #define SAI_XCR2_FTH_SHIFT 0 #define SAI_XCR2_FTH_MASK GENMASK(2, SAI_XCR2_FTH_SHIFT) diff --git a/sound/soc/stm/stm32_sai_sub.c b/sound/soc/stm/stm32_sai_sub.c index 06fba9650ac4..ea05cc91aa05 100644 --- a/sound/soc/stm/stm32_sai_sub.c +++ b/sound/soc/stm/stm32_sai_sub.c @@ -17,6 +17,7 @@ */ #include +#include #include #include #include @@ -68,6 +69,8 @@ #define SAI_IEC60958_BLOCK_FRAMES 192 #define SAI_IEC60958_STATUS_BYTES 24 +#define SAI_MCLK_NAME_LEN 32 + /** * struct stm32_sai_sub_data - private data of SAI sub block (block A or B) * @pdev: device data pointer @@ -80,6 +83,7 @@ * @pdata: SAI block parent data pointer * @np_sync_provider: synchronization provider node * @sai_ck: kernel clock feeding the SAI clock generator + * @sai_mclk: master clock from SAI mclk provider * @phys_addr: SAI registers physical base address * @mclk_rate: SAI block master clock frequency (Hz). set at init * @id: SAI sub block id corresponding to sub-block A or B @@ -110,6 +114,7 @@ struct stm32_sai_sub_data { struct stm32_sai_data *pdata; struct device_node *np_sync_provider; struct clk *sai_ck; + struct clk *sai_mclk; dma_addr_t phys_addr; unsigned int mclk_rate; unsigned int id; @@ -251,6 +256,176 @@ static const struct snd_kcontrol_new iec958_ctls = { .put = snd_pcm_iec958_put, }; +struct stm32_sai_mclk_data { + struct clk_hw hw; + unsigned long freq; + struct stm32_sai_sub_data *sai_data; +}; + +#define to_mclk_data(_hw) container_of(_hw, struct stm32_sai_mclk_data, hw) +#define STM32_SAI_MAX_CLKS 1 + +static int stm32_sai_get_clk_div(struct stm32_sai_sub_data *sai, + unsigned long input_rate, + unsigned long output_rate) +{ + int version = sai->pdata->conf->version; + int div; + + div = DIV_ROUND_CLOSEST(input_rate, output_rate); + if (div > SAI_XCR1_MCKDIV_MAX(version)) { + dev_err(&sai->pdev->dev, "Divider %d out of range\n", div); + return -EINVAL; + } + dev_dbg(&sai->pdev->dev, "SAI divider %d\n", div); + + if (input_rate % div) + dev_dbg(&sai->pdev->dev, + "Rate not accurate. requested (%ld), actual (%ld)\n", + output_rate, input_rate / div); + + return div; +} + +static int stm32_sai_set_clk_div(struct stm32_sai_sub_data *sai, + unsigned int div) +{ + int version = sai->pdata->conf->version; + int ret, cr1, mask; + + if (div > SAI_XCR1_MCKDIV_MAX(version)) { + dev_err(&sai->pdev->dev, "Divider %d out of range\n", div); + return -EINVAL; + } + + mask = SAI_XCR1_MCKDIV_MASK(SAI_XCR1_MCKDIV_WIDTH(version)); + cr1 = SAI_XCR1_MCKDIV_SET(div); + ret = regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, mask, cr1); + if (ret < 0) + dev_err(&sai->pdev->dev, "Failed to update CR1 register\n"); + + return ret; +} + +static long stm32_sai_mclk_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *prate) +{ + struct stm32_sai_mclk_data *mclk = to_mclk_data(hw); + struct stm32_sai_sub_data *sai = mclk->sai_data; + int div; + + div = stm32_sai_get_clk_div(sai, *prate, rate); + if (div < 0) + return div; + + mclk->freq = *prate / div; + + return mclk->freq; +} + +static unsigned long stm32_sai_mclk_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct stm32_sai_mclk_data *mclk = to_mclk_data(hw); + + return mclk->freq; +} + +static int stm32_sai_mclk_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate) +{ + struct stm32_sai_mclk_data *mclk = to_mclk_data(hw); + struct stm32_sai_sub_data *sai = mclk->sai_data; + unsigned int div; + int ret; + + div = stm32_sai_get_clk_div(sai, parent_rate, rate); + if (div < 0) + return div; + + ret = stm32_sai_set_clk_div(sai, div); + if (ret) + return ret; + + mclk->freq = rate; + + return 0; +} + +static int stm32_sai_mclk_enable(struct clk_hw *hw) +{ + struct stm32_sai_mclk_data *mclk = to_mclk_data(hw); + struct stm32_sai_sub_data *sai = mclk->sai_data; + + dev_dbg(&sai->pdev->dev, "Enable master clock\n"); + + return regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, + SAI_XCR1_MCKEN, SAI_XCR1_MCKEN); +} + +static void stm32_sai_mclk_disable(struct clk_hw *hw) +{ + struct stm32_sai_mclk_data *mclk = to_mclk_data(hw); + struct stm32_sai_sub_data *sai = mclk->sai_data; + + dev_dbg(&sai->pdev->dev, "Disable master clock\n"); + + regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, SAI_XCR1_MCKEN, 0); +} + +static const struct clk_ops mclk_ops = { + .enable = stm32_sai_mclk_enable, + .disable = stm32_sai_mclk_disable, + .recalc_rate = stm32_sai_mclk_recalc_rate, + .round_rate = stm32_sai_mclk_round_rate, + .set_rate = stm32_sai_mclk_set_rate, +}; + +static int stm32_sai_add_mclk_provider(struct stm32_sai_sub_data *sai) +{ + struct clk_hw *hw; + struct stm32_sai_mclk_data *mclk; + struct device *dev = &sai->pdev->dev; + const char *pname = __clk_get_name(sai->sai_ck); + char *mclk_name, *p, *s = (char *)pname; + int ret, i = 0; + + mclk = devm_kzalloc(dev, sizeof(mclk), GFP_KERNEL); + if (!mclk) + return -ENOMEM; + + mclk_name = devm_kcalloc(dev, sizeof(char), + SAI_MCLK_NAME_LEN, GFP_KERNEL); + if (!mclk_name) + return -ENOMEM; + + /* + * Forge mclk clock name from parent clock name and suffix. + * String after "_" char is stripped in parent name. + */ + p = mclk_name; + while (*s && *s != '_' && (i < (SAI_MCLK_NAME_LEN - 7))) { + *p++ = *s++; + i++; + } + STM_SAI_IS_SUB_A(sai) ? strcat(p, "a_mclk") : strcat(p, "b_mclk"); + + mclk->hw.init = CLK_HW_INIT(mclk_name, pname, &mclk_ops, 0); + mclk->sai_data = sai; + hw = &mclk->hw; + + dev_dbg(dev, "Register master clock %s\n", mclk_name); + ret = devm_clk_hw_register(&sai->pdev->dev, hw); + if (ret) { + dev_err(dev, "mclk register returned %d\n", ret); + return ret; + } + sai->sai_mclk = hw->clk; + + /* register mclk provider */ + return devm_of_clk_add_hw_provider(dev, of_clk_hw_simple_get, hw); +} + static irqreturn_t stm32_sai_isr(int irq, void *devid) { struct stm32_sai_sub_data *sai = (struct stm32_sai_sub_data *)devid; @@ -312,15 +487,25 @@ static int stm32_sai_set_sysclk(struct snd_soc_dai *cpu_dai, struct stm32_sai_sub_data *sai = snd_soc_dai_get_drvdata(cpu_dai); int ret; - if ((dir == SND_SOC_CLOCK_OUT) && sai->master) { + if (dir == SND_SOC_CLOCK_OUT) { ret = regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, SAI_XCR1_NODIV, (unsigned int)~SAI_XCR1_NODIV); if (ret < 0) return ret; - sai->mclk_rate = freq; dev_dbg(cpu_dai->dev, "SAI MCLK frequency is %uHz\n", freq); + sai->mclk_rate = freq; + + if (sai->sai_mclk) { + ret = clk_set_rate_exclusive(sai->sai_mclk, + sai->mclk_rate); + if (ret) { + dev_err(cpu_dai->dev, + "Could not set mclk rate\n"); + return ret; + } + } } return 0; @@ -715,15 +900,9 @@ static int stm32_sai_configure_clock(struct snd_soc_dai *cpu_dai, { struct stm32_sai_sub_data *sai = snd_soc_dai_get_drvdata(cpu_dai); int cr1, mask, div = 0; - int sai_clk_rate, mclk_ratio, den, ret; - int version = sai->pdata->conf->version; + int sai_clk_rate, mclk_ratio, den; unsigned int rate = params_rate(params); - if (!sai->mclk_rate) { - dev_err(cpu_dai->dev, "Mclk rate is null\n"); - return -EINVAL; - } - if (!(rate % 11025)) clk_set_parent(sai->sai_ck, sai->pdata->clk_x11k); else @@ -731,14 +910,22 @@ static int stm32_sai_configure_clock(struct snd_soc_dai *cpu_dai, sai_clk_rate = clk_get_rate(sai->sai_ck); if (STM_SAI_IS_F4(sai->pdata)) { - /* - * mclk_rate = 256 * fs - * MCKDIV = 0 if sai_ck < 3/2 * mclk_rate - * MCKDIV = sai_ck / (2 * mclk_rate) otherwise + /* mclk on (NODIV=0) + * mclk_rate = 256 * fs + * MCKDIV = 0 if sai_ck < 3/2 * mclk_rate + * MCKDIV = sai_ck / (2 * mclk_rate) otherwise + * mclk off (NODIV=1) + * MCKDIV ignored. sck = sai_ck */ - if (2 * sai_clk_rate >= 3 * sai->mclk_rate) - div = DIV_ROUND_CLOSEST(sai_clk_rate, - 2 * sai->mclk_rate); + if (!sai->mclk_rate) + return 0; + + if (2 * sai_clk_rate >= 3 * sai->mclk_rate) { + div = stm32_sai_get_clk_div(sai, sai_clk_rate, + 2 * sai->mclk_rate); + if (div < 0) + return div; + } } else { /* * TDM mode : @@ -750,8 +937,10 @@ static int stm32_sai_configure_clock(struct snd_soc_dai *cpu_dai, * Note: NOMCK/NODIV correspond to same bit. */ if (STM_SAI_PROTOCOL_IS_SPDIF(sai)) { - div = DIV_ROUND_CLOSEST(sai_clk_rate, - (params_rate(params) * 128)); + div = stm32_sai_get_clk_div(sai, sai_clk_rate, + rate * 128); + if (div < 0) + return div; } else { if (sai->mclk_rate) { mclk_ratio = sai->mclk_rate / rate; @@ -764,31 +953,22 @@ static int stm32_sai_configure_clock(struct snd_soc_dai *cpu_dai, mclk_ratio); return -EINVAL; } - div = DIV_ROUND_CLOSEST(sai_clk_rate, - sai->mclk_rate); + div = stm32_sai_get_clk_div(sai, sai_clk_rate, + sai->mclk_rate); + if (div < 0) + return div; } else { /* mclk-fs not set, master clock not active */ den = sai->fs_length * params_rate(params); - div = DIV_ROUND_CLOSEST(sai_clk_rate, den); + div = stm32_sai_get_clk_div(sai, sai_clk_rate, + den); + if (div < 0) + return div; } } } - if (div > SAI_XCR1_MCKDIV_MAX(version)) { - dev_err(cpu_dai->dev, "Divider %d out of range\n", div); - return -EINVAL; - } - dev_dbg(cpu_dai->dev, "SAI clock %d, divider %d\n", sai_clk_rate, div); - - mask = SAI_XCR1_MCKDIV_MASK(SAI_XCR1_MCKDIV_WIDTH(version)); - cr1 = SAI_XCR1_MCKDIV_SET(div); - ret = regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, mask, cr1); - if (ret < 0) { - dev_err(cpu_dai->dev, "Failed to update CR1 register\n"); - return ret; - } - - return 0; + return stm32_sai_set_clk_div(sai, div); } static int stm32_sai_hw_params(struct snd_pcm_substream *substream, @@ -881,6 +1061,9 @@ static void stm32_sai_shutdown(struct snd_pcm_substream *substream, SAI_XCR1_NODIV); clk_disable_unprepare(sai->sai_ck); + + clk_rate_exclusive_put(sai->sai_mclk); + sai->substream = NULL; } @@ -903,6 +1086,8 @@ static int stm32_sai_dai_probe(struct snd_soc_dai *cpu_dai) struct stm32_sai_sub_data *sai = dev_get_drvdata(cpu_dai->dev); int cr1 = 0, cr1_mask; + sai->cpu_dai = cpu_dai; + sai->dma_params.addr = (dma_addr_t)(sai->phys_addr + STM_SAI_DR_REGX); /* * DMA supports 4, 8 or 16 burst sizes. Burst size 4 is the best choice, @@ -1124,16 +1309,15 @@ static int stm32_sai_sub_parse_of(struct platform_device *pdev, sai->sync = SAI_SYNC_NONE; if (args.np) { if (args.np == np) { - dev_err(&pdev->dev, "%s sync own reference\n", - np->name); + dev_err(&pdev->dev, "%pOFn sync own reference\n", np); of_node_put(args.np); return -EINVAL; } sai->np_sync_provider = of_get_parent(args.np); if (!sai->np_sync_provider) { - dev_err(&pdev->dev, "%s parent node not found\n", - np->name); + dev_err(&pdev->dev, "%pOFn parent node not found\n", + np); of_node_put(args.np); return -ENODEV; } @@ -1182,6 +1366,23 @@ static int stm32_sai_sub_parse_of(struct platform_device *pdev, return PTR_ERR(sai->sai_ck); } + if (STM_SAI_IS_F4(sai->pdata)) + return 0; + + /* Register mclk provider if requested */ + if (of_find_property(np, "#clock-cells", NULL)) { + ret = stm32_sai_add_mclk_provider(sai); + if (ret < 0) + return ret; + } else { + sai->sai_mclk = devm_clk_get(&pdev->dev, "MCLK"); + if (IS_ERR(sai->sai_mclk)) { + if (PTR_ERR(sai->sai_mclk) != -ENOENT) + return PTR_ERR(sai->sai_mclk); + sai->sai_mclk = NULL; + } + } + return 0; } diff --git a/sound/soc/sunxi/Kconfig b/sound/soc/sunxi/Kconfig index 22408bc2d6ec..66aad0d3f9c7 100644 --- a/sound/soc/sunxi/Kconfig +++ b/sound/soc/sunxi/Kconfig @@ -12,7 +12,7 @@ config SND_SUN4I_CODEC config SND_SUN8I_CODEC tristate "Allwinner SUN8I audio codec" depends on OF - depends on MACH_SUN8I || COMPILE_TEST + depends on MACH_SUN8I || (ARM64 && ARCH_SUNXI) || COMPILE_TEST select REGMAP_MMIO help This option enables the digital part of the internal audio codec for @@ -23,11 +23,19 @@ config SND_SUN8I_CODEC config SND_SUN8I_CODEC_ANALOG tristate "Allwinner sun8i Codec Analog Controls Support" depends on MACH_SUN8I || (ARM64 && ARCH_SUNXI) || COMPILE_TEST - select REGMAP + select SND_SUN8I_ADDA_PR_REGMAP help Say Y or M if you want to add support for the analog controls for the codec embedded in newer Allwinner SoCs. +config SND_SUN50I_CODEC_ANALOG + tristate "Allwinner sun50i Codec Analog Controls Support" + depends on (ARM64 && ARCH_SUNXI) || COMPILE_TEST + select SND_SUNXI_ADDA_PR_REGMAP + help + Say Y or M if you want to add support for the analog controls for + the codec embedded in Allwinner A64 SoC. + config SND_SUN4I_I2S tristate "Allwinner A10 I2S Support" select SND_SOC_GENERIC_DMAENGINE_PCM @@ -45,4 +53,9 @@ config SND_SUN4I_SPDIF help Say Y or M to add support for the S/PDIF audio block in the Allwinner A10 and affiliated SoCs. + +config SND_SUN8I_ADDA_PR_REGMAP + tristate + select REGMAP + endmenu diff --git a/sound/soc/sunxi/Makefile b/sound/soc/sunxi/Makefile index 4a9ef67386ca..a86be340a076 100644 --- a/sound/soc/sunxi/Makefile +++ b/sound/soc/sunxi/Makefile @@ -3,4 +3,6 @@ obj-$(CONFIG_SND_SUN4I_CODEC) += sun4i-codec.o obj-$(CONFIG_SND_SUN4I_I2S) += sun4i-i2s.o obj-$(CONFIG_SND_SUN4I_SPDIF) += sun4i-spdif.o obj-$(CONFIG_SND_SUN8I_CODEC_ANALOG) += sun8i-codec-analog.o +obj-$(CONFIG_SND_SUN50I_CODEC_ANALOG) += sun50i-codec-analog.o obj-$(CONFIG_SND_SUN8I_CODEC) += sun8i-codec.o +obj-$(CONFIG_SND_SUN8I_ADDA_PR_REGMAP) += sun8i-adda-pr-regmap.o diff --git a/sound/soc/sunxi/sun4i-i2s.c b/sound/soc/sunxi/sun4i-i2s.c index a4aa931ebfae..d5ec1a20499d 100644 --- a/sound/soc/sunxi/sun4i-i2s.c +++ b/sound/soc/sunxi/sun4i-i2s.c @@ -644,40 +644,6 @@ static int sun4i_i2s_trigger(struct snd_pcm_substream *substream, int cmd, return 0; } -static int sun4i_i2s_startup(struct snd_pcm_substream *substream, - struct snd_soc_dai *dai) -{ - struct sun4i_i2s *i2s = snd_soc_dai_get_drvdata(dai); - - /* Enable the whole hardware block */ - regmap_update_bits(i2s->regmap, SUN4I_I2S_CTRL_REG, - SUN4I_I2S_CTRL_GL_EN, SUN4I_I2S_CTRL_GL_EN); - - /* Enable the first output line */ - regmap_update_bits(i2s->regmap, SUN4I_I2S_CTRL_REG, - SUN4I_I2S_CTRL_SDO_EN_MASK, - SUN4I_I2S_CTRL_SDO_EN(0)); - - - return clk_prepare_enable(i2s->mod_clk); -} - -static void sun4i_i2s_shutdown(struct snd_pcm_substream *substream, - struct snd_soc_dai *dai) -{ - struct sun4i_i2s *i2s = snd_soc_dai_get_drvdata(dai); - - clk_disable_unprepare(i2s->mod_clk); - - /* Disable our output lines */ - regmap_update_bits(i2s->regmap, SUN4I_I2S_CTRL_REG, - SUN4I_I2S_CTRL_SDO_EN_MASK, 0); - - /* Disable the whole hardware block */ - regmap_update_bits(i2s->regmap, SUN4I_I2S_CTRL_REG, - SUN4I_I2S_CTRL_GL_EN, 0); -} - static int sun4i_i2s_set_sysclk(struct snd_soc_dai *dai, int clk_id, unsigned int freq, int dir) { @@ -695,8 +661,6 @@ static const struct snd_soc_dai_ops sun4i_i2s_dai_ops = { .hw_params = sun4i_i2s_hw_params, .set_fmt = sun4i_i2s_set_fmt, .set_sysclk = sun4i_i2s_set_sysclk, - .shutdown = sun4i_i2s_shutdown, - .startup = sun4i_i2s_startup, .trigger = sun4i_i2s_trigger, }; @@ -869,6 +833,21 @@ static int sun4i_i2s_runtime_resume(struct device *dev) goto err_disable_clk; } + /* Enable the whole hardware block */ + regmap_update_bits(i2s->regmap, SUN4I_I2S_CTRL_REG, + SUN4I_I2S_CTRL_GL_EN, SUN4I_I2S_CTRL_GL_EN); + + /* Enable the first output line */ + regmap_update_bits(i2s->regmap, SUN4I_I2S_CTRL_REG, + SUN4I_I2S_CTRL_SDO_EN_MASK, + SUN4I_I2S_CTRL_SDO_EN(0)); + + ret = clk_prepare_enable(i2s->mod_clk); + if (ret) { + dev_err(dev, "Failed to enable module clock\n"); + goto err_disable_clk; + } + return 0; err_disable_clk: @@ -880,6 +859,16 @@ static int sun4i_i2s_runtime_suspend(struct device *dev) { struct sun4i_i2s *i2s = dev_get_drvdata(dev); + clk_disable_unprepare(i2s->mod_clk); + + /* Disable our output lines */ + regmap_update_bits(i2s->regmap, SUN4I_I2S_CTRL_REG, + SUN4I_I2S_CTRL_SDO_EN_MASK, 0); + + /* Disable the whole hardware block */ + regmap_update_bits(i2s->regmap, SUN4I_I2S_CTRL_REG, + SUN4I_I2S_CTRL_GL_EN, 0); + regcache_cache_only(i2s->regmap, true); clk_disable_unprepare(i2s->bus_clk); @@ -961,6 +950,23 @@ static const struct sun4i_i2s_quirks sun8i_h3_i2s_quirks = { .field_rxchansel = REG_FIELD(SUN8I_I2S_RX_CHAN_SEL_REG, 0, 2), }; +static const struct sun4i_i2s_quirks sun50i_a64_codec_i2s_quirks = { + .has_reset = true, + .reg_offset_txdata = SUN8I_I2S_FIFO_TX_REG, + .sun4i_i2s_regmap = &sun4i_i2s_regmap_config, + .has_slave_select_bit = true, + .field_clkdiv_mclk_en = REG_FIELD(SUN4I_I2S_CLK_DIV_REG, 7, 7), + .field_fmt_wss = REG_FIELD(SUN4I_I2S_FMT0_REG, 2, 3), + .field_fmt_sr = REG_FIELD(SUN4I_I2S_FMT0_REG, 4, 5), + .field_fmt_bclk = REG_FIELD(SUN4I_I2S_FMT0_REG, 6, 6), + .field_fmt_lrclk = REG_FIELD(SUN4I_I2S_FMT0_REG, 7, 7), + .field_fmt_mode = REG_FIELD(SUN4I_I2S_FMT0_REG, 0, 1), + .field_txchanmap = REG_FIELD(SUN4I_I2S_TX_CHAN_MAP_REG, 0, 31), + .field_rxchanmap = REG_FIELD(SUN4I_I2S_RX_CHAN_MAP_REG, 0, 31), + .field_txchansel = REG_FIELD(SUN4I_I2S_TX_CHAN_SEL_REG, 0, 2), + .field_rxchansel = REG_FIELD(SUN4I_I2S_RX_CHAN_SEL_REG, 0, 2), +}; + static int sun4i_i2s_init_regmap_fields(struct device *dev, struct sun4i_i2s *i2s) { @@ -1169,6 +1175,10 @@ static const struct of_device_id sun4i_i2s_match[] = { .compatible = "allwinner,sun8i-h3-i2s", .data = &sun8i_h3_i2s_quirks, }, + { + .compatible = "allwinner,sun50i-a64-codec-i2s", + .data = &sun50i_a64_codec_i2s_quirks, + }, {} }; MODULE_DEVICE_TABLE(of, sun4i_i2s_match); diff --git a/sound/soc/sunxi/sun50i-codec-analog.c b/sound/soc/sunxi/sun50i-codec-analog.c new file mode 100644 index 000000000000..8f5f999df631 --- /dev/null +++ b/sound/soc/sunxi/sun50i-codec-analog.c @@ -0,0 +1,444 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * This driver supports the analog controls for the internal codec + * found in Allwinner's A64 SoC. + * + * Copyright (C) 2016 Chen-Yu Tsai + * Copyright (C) 2017 Marcus Cooper + * Copyright (C) 2018 Vasily Khoruzhick + * + * Based on sun8i-codec-analog.c + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "sun8i-adda-pr-regmap.h" + +/* Codec analog control register offsets and bit fields */ +#define SUN50I_ADDA_HP_CTRL 0x00 +#define SUN50I_ADDA_HP_CTRL_PA_CLK_GATE 7 +#define SUN50I_ADDA_HP_CTRL_HPPA_EN 6 +#define SUN50I_ADDA_HP_CTRL_HPVOL 0 + +#define SUN50I_ADDA_OL_MIX_CTRL 0x01 +#define SUN50I_ADDA_OL_MIX_CTRL_MIC1 6 +#define SUN50I_ADDA_OL_MIX_CTRL_MIC2 5 +#define SUN50I_ADDA_OL_MIX_CTRL_PHONE 4 +#define SUN50I_ADDA_OL_MIX_CTRL_PHONEN 3 +#define SUN50I_ADDA_OL_MIX_CTRL_LINEINL 2 +#define SUN50I_ADDA_OL_MIX_CTRL_DACL 1 +#define SUN50I_ADDA_OL_MIX_CTRL_DACR 0 + +#define SUN50I_ADDA_OR_MIX_CTRL 0x02 +#define SUN50I_ADDA_OR_MIX_CTRL_MIC1 6 +#define SUN50I_ADDA_OR_MIX_CTRL_MIC2 5 +#define SUN50I_ADDA_OR_MIX_CTRL_PHONE 4 +#define SUN50I_ADDA_OR_MIX_CTRL_PHONEP 3 +#define SUN50I_ADDA_OR_MIX_CTRL_LINEINR 2 +#define SUN50I_ADDA_OR_MIX_CTRL_DACR 1 +#define SUN50I_ADDA_OR_MIX_CTRL_DACL 0 + +#define SUN50I_ADDA_LINEOUT_CTRL0 0x05 +#define SUN50I_ADDA_LINEOUT_CTRL0_LEN 7 +#define SUN50I_ADDA_LINEOUT_CTRL0_REN 6 +#define SUN50I_ADDA_LINEOUT_CTRL0_LSRC_SEL 5 +#define SUN50I_ADDA_LINEOUT_CTRL0_RSRC_SEL 4 + +#define SUN50I_ADDA_LINEOUT_CTRL1 0x06 +#define SUN50I_ADDA_LINEOUT_CTRL1_VOL 0 + +#define SUN50I_ADDA_MIC1_CTRL 0x07 +#define SUN50I_ADDA_MIC1_CTRL_MIC1G 4 +#define SUN50I_ADDA_MIC1_CTRL_MIC1AMPEN 3 +#define SUN50I_ADDA_MIC1_CTRL_MIC1BOOST 0 + +#define SUN50I_ADDA_MIC2_CTRL 0x08 +#define SUN50I_ADDA_MIC2_CTRL_MIC2G 4 +#define SUN50I_ADDA_MIC2_CTRL_MIC2AMPEN 3 +#define SUN50I_ADDA_MIC2_CTRL_MIC2BOOST 0 + +#define SUN50I_ADDA_LINEIN_CTRL 0x09 +#define SUN50I_ADDA_LINEIN_CTRL_LINEING 0 + +#define SUN50I_ADDA_MIX_DAC_CTRL 0x0a +#define SUN50I_ADDA_MIX_DAC_CTRL_DACAREN 7 +#define SUN50I_ADDA_MIX_DAC_CTRL_DACALEN 6 +#define SUN50I_ADDA_MIX_DAC_CTRL_RMIXEN 5 +#define SUN50I_ADDA_MIX_DAC_CTRL_LMIXEN 4 +#define SUN50I_ADDA_MIX_DAC_CTRL_RHPPAMUTE 3 +#define SUN50I_ADDA_MIX_DAC_CTRL_LHPPAMUTE 2 +#define SUN50I_ADDA_MIX_DAC_CTRL_RHPIS 1 +#define SUN50I_ADDA_MIX_DAC_CTRL_LHPIS 0 + +#define SUN50I_ADDA_L_ADCMIX_SRC 0x0b +#define SUN50I_ADDA_L_ADCMIX_SRC_MIC1 6 +#define SUN50I_ADDA_L_ADCMIX_SRC_MIC2 5 +#define SUN50I_ADDA_L_ADCMIX_SRC_PHONE 4 +#define SUN50I_ADDA_L_ADCMIX_SRC_PHONEN 3 +#define SUN50I_ADDA_L_ADCMIX_SRC_LINEINL 2 +#define SUN50I_ADDA_L_ADCMIX_SRC_OMIXRL 1 +#define SUN50I_ADDA_L_ADCMIX_SRC_OMIXRR 0 + +#define SUN50I_ADDA_R_ADCMIX_SRC 0x0c +#define SUN50I_ADDA_R_ADCMIX_SRC_MIC1 6 +#define SUN50I_ADDA_R_ADCMIX_SRC_MIC2 5 +#define SUN50I_ADDA_R_ADCMIX_SRC_PHONE 4 +#define SUN50I_ADDA_R_ADCMIX_SRC_PHONEP 3 +#define SUN50I_ADDA_R_ADCMIX_SRC_LINEINR 2 +#define SUN50I_ADDA_R_ADCMIX_SRC_OMIXR 1 +#define SUN50I_ADDA_R_ADCMIX_SRC_OMIXL 0 + +#define SUN50I_ADDA_ADC_CTRL 0x0d +#define SUN50I_ADDA_ADC_CTRL_ADCREN 7 +#define SUN50I_ADDA_ADC_CTRL_ADCLEN 6 +#define SUN50I_ADDA_ADC_CTRL_ADCG 0 + +#define SUN50I_ADDA_HS_MBIAS_CTRL 0x0e +#define SUN50I_ADDA_HS_MBIAS_CTRL_MMICBIASEN 7 + +#define SUN50I_ADDA_JACK_MIC_CTRL 0x1d +#define SUN50I_ADDA_JACK_MIC_CTRL_HMICBIASEN 5 + +/* mixer controls */ +static const struct snd_kcontrol_new sun50i_a64_codec_mixer_controls[] = { + SOC_DAPM_DOUBLE_R("DAC Playback Switch", + SUN50I_ADDA_OL_MIX_CTRL, + SUN50I_ADDA_OR_MIX_CTRL, + SUN50I_ADDA_OL_MIX_CTRL_DACL, 1, 0), + SOC_DAPM_DOUBLE_R("DAC Reversed Playback Switch", + SUN50I_ADDA_OL_MIX_CTRL, + SUN50I_ADDA_OR_MIX_CTRL, + SUN50I_ADDA_OL_MIX_CTRL_DACR, 1, 0), + SOC_DAPM_DOUBLE_R("Line In Playback Switch", + SUN50I_ADDA_OL_MIX_CTRL, + SUN50I_ADDA_OR_MIX_CTRL, + SUN50I_ADDA_OL_MIX_CTRL_LINEINL, 1, 0), + SOC_DAPM_DOUBLE_R("Mic1 Playback Switch", + SUN50I_ADDA_OL_MIX_CTRL, + SUN50I_ADDA_OR_MIX_CTRL, + SUN50I_ADDA_OL_MIX_CTRL_MIC1, 1, 0), + SOC_DAPM_DOUBLE_R("Mic2 Playback Switch", + SUN50I_ADDA_OL_MIX_CTRL, + SUN50I_ADDA_OR_MIX_CTRL, + SUN50I_ADDA_OL_MIX_CTRL_MIC2, 1, 0), +}; + +/* ADC mixer controls */ +static const struct snd_kcontrol_new sun50i_codec_adc_mixer_controls[] = { + SOC_DAPM_DOUBLE_R("Mixer Capture Switch", + SUN50I_ADDA_L_ADCMIX_SRC, + SUN50I_ADDA_R_ADCMIX_SRC, + SUN50I_ADDA_L_ADCMIX_SRC_OMIXRL, 1, 0), + SOC_DAPM_DOUBLE_R("Mixer Reversed Capture Switch", + SUN50I_ADDA_L_ADCMIX_SRC, + SUN50I_ADDA_R_ADCMIX_SRC, + SUN50I_ADDA_L_ADCMIX_SRC_OMIXRR, 1, 0), + SOC_DAPM_DOUBLE_R("Line In Capture Switch", + SUN50I_ADDA_L_ADCMIX_SRC, + SUN50I_ADDA_R_ADCMIX_SRC, + SUN50I_ADDA_L_ADCMIX_SRC_LINEINL, 1, 0), + SOC_DAPM_DOUBLE_R("Mic1 Capture Switch", + SUN50I_ADDA_L_ADCMIX_SRC, + SUN50I_ADDA_R_ADCMIX_SRC, + SUN50I_ADDA_L_ADCMIX_SRC_MIC1, 1, 0), + SOC_DAPM_DOUBLE_R("Mic2 Capture Switch", + SUN50I_ADDA_L_ADCMIX_SRC, + SUN50I_ADDA_R_ADCMIX_SRC, + SUN50I_ADDA_L_ADCMIX_SRC_MIC2, 1, 0), +}; + +static const DECLARE_TLV_DB_SCALE(sun50i_codec_out_mixer_pregain_scale, + -450, 150, 0); +static const DECLARE_TLV_DB_RANGE(sun50i_codec_mic_gain_scale, + 0, 0, TLV_DB_SCALE_ITEM(0, 0, 0), + 1, 7, TLV_DB_SCALE_ITEM(2400, 300, 0), +); + +static const DECLARE_TLV_DB_SCALE(sun50i_codec_hp_vol_scale, -6300, 100, 1); + +static const DECLARE_TLV_DB_RANGE(sun50i_codec_lineout_vol_scale, + 0, 1, TLV_DB_SCALE_ITEM(TLV_DB_GAIN_MUTE, 0, 1), + 2, 31, TLV_DB_SCALE_ITEM(-4350, 150, 0), +); + + +/* volume / mute controls */ +static const struct snd_kcontrol_new sun50i_a64_codec_controls[] = { + SOC_SINGLE_TLV("Headphone Playback Volume", + SUN50I_ADDA_HP_CTRL, + SUN50I_ADDA_HP_CTRL_HPVOL, 0x3f, 0, + sun50i_codec_hp_vol_scale), + + SOC_DOUBLE("Headphone Playback Switch", + SUN50I_ADDA_MIX_DAC_CTRL, + SUN50I_ADDA_MIX_DAC_CTRL_LHPPAMUTE, + SUN50I_ADDA_MIX_DAC_CTRL_RHPPAMUTE, 1, 0), + + /* Mixer pre-gain */ + SOC_SINGLE_TLV("Mic1 Playback Volume", SUN50I_ADDA_MIC1_CTRL, + SUN50I_ADDA_MIC1_CTRL_MIC1G, + 0x7, 0, sun50i_codec_out_mixer_pregain_scale), + + /* Microphone Amp boost gain */ + SOC_SINGLE_TLV("Mic1 Boost Volume", SUN50I_ADDA_MIC1_CTRL, + SUN50I_ADDA_MIC1_CTRL_MIC1BOOST, 0x7, 0, + sun50i_codec_mic_gain_scale), + + /* Mixer pre-gain */ + SOC_SINGLE_TLV("Mic2 Playback Volume", + SUN50I_ADDA_MIC2_CTRL, SUN50I_ADDA_MIC2_CTRL_MIC2G, + 0x7, 0, sun50i_codec_out_mixer_pregain_scale), + + /* Microphone Amp boost gain */ + SOC_SINGLE_TLV("Mic2 Boost Volume", SUN50I_ADDA_MIC2_CTRL, + SUN50I_ADDA_MIC2_CTRL_MIC2BOOST, 0x7, 0, + sun50i_codec_mic_gain_scale), + + /* ADC */ + SOC_SINGLE_TLV("ADC Gain Capture Volume", SUN50I_ADDA_ADC_CTRL, + SUN50I_ADDA_ADC_CTRL_ADCG, 0x7, 0, + sun50i_codec_out_mixer_pregain_scale), + + /* Mixer pre-gain */ + SOC_SINGLE_TLV("Line In Playback Volume", SUN50I_ADDA_LINEIN_CTRL, + SUN50I_ADDA_LINEIN_CTRL_LINEING, + 0x7, 0, sun50i_codec_out_mixer_pregain_scale), + + SOC_SINGLE_TLV("Line Out Playback Volume", + SUN50I_ADDA_LINEOUT_CTRL1, + SUN50I_ADDA_LINEOUT_CTRL1_VOL, 0x1f, 0, + sun50i_codec_lineout_vol_scale), + + SOC_DOUBLE("Line Out Playback Switch", + SUN50I_ADDA_LINEOUT_CTRL0, + SUN50I_ADDA_LINEOUT_CTRL0_LEN, + SUN50I_ADDA_LINEOUT_CTRL0_REN, 1, 0), + +}; + +static const char * const sun50i_codec_hp_src_enum_text[] = { + "DAC", "Mixer", +}; + +static SOC_ENUM_DOUBLE_DECL(sun50i_codec_hp_src_enum, + SUN50I_ADDA_MIX_DAC_CTRL, + SUN50I_ADDA_MIX_DAC_CTRL_LHPIS, + SUN50I_ADDA_MIX_DAC_CTRL_RHPIS, + sun50i_codec_hp_src_enum_text); + +static const struct snd_kcontrol_new sun50i_codec_hp_src[] = { + SOC_DAPM_ENUM("Headphone Source Playback Route", + sun50i_codec_hp_src_enum), +}; + +static const char * const sun50i_codec_lineout_src_enum_text[] = { + "Stereo", "Mono Differential", +}; + +static SOC_ENUM_DOUBLE_DECL(sun50i_codec_lineout_src_enum, + SUN50I_ADDA_LINEOUT_CTRL0, + SUN50I_ADDA_LINEOUT_CTRL0_LSRC_SEL, + SUN50I_ADDA_LINEOUT_CTRL0_RSRC_SEL, + sun50i_codec_lineout_src_enum_text); + +static const struct snd_kcontrol_new sun50i_codec_lineout_src[] = { + SOC_DAPM_ENUM("Line Out Source Playback Route", + sun50i_codec_lineout_src_enum), +}; + +static const struct snd_soc_dapm_widget sun50i_a64_codec_widgets[] = { + /* DAC */ + SND_SOC_DAPM_DAC("Left DAC", NULL, SUN50I_ADDA_MIX_DAC_CTRL, + SUN50I_ADDA_MIX_DAC_CTRL_DACALEN, 0), + SND_SOC_DAPM_DAC("Right DAC", NULL, SUN50I_ADDA_MIX_DAC_CTRL, + SUN50I_ADDA_MIX_DAC_CTRL_DACAREN, 0), + /* ADC */ + SND_SOC_DAPM_ADC("Left ADC", NULL, SUN50I_ADDA_ADC_CTRL, + SUN50I_ADDA_ADC_CTRL_ADCLEN, 0), + SND_SOC_DAPM_ADC("Right ADC", NULL, SUN50I_ADDA_ADC_CTRL, + SUN50I_ADDA_ADC_CTRL_ADCREN, 0), + /* + * Due to this component and the codec belonging to separate DAPM + * contexts, we need to manually link the above widgets to their + * stream widgets at the card level. + */ + + SND_SOC_DAPM_MUX("Headphone Source Playback Route", + SND_SOC_NOPM, 0, 0, sun50i_codec_hp_src), + SND_SOC_DAPM_OUT_DRV("Headphone Amp", SUN50I_ADDA_HP_CTRL, + SUN50I_ADDA_HP_CTRL_HPPA_EN, 0, NULL, 0), + SND_SOC_DAPM_OUTPUT("HP"), + + SND_SOC_DAPM_MUX("Line Out Source Playback Route", + SND_SOC_NOPM, 0, 0, sun50i_codec_lineout_src), + SND_SOC_DAPM_OUTPUT("LINEOUT"), + + /* Microphone inputs */ + SND_SOC_DAPM_INPUT("MIC1"), + + /* Microphone Bias */ + SND_SOC_DAPM_SUPPLY("MBIAS", SUN50I_ADDA_HS_MBIAS_CTRL, + SUN50I_ADDA_HS_MBIAS_CTRL_MMICBIASEN, + 0, NULL, 0), + + /* Mic input path */ + SND_SOC_DAPM_PGA("Mic1 Amplifier", SUN50I_ADDA_MIC1_CTRL, + SUN50I_ADDA_MIC1_CTRL_MIC1AMPEN, 0, NULL, 0), + + /* Microphone input */ + SND_SOC_DAPM_INPUT("MIC2"), + + /* Microphone Bias */ + SND_SOC_DAPM_SUPPLY("HBIAS", SUN50I_ADDA_JACK_MIC_CTRL, + SUN50I_ADDA_JACK_MIC_CTRL_HMICBIASEN, + 0, NULL, 0), + + /* Mic input path */ + SND_SOC_DAPM_PGA("Mic2 Amplifier", SUN50I_ADDA_MIC2_CTRL, + SUN50I_ADDA_MIC2_CTRL_MIC2AMPEN, 0, NULL, 0), + + /* Line input */ + SND_SOC_DAPM_INPUT("LINEIN"), + + /* Mixers */ + SND_SOC_DAPM_MIXER("Left Mixer", SUN50I_ADDA_MIX_DAC_CTRL, + SUN50I_ADDA_MIX_DAC_CTRL_LMIXEN, 0, + sun50i_a64_codec_mixer_controls, + ARRAY_SIZE(sun50i_a64_codec_mixer_controls)), + SND_SOC_DAPM_MIXER("Right Mixer", SUN50I_ADDA_MIX_DAC_CTRL, + SUN50I_ADDA_MIX_DAC_CTRL_RMIXEN, 0, + sun50i_a64_codec_mixer_controls, + ARRAY_SIZE(sun50i_a64_codec_mixer_controls)), + SND_SOC_DAPM_MIXER("Left ADC Mixer", SUN50I_ADDA_ADC_CTRL, + SUN50I_ADDA_ADC_CTRL_ADCLEN, 0, + sun50i_codec_adc_mixer_controls, + ARRAY_SIZE(sun50i_codec_adc_mixer_controls)), + SND_SOC_DAPM_MIXER("Right ADC Mixer", SUN50I_ADDA_ADC_CTRL, + SUN50I_ADDA_ADC_CTRL_ADCREN, 0, + sun50i_codec_adc_mixer_controls, + ARRAY_SIZE(sun50i_codec_adc_mixer_controls)), +}; + +static const struct snd_soc_dapm_route sun50i_a64_codec_routes[] = { + /* Left Mixer Routes */ + { "Left Mixer", "DAC Playback Switch", "Left DAC" }, + { "Left Mixer", "DAC Reversed Playback Switch", "Right DAC" }, + { "Left Mixer", "Mic1 Playback Switch", "Mic1 Amplifier" }, + + /* Right Mixer Routes */ + { "Right Mixer", "DAC Playback Switch", "Right DAC" }, + { "Right Mixer", "DAC Reversed Playback Switch", "Left DAC" }, + { "Right Mixer", "Mic1 Playback Switch", "Mic1 Amplifier" }, + + /* Left ADC Mixer Routes */ + { "Left ADC Mixer", "Mixer Capture Switch", "Left Mixer" }, + { "Left ADC Mixer", "Mixer Reversed Capture Switch", "Right Mixer" }, + { "Left ADC Mixer", "Mic1 Capture Switch", "Mic1 Amplifier" }, + + /* Right ADC Mixer Routes */ + { "Right ADC Mixer", "Mixer Capture Switch", "Right Mixer" }, + { "Right ADC Mixer", "Mixer Reversed Capture Switch", "Left Mixer" }, + { "Right ADC Mixer", "Mic1 Capture Switch", "Mic1 Amplifier" }, + + /* ADC Routes */ + { "Left ADC", NULL, "Left ADC Mixer" }, + { "Right ADC", NULL, "Right ADC Mixer" }, + + /* Headphone Routes */ + { "Headphone Source Playback Route", "DAC", "Left DAC" }, + { "Headphone Source Playback Route", "DAC", "Right DAC" }, + { "Headphone Source Playback Route", "Mixer", "Left Mixer" }, + { "Headphone Source Playback Route", "Mixer", "Right Mixer" }, + { "Headphone Amp", NULL, "Headphone Source Playback Route" }, + { "HP", NULL, "Headphone Amp" }, + + /* Microphone Routes */ + { "Mic1 Amplifier", NULL, "MIC1"}, + + /* Microphone Routes */ + { "Mic2 Amplifier", NULL, "MIC2"}, + { "Left Mixer", "Mic2 Playback Switch", "Mic2 Amplifier" }, + { "Right Mixer", "Mic2 Playback Switch", "Mic2 Amplifier" }, + { "Left ADC Mixer", "Mic2 Capture Switch", "Mic2 Amplifier" }, + { "Right ADC Mixer", "Mic2 Capture Switch", "Mic2 Amplifier" }, + + /* Line-in Routes */ + { "Left Mixer", "Line In Playback Switch", "LINEIN" }, + { "Right Mixer", "Line In Playback Switch", "LINEIN" }, + { "Left ADC Mixer", "Line In Capture Switch", "LINEIN" }, + { "Right ADC Mixer", "Line In Capture Switch", "LINEIN" }, + + /* Line-out Routes */ + { "Line Out Source Playback Route", "Stereo", "Left Mixer" }, + { "Line Out Source Playback Route", "Stereo", "Right Mixer" }, + { "Line Out Source Playback Route", "Mono Differential", "Left Mixer" }, + { "Line Out Source Playback Route", "Mono Differential", + "Right Mixer" }, + { "LINEOUT", NULL, "Line Out Source Playback Route" }, +}; + +static const struct snd_soc_component_driver sun50i_codec_analog_cmpnt_drv = { + .controls = sun50i_a64_codec_controls, + .num_controls = ARRAY_SIZE(sun50i_a64_codec_controls), + .dapm_widgets = sun50i_a64_codec_widgets, + .num_dapm_widgets = ARRAY_SIZE(sun50i_a64_codec_widgets), + .dapm_routes = sun50i_a64_codec_routes, + .num_dapm_routes = ARRAY_SIZE(sun50i_a64_codec_routes), +}; + +static const struct of_device_id sun50i_codec_analog_of_match[] = { + { + .compatible = "allwinner,sun50i-a64-codec-analog", + }, + {} +}; +MODULE_DEVICE_TABLE(of, sun50i_codec_analog_of_match); + +static int sun50i_codec_analog_probe(struct platform_device *pdev) +{ + struct resource *res; + struct regmap *regmap; + void __iomem *base; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(base)) { + dev_err(&pdev->dev, "Failed to map the registers\n"); + return PTR_ERR(base); + } + + regmap = sun8i_adda_pr_regmap_init(&pdev->dev, base); + if (IS_ERR(regmap)) { + dev_err(&pdev->dev, "Failed to create regmap\n"); + return PTR_ERR(regmap); + } + + return devm_snd_soc_register_component(&pdev->dev, + &sun50i_codec_analog_cmpnt_drv, + NULL, 0); +} + +static struct platform_driver sun50i_codec_analog_driver = { + .driver = { + .name = "sun50i-codec-analog", + .of_match_table = sun50i_codec_analog_of_match, + }, + .probe = sun50i_codec_analog_probe, +}; +module_platform_driver(sun50i_codec_analog_driver); + +MODULE_DESCRIPTION("Allwinner internal codec analog controls driver for A64"); +MODULE_AUTHOR("Vasily Khoruzhick "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:sun50i-codec-analog"); diff --git a/sound/soc/sunxi/sun8i-adda-pr-regmap.c b/sound/soc/sunxi/sun8i-adda-pr-regmap.c new file mode 100644 index 000000000000..e68ce9d2884d --- /dev/null +++ b/sound/soc/sunxi/sun8i-adda-pr-regmap.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * This driver provides regmap to access to analog part of audio codec + * found on Allwinner A23, A31s, A33, H3 and A64 Socs + * + * Copyright 2016 Chen-Yu Tsai + * Copyright (C) 2018 Vasily Khoruzhick + */ + +#include +#include +#include +#include + +#include "sun8i-adda-pr-regmap.h" + +/* Analog control register access bits */ +#define ADDA_PR 0x0 /* PRCM base + 0x1c0 */ +#define ADDA_PR_RESET BIT(28) +#define ADDA_PR_WRITE BIT(24) +#define ADDA_PR_ADDR_SHIFT 16 +#define ADDA_PR_ADDR_MASK GENMASK(4, 0) +#define ADDA_PR_DATA_IN_SHIFT 8 +#define ADDA_PR_DATA_IN_MASK GENMASK(7, 0) +#define ADDA_PR_DATA_OUT_SHIFT 0 +#define ADDA_PR_DATA_OUT_MASK GENMASK(7, 0) + +/* regmap access bits */ +static int adda_reg_read(void *context, unsigned int reg, unsigned int *val) +{ + void __iomem *base = (void __iomem *)context; + u32 tmp; + + /* De-assert reset */ + writel(readl(base) | ADDA_PR_RESET, base); + + /* Clear write bit */ + writel(readl(base) & ~ADDA_PR_WRITE, base); + + /* Set register address */ + tmp = readl(base); + tmp &= ~(ADDA_PR_ADDR_MASK << ADDA_PR_ADDR_SHIFT); + tmp |= (reg & ADDA_PR_ADDR_MASK) << ADDA_PR_ADDR_SHIFT; + writel(tmp, base); + + /* Read back value */ + *val = readl(base) & ADDA_PR_DATA_OUT_MASK; + + return 0; +} + +static int adda_reg_write(void *context, unsigned int reg, unsigned int val) +{ + void __iomem *base = (void __iomem *)context; + u32 tmp; + + /* De-assert reset */ + writel(readl(base) | ADDA_PR_RESET, base); + + /* Set register address */ + tmp = readl(base); + tmp &= ~(ADDA_PR_ADDR_MASK << ADDA_PR_ADDR_SHIFT); + tmp |= (reg & ADDA_PR_ADDR_MASK) << ADDA_PR_ADDR_SHIFT; + writel(tmp, base); + + /* Set data to write */ + tmp = readl(base); + tmp &= ~(ADDA_PR_DATA_IN_MASK << ADDA_PR_DATA_IN_SHIFT); + tmp |= (val & ADDA_PR_DATA_IN_MASK) << ADDA_PR_DATA_IN_SHIFT; + writel(tmp, base); + + /* Set write bit to signal a write */ + writel(readl(base) | ADDA_PR_WRITE, base); + + /* Clear write bit */ + writel(readl(base) & ~ADDA_PR_WRITE, base); + + return 0; +} + +static const struct regmap_config adda_pr_regmap_cfg = { + .name = "adda-pr", + .reg_bits = 5, + .reg_stride = 1, + .val_bits = 8, + .reg_read = adda_reg_read, + .reg_write = adda_reg_write, + .fast_io = true, + .max_register = 31, +}; + +struct regmap *sun8i_adda_pr_regmap_init(struct device *dev, + void __iomem *base) +{ + return devm_regmap_init(dev, NULL, base, &adda_pr_regmap_cfg); +} +EXPORT_SYMBOL_GPL(sun8i_adda_pr_regmap_init); + +MODULE_DESCRIPTION("Allwinner analog audio codec regmap driver"); +MODULE_AUTHOR("Vasily Khoruzhick "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:sunxi-adda-pr"); diff --git a/sound/soc/sunxi/sun8i-adda-pr-regmap.h b/sound/soc/sunxi/sun8i-adda-pr-regmap.h new file mode 100644 index 000000000000..a5ae95dfebc1 --- /dev/null +++ b/sound/soc/sunxi/sun8i-adda-pr-regmap.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (C) 2018 Vasily Khoruzhick + */ + +struct regmap *sun8i_adda_pr_regmap_init(struct device *dev, + void __iomem *base); diff --git a/sound/soc/sunxi/sun8i-codec-analog.c b/sound/soc/sunxi/sun8i-codec-analog.c index 485e79f292c4..916a46bbc1c8 100644 --- a/sound/soc/sunxi/sun8i-codec-analog.c +++ b/sound/soc/sunxi/sun8i-codec-analog.c @@ -27,6 +27,8 @@ #include #include +#include "sun8i-adda-pr-regmap.h" + /* Codec analog control register offsets and bit fields */ #define SUN8I_ADDA_HP_VOLC 0x00 #define SUN8I_ADDA_HP_VOLC_PA_CLK_GATE 7 @@ -120,81 +122,6 @@ #define SUN8I_ADDA_ADC_AP_EN_ADCLEN 6 #define SUN8I_ADDA_ADC_AP_EN_ADCG 0 -/* Analog control register access bits */ -#define ADDA_PR 0x0 /* PRCM base + 0x1c0 */ -#define ADDA_PR_RESET BIT(28) -#define ADDA_PR_WRITE BIT(24) -#define ADDA_PR_ADDR_SHIFT 16 -#define ADDA_PR_ADDR_MASK GENMASK(4, 0) -#define ADDA_PR_DATA_IN_SHIFT 8 -#define ADDA_PR_DATA_IN_MASK GENMASK(7, 0) -#define ADDA_PR_DATA_OUT_SHIFT 0 -#define ADDA_PR_DATA_OUT_MASK GENMASK(7, 0) - -/* regmap access bits */ -static int adda_reg_read(void *context, unsigned int reg, unsigned int *val) -{ - void __iomem *base = (void __iomem *)context; - u32 tmp; - - /* De-assert reset */ - writel(readl(base) | ADDA_PR_RESET, base); - - /* Clear write bit */ - writel(readl(base) & ~ADDA_PR_WRITE, base); - - /* Set register address */ - tmp = readl(base); - tmp &= ~(ADDA_PR_ADDR_MASK << ADDA_PR_ADDR_SHIFT); - tmp |= (reg & ADDA_PR_ADDR_MASK) << ADDA_PR_ADDR_SHIFT; - writel(tmp, base); - - /* Read back value */ - *val = readl(base) & ADDA_PR_DATA_OUT_MASK; - - return 0; -} - -static int adda_reg_write(void *context, unsigned int reg, unsigned int val) -{ - void __iomem *base = (void __iomem *)context; - u32 tmp; - - /* De-assert reset */ - writel(readl(base) | ADDA_PR_RESET, base); - - /* Set register address */ - tmp = readl(base); - tmp &= ~(ADDA_PR_ADDR_MASK << ADDA_PR_ADDR_SHIFT); - tmp |= (reg & ADDA_PR_ADDR_MASK) << ADDA_PR_ADDR_SHIFT; - writel(tmp, base); - - /* Set data to write */ - tmp = readl(base); - tmp &= ~(ADDA_PR_DATA_IN_MASK << ADDA_PR_DATA_IN_SHIFT); - tmp |= (val & ADDA_PR_DATA_IN_MASK) << ADDA_PR_DATA_IN_SHIFT; - writel(tmp, base); - - /* Set write bit to signal a write */ - writel(readl(base) | ADDA_PR_WRITE, base); - - /* Clear write bit */ - writel(readl(base) & ~ADDA_PR_WRITE, base); - - return 0; -} - -static const struct regmap_config adda_pr_regmap_cfg = { - .name = "adda-pr", - .reg_bits = 5, - .reg_stride = 1, - .val_bits = 8, - .reg_read = adda_reg_read, - .reg_write = adda_reg_write, - .fast_io = true, - .max_register = 24, -}; - /* mixer controls */ static const struct snd_kcontrol_new sun8i_codec_mixer_controls[] = { SOC_DAPM_DOUBLE_R("DAC Playback Switch", @@ -912,7 +839,7 @@ static int sun8i_codec_analog_probe(struct platform_device *pdev) return PTR_ERR(base); } - regmap = devm_regmap_init(&pdev->dev, NULL, base, &adda_pr_regmap_cfg); + regmap = sun8i_adda_pr_regmap_init(&pdev->dev, base); if (IS_ERR(regmap)) { dev_err(&pdev->dev, "Failed to create regmap\n"); return PTR_ERR(regmap); diff --git a/sound/soc/sunxi/sun8i-codec.c b/sound/soc/sunxi/sun8i-codec.c index fb37dd927e33..522a72fde78d 100644 --- a/sound/soc/sunxi/sun8i-codec.c +++ b/sound/soc/sunxi/sun8i-codec.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -52,7 +53,6 @@ #define SUN8I_AIF1CLK_CTRL_AIF1_LRCK_INV 13 #define SUN8I_AIF1CLK_CTRL_AIF1_BCLK_DIV 9 #define SUN8I_AIF1CLK_CTRL_AIF1_LRCK_DIV 6 -#define SUN8I_AIF1CLK_CTRL_AIF1_LRCK_DIV_16 (1 << 6) #define SUN8I_AIF1CLK_CTRL_AIF1_WORD_SIZ 4 #define SUN8I_AIF1CLK_CTRL_AIF1_WORD_SIZ_16 (1 << 4) #define SUN8I_AIF1CLK_CTRL_AIF1_DATA_FMT 2 @@ -300,12 +300,23 @@ static u8 sun8i_codec_get_bclk_div(struct sun8i_codec *scodec, return best_val; } +static int sun8i_codec_get_lrck_div(unsigned int channels, + unsigned int word_size) +{ + unsigned int div = word_size * channels; + + if (div < 16 || div > 256) + return -EINVAL; + + return ilog2(div) - 4; +} + static int sun8i_codec_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai) { struct sun8i_codec *scodec = snd_soc_component_get_drvdata(dai->component); - int sample_rate; + int sample_rate, lrck_div; u8 bclk_div; /* @@ -321,9 +332,14 @@ static int sun8i_codec_hw_params(struct snd_pcm_substream *substream, SUN8I_AIF1CLK_CTRL_AIF1_BCLK_DIV_MASK, bclk_div << SUN8I_AIF1CLK_CTRL_AIF1_BCLK_DIV); + lrck_div = sun8i_codec_get_lrck_div(params_channels(params), + params_physical_width(params)); + if (lrck_div < 0) + return lrck_div; + regmap_update_bits(scodec->regmap, SUN8I_AIF1CLK_CTRL, SUN8I_AIF1CLK_CTRL_AIF1_LRCK_DIV_MASK, - SUN8I_AIF1CLK_CTRL_AIF1_LRCK_DIV_16); + lrck_div << SUN8I_AIF1CLK_CTRL_AIF1_LRCK_DIV); sample_rate = sun8i_codec_get_hw_rate(params); if (sample_rate < 0) diff --git a/sound/soc/tegra/tegra_sgtl5000.c b/sound/soc/tegra/tegra_sgtl5000.c index 45a4aa9d2a47..901457da25ec 100644 --- a/sound/soc/tegra/tegra_sgtl5000.c +++ b/sound/soc/tegra/tegra_sgtl5000.c @@ -149,14 +149,14 @@ static int tegra_sgtl5000_driver_probe(struct platform_device *pdev) dev_err(&pdev->dev, "Property 'nvidia,i2s-controller' missing/invalid\n"); ret = -EINVAL; - goto err; + goto err_put_codec_of_node; } tegra_sgtl5000_dai.platform_of_node = tegra_sgtl5000_dai.cpu_of_node; ret = tegra_asoc_utils_init(&machine->util_data, &pdev->dev); if (ret) - goto err; + goto err_put_cpu_of_node; ret = snd_soc_register_card(card); if (ret) { @@ -169,6 +169,13 @@ static int tegra_sgtl5000_driver_probe(struct platform_device *pdev) err_fini_utils: tegra_asoc_utils_fini(&machine->util_data); +err_put_cpu_of_node: + of_node_put(tegra_sgtl5000_dai.cpu_of_node); + tegra_sgtl5000_dai.cpu_of_node = NULL; + tegra_sgtl5000_dai.platform_of_node = NULL; +err_put_codec_of_node: + of_node_put(tegra_sgtl5000_dai.codec_of_node); + tegra_sgtl5000_dai.codec_of_node = NULL; err: return ret; } @@ -183,6 +190,12 @@ static int tegra_sgtl5000_driver_remove(struct platform_device *pdev) tegra_asoc_utils_fini(&machine->util_data); + of_node_put(tegra_sgtl5000_dai.cpu_of_node); + tegra_sgtl5000_dai.cpu_of_node = NULL; + tegra_sgtl5000_dai.platform_of_node = NULL; + of_node_put(tegra_sgtl5000_dai.codec_of_node); + tegra_sgtl5000_dai.codec_of_node = NULL; + return ret; } diff --git a/sound/soc/txx9/txx9aclc-ac97.c b/sound/soc/txx9/txx9aclc-ac97.c index e2ad00e3cae1..1cfca698ae4b 100644 --- a/sound/soc/txx9/txx9aclc-ac97.c +++ b/sound/soc/txx9/txx9aclc-ac97.c @@ -208,13 +208,12 @@ static int txx9aclc_ac97_dev_probe(struct platform_device *pdev) if (err < 0) return err; - return snd_soc_register_component(&pdev->dev, &txx9aclc_ac97_component, + return devm_snd_soc_register_component(&pdev->dev, &txx9aclc_ac97_component, &txx9aclc_ac97_dai, 1); } static int txx9aclc_ac97_dev_remove(struct platform_device *pdev) { - snd_soc_unregister_component(&pdev->dev); snd_soc_set_ac97_ops(NULL); return 0; } diff --git a/sound/usb/caiaq/device.c b/sound/usb/caiaq/device.c index d55ca48de3ea..f4a72e39ffa9 100644 --- a/sound/usb/caiaq/device.c +++ b/sound/usb/caiaq/device.c @@ -200,6 +200,7 @@ static void usb_ep1_command_reply_dispatch (struct urb* urb) break; } #ifdef CONFIG_SND_USB_CAIAQ_INPUT + /* fall through */ case EP1_CMD_READ_ERP: case EP1_CMD_READ_ANALOG: snd_usb_caiaq_input_dispatch(cdev, buf, urb->actual_length); diff --git a/sound/usb/midi.c b/sound/usb/midi.c index dcfc546d81b9..b737f0ec77d0 100644 --- a/sound/usb/midi.c +++ b/sound/usb/midi.c @@ -1175,8 +1175,7 @@ static void snd_usbmidi_output_trigger(struct snd_rawmidi_substream *substream, if (port->ep->umidi->disconnected) { /* gobble up remaining bytes to prevent wait in * snd_rawmidi_drain_output */ - while (!snd_rawmidi_transmit_empty(substream)) - snd_rawmidi_transmit_ack(substream, 1); + snd_rawmidi_proceed(substream); return; } tasklet_schedule(&port->ep->tasklet); diff --git a/sound/usb/mixer_quirks.c b/sound/usb/mixer_quirks.c index cbfb48bdea51..85ae0ff2382a 100644 --- a/sound/usb/mixer_quirks.c +++ b/sound/usb/mixer_quirks.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -1817,6 +1818,380 @@ static int dell_dock_mixer_init(struct usb_mixer_interface *mixer) return 0; } +/* RME Class Compliant device quirks */ + +#define SND_RME_GET_STATUS1 23 +#define SND_RME_GET_CURRENT_FREQ 17 +#define SND_RME_CLK_SYSTEM_SHIFT 16 +#define SND_RME_CLK_SYSTEM_MASK 0x1f +#define SND_RME_CLK_AES_SHIFT 8 +#define SND_RME_CLK_SPDIF_SHIFT 12 +#define SND_RME_CLK_AES_SPDIF_MASK 0xf +#define SND_RME_CLK_SYNC_SHIFT 6 +#define SND_RME_CLK_SYNC_MASK 0x3 +#define SND_RME_CLK_FREQMUL_SHIFT 18 +#define SND_RME_CLK_FREQMUL_MASK 0x7 +#define SND_RME_CLK_SYSTEM(x) \ + ((x >> SND_RME_CLK_SYSTEM_SHIFT) & SND_RME_CLK_SYSTEM_MASK) +#define SND_RME_CLK_AES(x) \ + ((x >> SND_RME_CLK_AES_SHIFT) & SND_RME_CLK_AES_SPDIF_MASK) +#define SND_RME_CLK_SPDIF(x) \ + ((x >> SND_RME_CLK_SPDIF_SHIFT) & SND_RME_CLK_AES_SPDIF_MASK) +#define SND_RME_CLK_SYNC(x) \ + ((x >> SND_RME_CLK_SYNC_SHIFT) & SND_RME_CLK_SYNC_MASK) +#define SND_RME_CLK_FREQMUL(x) \ + ((x >> SND_RME_CLK_FREQMUL_SHIFT) & SND_RME_CLK_FREQMUL_MASK) +#define SND_RME_CLK_AES_LOCK 0x1 +#define SND_RME_CLK_AES_SYNC 0x4 +#define SND_RME_CLK_SPDIF_LOCK 0x2 +#define SND_RME_CLK_SPDIF_SYNC 0x8 +#define SND_RME_SPDIF_IF_SHIFT 4 +#define SND_RME_SPDIF_FORMAT_SHIFT 5 +#define SND_RME_BINARY_MASK 0x1 +#define SND_RME_SPDIF_IF(x) \ + ((x >> SND_RME_SPDIF_IF_SHIFT) & SND_RME_BINARY_MASK) +#define SND_RME_SPDIF_FORMAT(x) \ + ((x >> SND_RME_SPDIF_FORMAT_SHIFT) & SND_RME_BINARY_MASK) + +static const u32 snd_rme_rate_table[] = { + 32000, 44100, 48000, 50000, + 64000, 88200, 96000, 100000, + 128000, 176400, 192000, 200000, + 256000, 352800, 384000, 400000, + 512000, 705600, 768000, 800000 +}; +/* maximum number of items for AES and S/PDIF rates for above table */ +#define SND_RME_RATE_IDX_AES_SPDIF_NUM 12 + +enum snd_rme_domain { + SND_RME_DOMAIN_SYSTEM, + SND_RME_DOMAIN_AES, + SND_RME_DOMAIN_SPDIF +}; + +enum snd_rme_clock_status { + SND_RME_CLOCK_NOLOCK, + SND_RME_CLOCK_LOCK, + SND_RME_CLOCK_SYNC +}; + +static int snd_rme_read_value(struct snd_usb_audio *chip, + unsigned int item, + u32 *value) +{ + struct usb_device *dev = chip->dev; + int err; + + err = snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), + item, + USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, + 0, 0, + value, sizeof(*value)); + if (err < 0) + dev_err(&dev->dev, + "unable to issue vendor read request %d (ret = %d)", + item, err); + return err; +} + +static int snd_rme_get_status1(struct snd_kcontrol *kcontrol, + u32 *status1) +{ + struct usb_mixer_elem_list *list = snd_kcontrol_chip(kcontrol); + struct snd_usb_audio *chip = list->mixer->chip; + int err; + + err = snd_usb_lock_shutdown(chip); + if (err < 0) + return err; + err = snd_rme_read_value(chip, SND_RME_GET_STATUS1, status1); + snd_usb_unlock_shutdown(chip); + return err; +} + +static int snd_rme_rate_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + u32 status1; + u32 rate = 0; + int idx; + int err; + + err = snd_rme_get_status1(kcontrol, &status1); + if (err < 0) + return err; + switch (kcontrol->private_value) { + case SND_RME_DOMAIN_SYSTEM: + idx = SND_RME_CLK_SYSTEM(status1); + if (idx < ARRAY_SIZE(snd_rme_rate_table)) + rate = snd_rme_rate_table[idx]; + break; + case SND_RME_DOMAIN_AES: + idx = SND_RME_CLK_AES(status1); + if (idx < SND_RME_RATE_IDX_AES_SPDIF_NUM) + rate = snd_rme_rate_table[idx]; + break; + case SND_RME_DOMAIN_SPDIF: + idx = SND_RME_CLK_SPDIF(status1); + if (idx < SND_RME_RATE_IDX_AES_SPDIF_NUM) + rate = snd_rme_rate_table[idx]; + break; + default: + return -EINVAL; + } + ucontrol->value.integer.value[0] = rate; + return 0; +} + +static int snd_rme_sync_state_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + u32 status1; + int idx = SND_RME_CLOCK_NOLOCK; + int err; + + err = snd_rme_get_status1(kcontrol, &status1); + if (err < 0) + return err; + switch (kcontrol->private_value) { + case SND_RME_DOMAIN_AES: /* AES */ + if (status1 & SND_RME_CLK_AES_SYNC) + idx = SND_RME_CLOCK_SYNC; + else if (status1 & SND_RME_CLK_AES_LOCK) + idx = SND_RME_CLOCK_LOCK; + break; + case SND_RME_DOMAIN_SPDIF: /* SPDIF */ + if (status1 & SND_RME_CLK_SPDIF_SYNC) + idx = SND_RME_CLOCK_SYNC; + else if (status1 & SND_RME_CLK_SPDIF_LOCK) + idx = SND_RME_CLOCK_LOCK; + break; + default: + return -EINVAL; + } + ucontrol->value.enumerated.item[0] = idx; + return 0; +} + +static int snd_rme_spdif_if_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + u32 status1; + int err; + + err = snd_rme_get_status1(kcontrol, &status1); + if (err < 0) + return err; + ucontrol->value.enumerated.item[0] = SND_RME_SPDIF_IF(status1); + return 0; +} + +static int snd_rme_spdif_format_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + u32 status1; + int err; + + err = snd_rme_get_status1(kcontrol, &status1); + if (err < 0) + return err; + ucontrol->value.enumerated.item[0] = SND_RME_SPDIF_FORMAT(status1); + return 0; +} + +static int snd_rme_sync_source_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + u32 status1; + int err; + + err = snd_rme_get_status1(kcontrol, &status1); + if (err < 0) + return err; + ucontrol->value.enumerated.item[0] = SND_RME_CLK_SYNC(status1); + return 0; +} + +static int snd_rme_current_freq_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct usb_mixer_elem_list *list = snd_kcontrol_chip(kcontrol); + struct snd_usb_audio *chip = list->mixer->chip; + u32 status1; + const u64 num = 104857600000000ULL; + u32 den; + unsigned int freq; + int err; + + err = snd_usb_lock_shutdown(chip); + if (err < 0) + return err; + err = snd_rme_read_value(chip, SND_RME_GET_STATUS1, &status1); + if (err < 0) + goto end; + err = snd_rme_read_value(chip, SND_RME_GET_CURRENT_FREQ, &den); + if (err < 0) + goto end; + freq = (den == 0) ? 0 : div64_u64(num, den); + freq <<= SND_RME_CLK_FREQMUL(status1); + ucontrol->value.integer.value[0] = freq; + +end: + snd_usb_unlock_shutdown(chip); + return err; +} + +static int snd_rme_rate_info(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; + uinfo->count = 1; + switch (kcontrol->private_value) { + case SND_RME_DOMAIN_SYSTEM: + uinfo->value.integer.min = 32000; + uinfo->value.integer.max = 800000; + break; + case SND_RME_DOMAIN_AES: + case SND_RME_DOMAIN_SPDIF: + default: + uinfo->value.integer.min = 0; + uinfo->value.integer.max = 200000; + } + uinfo->value.integer.step = 0; + return 0; +} + +static int snd_rme_sync_state_info(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + static const char *const sync_states[] = { + "No Lock", "Lock", "Sync" + }; + + return snd_ctl_enum_info(uinfo, 1, + ARRAY_SIZE(sync_states), sync_states); +} + +static int snd_rme_spdif_if_info(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + static const char *const spdif_if[] = { + "Coaxial", "Optical" + }; + + return snd_ctl_enum_info(uinfo, 1, + ARRAY_SIZE(spdif_if), spdif_if); +} + +static int snd_rme_spdif_format_info(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + static const char *const optical_type[] = { + "Consumer", "Professional" + }; + + return snd_ctl_enum_info(uinfo, 1, + ARRAY_SIZE(optical_type), optical_type); +} + +static int snd_rme_sync_source_info(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + static const char *const sync_sources[] = { + "Internal", "AES", "SPDIF", "Internal" + }; + + return snd_ctl_enum_info(uinfo, 1, + ARRAY_SIZE(sync_sources), sync_sources); +} + +static struct snd_kcontrol_new snd_rme_controls[] = { + { + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, + .name = "AES Rate", + .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, + .info = snd_rme_rate_info, + .get = snd_rme_rate_get, + .private_value = SND_RME_DOMAIN_AES + }, + { + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, + .name = "AES Sync", + .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, + .info = snd_rme_sync_state_info, + .get = snd_rme_sync_state_get, + .private_value = SND_RME_DOMAIN_AES + }, + { + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, + .name = "SPDIF Rate", + .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, + .info = snd_rme_rate_info, + .get = snd_rme_rate_get, + .private_value = SND_RME_DOMAIN_SPDIF + }, + { + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, + .name = "SPDIF Sync", + .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, + .info = snd_rme_sync_state_info, + .get = snd_rme_sync_state_get, + .private_value = SND_RME_DOMAIN_SPDIF + }, + { + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, + .name = "SPDIF Interface", + .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, + .info = snd_rme_spdif_if_info, + .get = snd_rme_spdif_if_get, + }, + { + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, + .name = "SPDIF Format", + .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, + .info = snd_rme_spdif_format_info, + .get = snd_rme_spdif_format_get, + }, + { + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, + .name = "Sync Source", + .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, + .info = snd_rme_sync_source_info, + .get = snd_rme_sync_source_get + }, + { + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, + .name = "System Rate", + .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, + .info = snd_rme_rate_info, + .get = snd_rme_rate_get, + .private_value = SND_RME_DOMAIN_SYSTEM + }, + { + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, + .name = "Current Frequency", + .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, + .info = snd_rme_rate_info, + .get = snd_rme_current_freq_get + } +}; + +static int snd_rme_controls_create(struct usb_mixer_interface *mixer) +{ + int err, i; + + for (i = 0; i < ARRAY_SIZE(snd_rme_controls); ++i) { + err = add_single_ctl_with_resume(mixer, 0, + NULL, + &snd_rme_controls[i], + NULL); + if (err < 0) + return err; + } + + return 0; +} + int snd_usb_mixer_apply_create_quirk(struct usb_mixer_interface *mixer) { int err = 0; @@ -1904,6 +2279,12 @@ int snd_usb_mixer_apply_create_quirk(struct usb_mixer_interface *mixer) case USB_ID(0x0bda, 0x4014): /* Dell WD15 dock */ err = dell_dock_mixer_init(mixer); break; + + case USB_ID(0x2a39, 0x3fd2): /* RME ADI-2 Pro */ + case USB_ID(0x2a39, 0x3fd3): /* RME ADI-2 DAC */ + case USB_ID(0x2a39, 0x3fd4): /* RME */ + err = snd_rme_controls_create(mixer); + break; } return err; diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index 08aa78007020..849953e5775c 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -3346,19 +3346,14 @@ AU0828_DEVICE(0x2040, 0x7270, "Hauppauge", "HVR-950Q"), .ifnum = 0, .type = QUIRK_AUDIO_STANDARD_MIXER, }, - /* Capture */ - { - .ifnum = 1, - .type = QUIRK_IGNORE_INTERFACE, - }, /* Playback */ { - .ifnum = 2, + .ifnum = 1, .type = QUIRK_AUDIO_FIXED_ENDPOINT, .data = &(const struct audioformat) { .formats = SNDRV_PCM_FMTBIT_S16_LE, .channels = 2, - .iface = 2, + .iface = 1, .altsetting = 1, .altset_idx = 1, .attributes = UAC_EP_CS_ATTR_FILL_MAX | diff --git a/sound/x86/intel_hdmi_audio.c b/sound/x86/intel_hdmi_audio.c index fa7dca5a68c8..83d76c345940 100644 --- a/sound/x86/intel_hdmi_audio.c +++ b/sound/x86/intel_hdmi_audio.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -1141,8 +1140,7 @@ static int had_pcm_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *hw_params) { struct snd_intelhad *intelhaddata; - unsigned long addr; - int pages, buf_size, retval; + int buf_size, retval; intelhaddata = snd_pcm_substream_chip(substream); buf_size = params_buffer_bytes(hw_params); @@ -1151,17 +1149,6 @@ static int had_pcm_hw_params(struct snd_pcm_substream *substream, return retval; dev_dbg(intelhaddata->dev, "%s:allocated memory = %d\n", __func__, buf_size); - /* mark the pages as uncached region */ - addr = (unsigned long) substream->runtime->dma_area; - pages = (substream->runtime->dma_bytes + PAGE_SIZE - 1) / PAGE_SIZE; - retval = set_memory_uc(addr, pages); - if (retval) { - dev_err(intelhaddata->dev, "set_memory_uc failed.Error:%d\n", - retval); - return retval; - } - memset(substream->runtime->dma_area, 0, buf_size); - return retval; } @@ -1171,21 +1158,11 @@ static int had_pcm_hw_params(struct snd_pcm_substream *substream, static int had_pcm_hw_free(struct snd_pcm_substream *substream) { struct snd_intelhad *intelhaddata; - unsigned long addr; - u32 pages; intelhaddata = snd_pcm_substream_chip(substream); had_do_reset(intelhaddata); - /* mark back the pages as cached/writeback region before the free */ - if (substream->runtime->dma_area != NULL) { - addr = (unsigned long) substream->runtime->dma_area; - pages = (substream->runtime->dma_bytes + PAGE_SIZE - 1) / - PAGE_SIZE; - set_memory_wb(addr, pages); - return snd_pcm_lib_free_pages(substream); - } - return 0; + return snd_pcm_lib_free_pages(substream); } /* @@ -1860,7 +1837,7 @@ static int hdmi_lpe_audio_probe(struct platform_device *pdev) * try to allocate 600k buffer as default which is large enough */ snd_pcm_lib_preallocate_pages_for_all(pcm, - SNDRV_DMA_TYPE_DEV, NULL, + SNDRV_DMA_TYPE_DEV_UC, NULL, HAD_DEFAULT_BUFFER, HAD_MAX_BUFFER); /* create controls */ diff --git a/sound/xen/xen_snd_front_alsa.c b/sound/xen/xen_snd_front_alsa.c index 129180e17db1..2cbd9679aca1 100644 --- a/sound/xen/xen_snd_front_alsa.c +++ b/sound/xen/xen_snd_front_alsa.c @@ -637,31 +637,31 @@ static int alsa_pb_fill_silence(struct snd_pcm_substream *substream, * to know when the buffer can be transferred to the backend. */ -static struct snd_pcm_ops snd_drv_alsa_playback_ops = { - .open = alsa_open, - .close = alsa_close, - .ioctl = snd_pcm_lib_ioctl, - .hw_params = alsa_hw_params, - .hw_free = alsa_hw_free, - .prepare = alsa_prepare, - .trigger = alsa_trigger, - .pointer = alsa_pointer, - .copy_user = alsa_pb_copy_user, - .copy_kernel = alsa_pb_copy_kernel, - .fill_silence = alsa_pb_fill_silence, +static const struct snd_pcm_ops snd_drv_alsa_playback_ops = { + .open = alsa_open, + .close = alsa_close, + .ioctl = snd_pcm_lib_ioctl, + .hw_params = alsa_hw_params, + .hw_free = alsa_hw_free, + .prepare = alsa_prepare, + .trigger = alsa_trigger, + .pointer = alsa_pointer, + .copy_user = alsa_pb_copy_user, + .copy_kernel = alsa_pb_copy_kernel, + .fill_silence = alsa_pb_fill_silence, }; -static struct snd_pcm_ops snd_drv_alsa_capture_ops = { - .open = alsa_open, - .close = alsa_close, - .ioctl = snd_pcm_lib_ioctl, - .hw_params = alsa_hw_params, - .hw_free = alsa_hw_free, - .prepare = alsa_prepare, - .trigger = alsa_trigger, - .pointer = alsa_pointer, - .copy_user = alsa_cap_copy_user, - .copy_kernel = alsa_cap_copy_kernel, +static const struct snd_pcm_ops snd_drv_alsa_capture_ops = { + .open = alsa_open, + .close = alsa_close, + .ioctl = snd_pcm_lib_ioctl, + .hw_params = alsa_hw_params, + .hw_free = alsa_hw_free, + .prepare = alsa_prepare, + .trigger = alsa_trigger, + .pointer = alsa_pointer, + .copy_user = alsa_cap_copy_user, + .copy_kernel = alsa_cap_copy_kernel, }; static int new_pcm_instance(struct xen_snd_front_card_info *card_info, diff --git a/tools/Makefile b/tools/Makefile index be02c8b904db..abb358a70ad0 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -21,6 +21,7 @@ help: @echo ' leds - LEDs tools' @echo ' liblockdep - user-space wrapper for kernel locking-validator' @echo ' bpf - misc BPF tools' + @echo ' pci - PCI tools' @echo ' perf - Linux performance measurement and analysis tool' @echo ' selftests - various kernel selftests' @echo ' spi - spi tools' @@ -59,7 +60,7 @@ acpi: FORCE cpupower: FORCE $(call descend,power/$@) -cgroup firewire hv guest spi usb virtio vm bpf iio gpio objtool leds wmi: FORCE +cgroup firewire hv guest spi usb virtio vm bpf iio gpio objtool leds wmi pci: FORCE $(call descend,$@) liblockdep: FORCE @@ -94,7 +95,7 @@ kvm_stat: FORCE all: acpi cgroup cpupower gpio hv firewire liblockdep \ perf selftests spi turbostat usb \ virtio vm bpf x86_energy_perf_policy \ - tmon freefall iio objtool kvm_stat wmi + tmon freefall iio objtool kvm_stat wmi pci acpi_install: $(call descend,power/$(@:_install=),install) @@ -102,7 +103,7 @@ acpi_install: cpupower_install: $(call descend,power/$(@:_install=),install) -cgroup_install firewire_install gpio_install hv_install iio_install perf_install spi_install usb_install virtio_install vm_install bpf_install objtool_install wmi_install: +cgroup_install firewire_install gpio_install hv_install iio_install perf_install spi_install usb_install virtio_install vm_install bpf_install objtool_install wmi_install pci_install: $(call descend,$(@:_install=),install) liblockdep_install: @@ -128,7 +129,7 @@ install: acpi_install cgroup_install cpupower_install gpio_install \ perf_install selftests_install turbostat_install usb_install \ virtio_install vm_install bpf_install x86_energy_perf_policy_install \ tmon_install freefall_install objtool_install kvm_stat_install \ - wmi_install + wmi_install pci_install acpi_clean: $(call descend,power/acpi,clean) @@ -136,7 +137,7 @@ acpi_clean: cpupower_clean: $(call descend,power/cpupower,clean) -cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean: +cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean: $(call descend,$(@:_clean=),clean) liblockdep_clean: @@ -174,6 +175,6 @@ clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \ perf_clean selftests_clean turbostat_clean spi_clean usb_clean virtio_clean \ vm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \ - gpio_clean objtool_clean leds_clean wmi_clean + gpio_clean objtool_clean leds_clean wmi_clean pci_clean .PHONY: FORCE diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 9f420d98b5fb..8cb504d30384 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -203,6 +203,8 @@ static void print_delayacct(struct taskstats *t) "SWAP %15s%15s%15s\n" " %15llu%15llu%15llums\n" "RECLAIM %12s%15s%15s\n" + " %15llu%15llu%15llums\n" + "THRASHING%12s%15s%15s\n" " %15llu%15llu%15llums\n", "count", "real total", "virtual total", "delay total", "delay average", @@ -222,7 +224,11 @@ static void print_delayacct(struct taskstats *t) "count", "delay total", "delay average", (unsigned long long)t->freepages_count, (unsigned long long)t->freepages_delay_total, - average_ms(t->freepages_delay_total, t->freepages_count)); + average_ms(t->freepages_delay_total, t->freepages_count), + "count", "delay total", "delay average", + (unsigned long long)t->thrashing_count, + (unsigned long long)t->thrashing_delay_total, + average_ms(t->thrashing_delay_total, t->thrashing_count)); } static void task_context_switch_counts(struct taskstats *t) diff --git a/tools/arch/arm64/include/asm/barrier.h b/tools/arch/arm64/include/asm/barrier.h index 40bde6b23501..378c051fa177 100644 --- a/tools/arch/arm64/include/asm/barrier.h +++ b/tools/arch/arm64/include/asm/barrier.h @@ -14,4 +14,75 @@ #define wmb() asm volatile("dmb ishst" ::: "memory") #define rmb() asm volatile("dmb ishld" ::: "memory") +#define smp_store_release(p, v) \ +do { \ + union { typeof(*p) __val; char __c[1]; } __u = \ + { .__val = (v) }; \ + \ + switch (sizeof(*p)) { \ + case 1: \ + asm volatile ("stlrb %w1, %0" \ + : "=Q" (*p) \ + : "r" (*(__u8_alias_t *)__u.__c) \ + : "memory"); \ + break; \ + case 2: \ + asm volatile ("stlrh %w1, %0" \ + : "=Q" (*p) \ + : "r" (*(__u16_alias_t *)__u.__c) \ + : "memory"); \ + break; \ + case 4: \ + asm volatile ("stlr %w1, %0" \ + : "=Q" (*p) \ + : "r" (*(__u32_alias_t *)__u.__c) \ + : "memory"); \ + break; \ + case 8: \ + asm volatile ("stlr %1, %0" \ + : "=Q" (*p) \ + : "r" (*(__u64_alias_t *)__u.__c) \ + : "memory"); \ + break; \ + default: \ + /* Only to shut up gcc ... */ \ + mb(); \ + break; \ + } \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + union { typeof(*p) __val; char __c[1]; } __u = \ + { .__c = { 0 } }; \ + \ + switch (sizeof(*p)) { \ + case 1: \ + asm volatile ("ldarb %w0, %1" \ + : "=r" (*(__u8_alias_t *)__u.__c) \ + : "Q" (*p) : "memory"); \ + break; \ + case 2: \ + asm volatile ("ldarh %w0, %1" \ + : "=r" (*(__u16_alias_t *)__u.__c) \ + : "Q" (*p) : "memory"); \ + break; \ + case 4: \ + asm volatile ("ldar %w0, %1" \ + : "=r" (*(__u32_alias_t *)__u.__c) \ + : "Q" (*p) : "memory"); \ + break; \ + case 8: \ + asm volatile ("ldar %0, %1" \ + : "=r" (*(__u64_alias_t *)__u.__c) \ + : "Q" (*p) : "memory"); \ + break; \ + default: \ + /* Only to shut up gcc ... */ \ + mb(); \ + break; \ + } \ + __u.__val; \ +}) + #endif /* _TOOLS_LINUX_ASM_AARCH64_BARRIER_H */ diff --git a/tools/arch/arm64/include/uapi/asm/unistd.h b/tools/arch/arm64/include/uapi/asm/unistd.h index 5072cbd15c82..dae1584cf017 100644 --- a/tools/arch/arm64/include/uapi/asm/unistd.h +++ b/tools/arch/arm64/include/uapi/asm/unistd.h @@ -16,5 +16,6 @@ */ #define __ARCH_WANT_RENAMEAT +#define __ARCH_WANT_NEW_STAT #include diff --git a/tools/arch/ia64/include/asm/barrier.h b/tools/arch/ia64/include/asm/barrier.h index d808ee0e77b5..4d471d9511a5 100644 --- a/tools/arch/ia64/include/asm/barrier.h +++ b/tools/arch/ia64/include/asm/barrier.h @@ -46,4 +46,17 @@ #define rmb() mb() #define wmb() mb() +#define smp_store_release(p, v) \ +do { \ + barrier(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + barrier(); \ + ___p1; \ +}) + #endif /* _TOOLS_LINUX_ASM_IA64_BARRIER_H */ diff --git a/tools/arch/powerpc/include/asm/barrier.h b/tools/arch/powerpc/include/asm/barrier.h index a634da05bc97..905a2c66d96d 100644 --- a/tools/arch/powerpc/include/asm/barrier.h +++ b/tools/arch/powerpc/include/asm/barrier.h @@ -27,4 +27,20 @@ #define rmb() __asm__ __volatile__ ("sync" : : : "memory") #define wmb() __asm__ __volatile__ ("sync" : : : "memory") +#if defined(__powerpc64__) +#define smp_lwsync() __asm__ __volatile__ ("lwsync" : : : "memory") + +#define smp_store_release(p, v) \ +do { \ + smp_lwsync(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + smp_lwsync(); \ + ___p1; \ +}) +#endif /* defined(__powerpc64__) */ #endif /* _TOOLS_LINUX_ASM_POWERPC_BARRIER_H */ diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h index 1b32b56a03d3..8c876c166ef2 100644 --- a/tools/arch/powerpc/include/uapi/asm/kvm.h +++ b/tools/arch/powerpc/include/uapi/asm/kvm.h @@ -634,6 +634,7 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe) #define KVM_REG_PPC_ONLINE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf) +#define KVM_REG_PPC_PTCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0) /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs diff --git a/tools/arch/s390/include/asm/barrier.h b/tools/arch/s390/include/asm/barrier.h index 5030c99f47d2..de362fa664d4 100644 --- a/tools/arch/s390/include/asm/barrier.h +++ b/tools/arch/s390/include/asm/barrier.h @@ -28,4 +28,17 @@ #define rmb() mb() #define wmb() mb() +#define smp_store_release(p, v) \ +do { \ + barrier(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + barrier(); \ + ___p1; \ +}) + #endif /* __TOOLS_LIB_ASM_BARRIER_H */ diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index 9a50f02b9894..16511d97e8dc 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -160,6 +160,8 @@ struct kvm_s390_vm_cpu_subfunc { #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1 #define KVM_S390_VM_CRYPTO_DISABLE_AES_KW 2 #define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW 3 +#define KVM_S390_VM_CRYPTO_ENABLE_APIE 4 +#define KVM_S390_VM_CRYPTO_DISABLE_APIE 5 /* kvm attributes for migration mode */ #define KVM_S390_VM_MIGRATION_STOP 0 diff --git a/tools/arch/sparc/include/asm/barrier_64.h b/tools/arch/sparc/include/asm/barrier_64.h index ba61344287d5..cfb0fdc8ccf0 100644 --- a/tools/arch/sparc/include/asm/barrier_64.h +++ b/tools/arch/sparc/include/asm/barrier_64.h @@ -40,4 +40,17 @@ do { __asm__ __volatile__("ba,pt %%xcc, 1f\n\t" \ #define rmb() __asm__ __volatile__("":::"memory") #define wmb() __asm__ __volatile__("":::"memory") +#define smp_store_release(p, v) \ +do { \ + barrier(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + barrier(); \ + ___p1; \ +}) + #endif /* !(__TOOLS_LINUX_SPARC64_BARRIER_H) */ diff --git a/tools/arch/x86/include/asm/barrier.h b/tools/arch/x86/include/asm/barrier.h index 8774dee27471..58919868473c 100644 --- a/tools/arch/x86/include/asm/barrier.h +++ b/tools/arch/x86/include/asm/barrier.h @@ -26,4 +26,18 @@ #define wmb() asm volatile("sfence" ::: "memory") #endif +#if defined(__x86_64__) +#define smp_store_release(p, v) \ +do { \ + barrier(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + barrier(); \ + ___p1; \ +}) +#endif /* defined(__x86_64__) */ #endif /* _TOOLS_LINUX_ASM_X86_BARRIER_H */ diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 86299efa804a..dabfcf7c3941 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -288,6 +288,7 @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 #define KVM_VCPUEVENT_VALID_SMM 0x00000008 +#define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -299,7 +300,7 @@ struct kvm_vcpu_events { __u8 injected; __u8 nr; __u8 has_error_code; - __u8 pad; + __u8 pending; __u32 error_code; } exception; struct { @@ -322,7 +323,9 @@ struct kvm_vcpu_events { __u8 smm_inside_nmi; __u8 latched_init; } smi; - __u32 reserved[9]; + __u8 reserved[27]; + __u8 exception_has_payload; + __u64 exception_payload; }; /* for KVM_GET/SET_DEBUGREGS */ @@ -377,9 +380,11 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 +#define KVM_STATE_NESTED_EVMCS 0x00000004 #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_SMM_VMXON 0x00000002 diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index a6258bc8ec4f..f55a2daed59b 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -15,13 +15,15 @@ SYNOPSIS *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } } *COMMANDS* := - { **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete** - | **pin** | **help** } + { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** + | **delete** | **pin** | **help** } MAP COMMANDS ============= | **bpftool** **map { show | list }** [*MAP*] +| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ +| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*] | **bpftool** **map dump** *MAP* | **bpftool** **map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*] | **bpftool** **map lookup** *MAP* **key** *DATA* @@ -36,6 +38,11 @@ MAP COMMANDS | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } | *VALUE* := { *DATA* | *MAP* | *PROG* } | *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } +| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** +| | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash** +| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** +| | **devmap** | **sockmap** | **cpumap** | **xskmap** | **sockhash** +| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** } DESCRIPTION =========== @@ -47,6 +54,10 @@ DESCRIPTION Output will start with map ID followed by map type and zero or more named attributes (depending on kernel version). + **bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*] + Create a new map with given parameters and pin it to *bpffs* + as *FILE*. + **bpftool map dump** *MAP* Dump all entries in a given *MAP*. @@ -75,7 +86,9 @@ DESCRIPTION **bpftool map pin** *MAP* *FILE* Pin map *MAP* as *FILE*. - Note: *FILE* must be located in *bpffs* mount. + Note: *FILE* must be located in *bpffs* mount. It must not + contain a dot character ('.'), which is reserved for future + extensions of *bpffs*. **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] Read events from a BPF_MAP_TYPE_PERF_EVENT_ARRAY map. diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst new file mode 100644 index 000000000000..408ec30d8872 --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -0,0 +1,139 @@ +================ +bpftool-net +================ +------------------------------------------------------------------------------- +tool for inspection of netdev/tc related bpf prog attachments +------------------------------------------------------------------------------- + +:Manual section: 8 + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **net** *COMMAND* + + *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] } + + *COMMANDS* := + { **show** | **list** } [ **dev** name ] | **help** + +NET COMMANDS +============ + +| **bpftool** **net { show | list } [ dev name ]** +| **bpftool** **net help** + +DESCRIPTION +=========== + **bpftool net { show | list } [ dev name ]** + List bpf program attachments in the kernel networking subsystem. + + Currently, only device driver xdp attachments and tc filter + classification/action attachments are implemented, i.e., for + program types **BPF_PROG_TYPE_SCHED_CLS**, + **BPF_PROG_TYPE_SCHED_ACT** and **BPF_PROG_TYPE_XDP**. + For programs attached to a particular cgroup, e.g., + **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**, + **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + users can use **bpftool cgroup** to dump cgroup attachments. + For sk_{filter, skb, msg, reuseport} and lwt/seg6 + bpf programs, users should consult other tools, e.g., iproute2. + + The current output will start with all xdp program attachments, followed by + all tc class/qdisc bpf program attachments. Both xdp programs and + tc programs are ordered based on ifindex number. If multiple bpf + programs attached to the same networking device through **tc filter**, + the order will be first all bpf programs attached to tc classes, then + all bpf programs attached to non clsact qdiscs, and finally all + bpf programs attached to root and clsact qdisc. + + **bpftool net help** + Print short help message. + +OPTIONS +======= + -h, --help + Print short generic help message (similar to **bpftool help**). + + -v, --version + Print version number (similar to **bpftool version**). + + -j, --json + Generate JSON output. For commands that cannot produce JSON, this + option has no effect. + + -p, --pretty + Generate human-readable JSON output. Implies **-j**. + +EXAMPLES +======== + +| **# bpftool net** + +:: + + xdp: + eth0(2) driver id 198 + + tc: + eth0(2) htb name prefix_matcher.o:[cls_prefix_matcher_htb] id 111727 act [] + eth0(2) clsact/ingress fbflow_icmp id 130246 act [] + eth0(2) clsact/egress prefix_matcher.o:[cls_prefix_matcher_clsact] id 111726 + eth0(2) clsact/egress cls_fg_dscp id 108619 act [] + eth0(2) clsact/egress fbflow_egress id 130245 + +| +| **# bpftool -jp net** + +:: + + [{ + "xdp": [{ + "devname": "eth0", + "ifindex": 2, + "mode": "driver", + "id": 198 + } + ], + "tc": [{ + "devname": "eth0", + "ifindex": 2, + "kind": "htb", + "name": "prefix_matcher.o:[cls_prefix_matcher_htb]", + "id": 111727, + "act": [] + },{ + "devname": "eth0", + "ifindex": 2, + "kind": "clsact/ingress", + "name": "fbflow_icmp", + "id": 130246, + "act": [] + },{ + "devname": "eth0", + "ifindex": 2, + "kind": "clsact/egress", + "name": "prefix_matcher.o:[cls_prefix_matcher_clsact]", + "id": 111726, + },{ + "devname": "eth0", + "ifindex": 2, + "kind": "clsact/egress", + "name": "cls_fg_dscp", + "id": 108619, + "act": [] + },{ + "devname": "eth0", + "ifindex": 2, + "kind": "clsact/egress", + "name": "fbflow_egress", + "id": 130245, + } + ] + } + ] + + +SEE ALSO +======== + **bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 64156a16d530..ac4e904b10fb 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -25,6 +25,8 @@ MAP COMMANDS | **bpftool** **prog dump jited** *PROG* [{**file** *FILE* | **opcodes**}] | **bpftool** **prog pin** *PROG* *FILE* | **bpftool** **prog load** *OBJ* *FILE* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*] +| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* *MAP* +| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* *MAP* | **bpftool** **prog help** | | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } @@ -37,6 +39,7 @@ MAP COMMANDS | **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | | **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | } +| *ATTACH_TYPE* := { **msg_verdict** | **skb_verdict** | **skb_parse** } DESCRIPTION @@ -72,7 +75,9 @@ DESCRIPTION **bpftool prog pin** *PROG* *FILE* Pin program *PROG* as *FILE*. - Note: *FILE* must be located in *bpffs* mount. + Note: *FILE* must be located in *bpffs* mount. It must not + contain a dot character ('.'), which is reserved for future + extensions of *bpffs*. **bpftool prog load** *OBJ* *FILE* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*] Load bpf program from binary *OBJ* and pin as *FILE*. @@ -88,7 +93,17 @@ DESCRIPTION If **dev** *NAME* is specified program will be loaded onto given networking device (offload). - Note: *FILE* must be located in *bpffs* mount. + Note: *FILE* must be located in *bpffs* mount. It must not + contain a dot character ('.'), which is reserved for future + extensions of *bpffs*. + + **bpftool prog attach** *PROG* *ATTACH_TYPE* *MAP* + Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*) + to the map *MAP*. + + **bpftool prog detach** *PROG* *ATTACH_TYPE* *MAP* + Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*) + from the map *MAP*. **bpftool prog help** Print short help message. diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index b6f5d560460d..04cd4f92ab89 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -16,22 +16,24 @@ SYNOPSIS **bpftool** **version** - *OBJECT* := { **map** | **program** | **cgroup** | **perf** } + *OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** } *OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** } | { **-j** | **--json** } [{ **-p** | **--pretty** }] } *MAP-COMMANDS* := - { **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete** - | **pin** | **event_pipe** | **help** } + { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** + | **delete** | **pin** | **event_pipe** | **help** } *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** - | **load** | **help** } + | **load** | **attach** | **detach** | **help** } *CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } *PERF-COMMANDS* := { **show** | **list** | **help** } + *NET-COMMANDS* := { **show** | **list** | **help** } + DESCRIPTION =========== *bpftool* allows for inspection and simple modification of BPF objects @@ -55,7 +57,11 @@ OPTIONS -p, --pretty Generate human-readable JSON output. Implies **-j**. + -m, --mapcompat + Allow loading maps with unknown map definitions. + + SEE ALSO ======== **bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8) - **bpftool-perf**\ (8) + **bpftool-perf**\ (8), **bpftool-net**\ (8) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 74288a2197ab..dac7eff4c7e5 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -46,6 +46,13 @@ CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \ -I$(srctree)/tools/lib/bpf \ -I$(srctree)/tools/perf CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"' +ifneq ($(EXTRA_CFLAGS),) +CFLAGS += $(EXTRA_CFLAGS) +endif +ifneq ($(EXTRA_LDFLAGS),) +LDFLAGS += $(EXTRA_LDFLAGS) +endif + LIBS = -lelf -lbfd -lopcodes $(LIBBPF) INSTALL ?= install @@ -90,7 +97,7 @@ $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< $(OUTPUT)bpftool: $(OBJS) $(LIBBPF) - $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^ $(LIBS) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS) $(OUTPUT)%.o: %.c $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 598066c40191..3f78e6404589 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -143,7 +143,7 @@ _bpftool_map_update_map_type() local type type=$(bpftool -jp map show $keyword $ref | \ command sed -n 's/.*"type": "\(.*\)",$/\1/p') - printf $type + [[ -n $type ]] && printf $type } _bpftool_map_update_get_id() @@ -184,7 +184,7 @@ _bpftool() # Deal with options if [[ ${words[cword]} == -* ]]; then - local c='--version --json --pretty --bpffs' + local c='--version --json --pretty --bpffs --mapcompat' COMPREPLY=( $( compgen -W "$c" -- "$cur" ) ) return 0 fi @@ -292,6 +292,23 @@ _bpftool() fi return 0 ;; + attach|detach) + if [[ ${#words[@]} == 7 ]]; then + COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) ) + return 0 + fi + + if [[ ${#words[@]} == 6 ]]; then + COMPREPLY=( $( compgen -W "msg_verdict skb_verdict skb_parse" -- "$cur" ) ) + return 0 + fi + + if [[ $prev == "$command" ]]; then + COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) ) + return 0 + fi + return 0 + ;; load) local obj @@ -347,7 +364,7 @@ _bpftool() ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'dump help pin load \ + COMPREPLY=( $( compgen -W 'dump help pin attach detach load \ show list' -- "$cur" ) ) ;; esac @@ -370,6 +387,42 @@ _bpftool() ;; esac ;; + create) + case $prev in + $command) + _filedir + return 0 + ;; + type) + COMPREPLY=( $( compgen -W 'hash array prog_array \ + perf_event_array percpu_hash percpu_array \ + stack_trace cgroup_array lru_hash \ + lru_percpu_hash lpm_trie array_of_maps \ + hash_of_maps devmap sockmap cpumap xskmap \ + sockhash cgroup_storage reuseport_sockarray \ + percpu_cgroup_storage' -- \ + "$cur" ) ) + return 0 + ;; + key|value|flags|name|entries) + return 0 + ;; + dev) + _sysfs_get_netdevs + return 0 + ;; + *) + _bpftool_once_attr 'type' + _bpftool_once_attr 'key' + _bpftool_once_attr 'value' + _bpftool_once_attr 'entries' + _bpftool_once_attr 'name' + _bpftool_once_attr 'flags' + _bpftool_once_attr 'dev' + return 0 + ;; + esac + ;; lookup|getnext|delete) case $prev in $command) @@ -483,7 +536,7 @@ _bpftool() *) [[ $prev == $object ]] && \ COMPREPLY=( $( compgen -W 'delete dump getnext help \ - lookup pin event_pipe show list update' -- \ + lookup pin event_pipe show list update create' -- \ "$cur" ) ) ;; esac @@ -494,10 +547,10 @@ _bpftool() _filedir return 0 ;; - tree) - _filedir - return 0 - ;; + tree) + _filedir + return 0 + ;; attach|detach) local ATTACH_TYPES='ingress egress sock_create sock_ops \ device bind4 bind6 post_bind4 post_bind6 connect4 \ @@ -552,6 +605,15 @@ _bpftool() ;; esac ;; + net) + case $command in + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help \ + show list' -- "$cur" ) ) + ;; + esac + ;; esac } && complete -F _bpftool bpftool diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index b3a0709ea7ed..25af85304ebe 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -554,7 +554,9 @@ static int read_sysfs_netdev_hex_int(char *devname, const char *entry_name) return read_sysfs_hex_int(full_path); } -const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino) +const char * +ifindex_to_bfd_params(__u32 ifindex, __u64 ns_dev, __u64 ns_ino, + const char **opt) { char devname[IF_NAMESIZE]; int vendor_id; @@ -579,6 +581,7 @@ const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino) device_id != 0x6000 && device_id != 0x6003) p_info("Unknown NFP device ID, assuming it is NFP-6xxx arch"); + *opt = "ctx4"; return "NFP-6xxx"; default: p_err("Can't get bfd arch name for device vendor id 0x%04x", @@ -618,3 +621,24 @@ void print_dev_json(__u32 ifindex, __u64 ns_dev, __u64 ns_inode) jsonw_string_field(json_wtr, "ifname", name); jsonw_end_object(json_wtr); } + +int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what) +{ + char *endptr; + + NEXT_ARGP(); + + if (*val) { + p_err("%s already specified", what); + return -1; + } + + *val = strtoul(**argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as %s", **argv, what); + return -1; + } + NEXT_ARGP(); + + return 0; +} diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c index 87439320ef70..c75ffd9ce2bb 100644 --- a/tools/bpf/bpftool/jit_disasm.c +++ b/tools/bpf/bpftool/jit_disasm.c @@ -77,7 +77,7 @@ static int fprintf_json(void *out, const char *fmt, ...) } void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, - const char *arch) + const char *arch, const char *disassembler_options) { disassembler_ftype disassemble; struct disassemble_info info; @@ -116,6 +116,8 @@ void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, info.arch = bfd_get_arch(bfdf); info.mach = bfd_get_mach(bfdf); + if (disassembler_options) + info.disassembler_options = disassembler_options; info.buffer = image; info.buffer_length = len; diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index d15a62be6cf0..75a3296dc0bc 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -55,6 +55,7 @@ json_writer_t *json_wtr; bool pretty_output; bool json_output; bool show_pinned; +int bpf_flags; struct pinned_obj_table prog_table; struct pinned_obj_table map_table; @@ -85,7 +86,7 @@ static int do_help(int argc, char **argv) " %s batch file FILE\n" " %s version\n" "\n" - " OBJECT := { prog | map | cgroup | perf }\n" + " OBJECT := { prog | map | cgroup | perf | net }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, bin_name, bin_name); @@ -215,6 +216,7 @@ static const struct cmd cmds[] = { { "map", do_map }, { "cgroup", do_cgroup }, { "perf", do_perf }, + { "net", do_net }, { "version", do_version }, { 0 } }; @@ -319,7 +321,8 @@ static int do_batch(int argc, char **argv) p_err("reading batch file failed: %s", strerror(errno)); err = -1; } else { - p_info("processed %d commands", lines); + if (!json_output) + printf("processed %d commands\n", lines); err = 0; } err_close: @@ -340,6 +343,7 @@ int main(int argc, char **argv) { "pretty", no_argument, NULL, 'p' }, { "version", no_argument, NULL, 'V' }, { "bpffs", no_argument, NULL, 'f' }, + { "mapcompat", no_argument, NULL, 'm' }, { 0 } }; int opt, ret; @@ -354,7 +358,7 @@ int main(int argc, char **argv) hash_init(map_table.table); opterr = 0; - while ((opt = getopt_long(argc, argv, "Vhpjf", + while ((opt = getopt_long(argc, argv, "Vhpjfm", options, NULL)) >= 0) { switch (opt) { case 'V': @@ -378,6 +382,9 @@ int main(int argc, char **argv) case 'f': show_pinned = true; break; + case 'm': + bpf_flags = MAPS_RELAX_COMPAT; + break; default: p_err("unrecognized option '%s'", argv[optind - 1]); if (json_output) diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 238e734d75b3..28322ace2856 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -74,7 +74,7 @@ #define HELP_SPEC_PROGRAM \ "PROG := { id PROG_ID | pinned FILE | tag PROG_TAG }" #define HELP_SPEC_OPTIONS \ - "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-f|--bpffs} }" + "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-f|--bpffs} | {-m|--mapcompat}" #define HELP_SPEC_MAP \ "MAP := { id MAP_ID | pinned FILE }" @@ -89,6 +89,7 @@ extern const char *bin_name; extern json_writer_t *json_wtr; extern bool json_output; extern bool show_pinned; +extern int bpf_flags; extern struct pinned_obj_table prog_table; extern struct pinned_obj_table map_table; @@ -136,19 +137,23 @@ int do_map(int argc, char **arg); int do_event_pipe(int argc, char **argv); int do_cgroup(int argc, char **arg); int do_perf(int argc, char **arg); +int do_net(int argc, char **arg); +int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); int prog_parse_fd(int *argc, char ***argv); int map_parse_fd(int *argc, char ***argv); int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len); void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, - const char *arch); + const char *arch, const char *disassembler_options); void print_data_json(uint8_t *data, size_t len); void print_hex_data_json(uint8_t *data, size_t len); unsigned int get_page_size(void); unsigned int get_possible_cpus(void); -const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino); +const char * +ifindex_to_bfd_params(__u32 ifindex, __u64 ns_dev, __u64 ns_ino, + const char **opt); struct btf_dumper { const struct btf *btf; @@ -165,4 +170,11 @@ struct btf_dumper { */ int btf_dumper_type(const struct btf_dumper *d, __u32 type_id, const void *data); + +struct nlattr; +struct ifinfomsg; +struct tcmsg; +int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb); +int do_filter_dump(struct tcmsg *ifinfo, struct nlattr **tb, const char *kind, + const char *devname, int ifindex); #endif diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index b455930a3eaf..7bf38f0e152e 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -71,13 +72,16 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_XSKMAP] = "xskmap", [BPF_MAP_TYPE_SOCKHASH] = "sockhash", [BPF_MAP_TYPE_CGROUP_STORAGE] = "cgroup_storage", + [BPF_MAP_TYPE_REUSEPORT_SOCKARRAY] = "reuseport_sockarray", + [BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE] = "percpu_cgroup_storage", }; static bool map_is_per_cpu(__u32 type) { return type == BPF_MAP_TYPE_PERCPU_HASH || type == BPF_MAP_TYPE_PERCPU_ARRAY || - type == BPF_MAP_TYPE_LRU_PERCPU_HASH; + type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE; } static bool map_is_map_of_maps(__u32 type) @@ -91,6 +95,17 @@ static bool map_is_map_of_progs(__u32 type) return type == BPF_MAP_TYPE_PROG_ARRAY; } +static int map_type_from_str(const char *type) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(map_type_name); i++) + /* Don't allow prefixing in case of possible future shadowing */ + if (map_type_name[i] && !strcmp(map_type_name[i], type)) + return i; + return -1; +} + static void *alloc_value(struct bpf_map_info *info) { if (map_is_per_cpu(info->type)) @@ -170,9 +185,28 @@ static int do_dump_btf(const struct btf_dumper *d, if (ret) goto err_end_obj; - jsonw_name(d->jw, "value"); + if (!map_is_per_cpu(map_info->type)) { + jsonw_name(d->jw, "value"); + ret = btf_dumper_type(d, map_info->btf_value_type_id, value); + } else { + unsigned int i, n, step; - ret = btf_dumper_type(d, map_info->btf_value_type_id, value); + jsonw_name(d->jw, "values"); + jsonw_start_array(d->jw); + n = get_possible_cpus(); + step = round_up(map_info->value_size, 8); + for (i = 0; i < n; i++) { + jsonw_start_object(d->jw); + jsonw_int_field(d->jw, "cpu", i); + jsonw_name(d->jw, "value"); + ret = btf_dumper_type(d, map_info->btf_value_type_id, + value + i * step); + jsonw_end_object(d->jw); + if (ret) + break; + } + jsonw_end_array(d->jw); + } err_end_obj: /* end of key-value pair */ @@ -299,11 +333,40 @@ static void print_entry_json(struct bpf_map_info *info, unsigned char *key, jsonw_end_object(json_wtr); } jsonw_end_array(json_wtr); + if (btf) { + struct btf_dumper d = { + .btf = btf, + .jw = json_wtr, + .is_plain_text = false, + }; + + jsonw_name(json_wtr, "formatted"); + do_dump_btf(&d, info, key, value); + } } jsonw_end_object(json_wtr); } +static void print_entry_error(struct bpf_map_info *info, unsigned char *key, + const char *value) +{ + int value_size = strlen(value); + bool single_line, break_names; + + break_names = info->key_size > 16 || value_size > 16; + single_line = info->key_size + value_size <= 24 && !break_names; + + printf("key:%c", break_names ? '\n' : ' '); + fprint_hex(stdout, key, info->key_size, " "); + + printf(single_line ? " " : "\n"); + + printf("value:%c%s", break_names ? '\n' : ' ', value); + + printf("\n"); +} + static void print_entry_plain(struct bpf_map_info *info, unsigned char *key, unsigned char *value) { @@ -626,6 +689,54 @@ static int do_show(int argc, char **argv) return errno == ENOENT ? 0 : -1; } +static int dump_map_elem(int fd, void *key, void *value, + struct bpf_map_info *map_info, struct btf *btf, + json_writer_t *btf_wtr) +{ + int num_elems = 0; + int lookup_errno; + + if (!bpf_map_lookup_elem(fd, key, value)) { + if (json_output) { + print_entry_json(map_info, key, value, btf); + } else { + if (btf) { + struct btf_dumper d = { + .btf = btf, + .jw = btf_wtr, + .is_plain_text = true, + }; + + do_dump_btf(&d, map_info, key, value); + } else { + print_entry_plain(map_info, key, value); + } + num_elems++; + } + return num_elems; + } + + /* lookup error handling */ + lookup_errno = errno; + + if (map_is_map_of_maps(map_info->type) || + map_is_map_of_progs(map_info->type)) + return 0; + + if (json_output) { + jsonw_name(json_wtr, "key"); + print_hex_data_json(key, map_info->key_size); + jsonw_name(json_wtr, "value"); + jsonw_start_object(json_wtr); + jsonw_string_field(json_wtr, "error", strerror(lookup_errno)); + jsonw_end_object(json_wtr); + } else { + print_entry_error(map_info, key, strerror(lookup_errno)); + } + + return 0; +} + static int do_dump(int argc, char **argv) { struct bpf_map_info info = {}; @@ -644,12 +755,6 @@ static int do_dump(int argc, char **argv) if (fd < 0) return -1; - if (map_is_map_of_maps(info.type) || map_is_map_of_progs(info.type)) { - p_err("Dumping maps of maps and program maps not supported"); - close(fd); - return -1; - } - key = malloc(info.key_size); value = alloc_value(&info); if (!key || !value) { @@ -687,40 +792,8 @@ static int do_dump(int argc, char **argv) err = 0; break; } - - if (!bpf_map_lookup_elem(fd, key, value)) { - if (json_output) - print_entry_json(&info, key, value, btf); - else - if (btf) { - struct btf_dumper d = { - .btf = btf, - .jw = btf_wtr, - .is_plain_text = true, - }; - - do_dump_btf(&d, &info, key, value); - } else { - print_entry_plain(&info, key, value); - } - } else { - if (json_output) { - jsonw_name(json_wtr, "key"); - print_hex_data_json(key, info.key_size); - jsonw_name(json_wtr, "value"); - jsonw_start_object(json_wtr); - jsonw_string_field(json_wtr, "error", - "can't lookup element"); - jsonw_end_object(json_wtr); - } else { - p_info("can't lookup element with key: "); - fprint_hex(stderr, key, info.key_size, " "); - fprintf(stderr, "\n"); - } - } - + num_elems += dump_map_elem(fd, key, value, &info, btf, btf_wtr); prev_key = key; - num_elems++; } if (json_output) @@ -997,6 +1070,92 @@ static int do_pin(int argc, char **argv) return err; } +static int do_create(int argc, char **argv) +{ + struct bpf_create_map_attr attr = { NULL, }; + const char *pinfile; + int err, fd; + + if (!REQ_ARGS(7)) + return -1; + pinfile = GET_ARG(); + + while (argc) { + if (!REQ_ARGS(2)) + return -1; + + if (is_prefix(*argv, "type")) { + NEXT_ARG(); + + if (attr.map_type) { + p_err("map type already specified"); + return -1; + } + + attr.map_type = map_type_from_str(*argv); + if ((int)attr.map_type < 0) { + p_err("unrecognized map type: %s", *argv); + return -1; + } + NEXT_ARG(); + } else if (is_prefix(*argv, "name")) { + NEXT_ARG(); + attr.name = GET_ARG(); + } else if (is_prefix(*argv, "key")) { + if (parse_u32_arg(&argc, &argv, &attr.key_size, + "key size")) + return -1; + } else if (is_prefix(*argv, "value")) { + if (parse_u32_arg(&argc, &argv, &attr.value_size, + "value size")) + return -1; + } else if (is_prefix(*argv, "entries")) { + if (parse_u32_arg(&argc, &argv, &attr.max_entries, + "max entries")) + return -1; + } else if (is_prefix(*argv, "flags")) { + if (parse_u32_arg(&argc, &argv, &attr.map_flags, + "flags")) + return -1; + } else if (is_prefix(*argv, "dev")) { + NEXT_ARG(); + + if (attr.map_ifindex) { + p_err("offload device already specified"); + return -1; + } + + attr.map_ifindex = if_nametoindex(*argv); + if (!attr.map_ifindex) { + p_err("unrecognized netdevice '%s': %s", + *argv, strerror(errno)); + return -1; + } + NEXT_ARG(); + } + } + + if (!attr.name) { + p_err("map name not specified"); + return -1; + } + + fd = bpf_create_map_xattr(&attr); + if (fd < 0) { + p_err("map create failed: %s", strerror(errno)); + return -1; + } + + err = do_pin_fd(fd, pinfile); + close(fd); + if (err) + return err; + + if (json_output) + jsonw_null(json_wtr); + return 0; +} + static int do_help(int argc, char **argv) { if (json_output) { @@ -1006,6 +1165,9 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %s %s { show | list } [MAP]\n" + " %s %s create FILE type TYPE key KEY_SIZE value VALUE_SIZE \\\n" + " entries MAX_ENTRIES name NAME [flags FLAGS] \\\n" + " [dev NAME]\n" " %s %s dump MAP\n" " %s %s update MAP key DATA value VALUE [UPDATE_FLAGS]\n" " %s %s lookup MAP key DATA\n" @@ -1020,11 +1182,17 @@ static int do_help(int argc, char **argv) " " HELP_SPEC_PROGRAM "\n" " VALUE := { DATA | MAP | PROG }\n" " UPDATE_FLAGS := { any | exist | noexist }\n" + " TYPE := { hash | array | prog_array | perf_event_array | percpu_hash |\n" + " percpu_array | stack_trace | cgroup_array | lru_hash |\n" + " lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n" + " devmap | sockmap | cpumap | xskmap | sockhash |\n" + " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]); + bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], + bin_name, argv[-2]); return 0; } @@ -1040,6 +1208,7 @@ static const struct cmd cmds[] = { { "delete", do_delete }, { "pin", do_pin }, { "event_pipe", do_event_pipe }, + { "create", do_create }, { 0 } }; diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c index 6d41323be291..bdaf4062e26e 100644 --- a/tools/bpf/bpftool/map_perf_ring.c +++ b/tools/bpf/bpftool/map_perf_ring.c @@ -50,15 +50,17 @@ static void int_exit(int signo) stop = true; } -static enum bpf_perf_event_ret print_bpf_output(void *event, void *priv) +static enum bpf_perf_event_ret +print_bpf_output(struct perf_event_header *event, void *private_data) { - struct event_ring_info *ring = priv; - struct perf_event_sample *e = event; + struct perf_event_sample *e = container_of(event, struct perf_event_sample, + header); + struct event_ring_info *ring = private_data; struct { struct perf_event_header header; __u64 id; __u64 lost; - } *lost = event; + } *lost = (typeof(lost))event; if (json_output) { jsonw_start_object(json_wtr); diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c new file mode 100644 index 000000000000..d441bb7035ca --- /dev/null +++ b/tools/bpf/bpftool/net.c @@ -0,0 +1,275 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright (C) 2018 Facebook + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "main.h" +#include "netlink_dumper.h" + +struct ip_devname_ifindex { + char devname[64]; + int ifindex; +}; + +struct bpf_netdev_t { + struct ip_devname_ifindex *devices; + int used_len; + int array_len; + int filter_idx; +}; + +struct tc_kind_handle { + char kind[64]; + int handle; +}; + +struct bpf_tcinfo_t { + struct tc_kind_handle *handle_array; + int used_len; + int array_len; + bool is_qdisc; +}; + +struct bpf_filter_t { + const char *kind; + const char *devname; + int ifindex; +}; + +static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb) +{ + struct bpf_netdev_t *netinfo = cookie; + struct ifinfomsg *ifinfo = msg; + + if (netinfo->filter_idx > 0 && netinfo->filter_idx != ifinfo->ifi_index) + return 0; + + if (netinfo->used_len == netinfo->array_len) { + netinfo->devices = realloc(netinfo->devices, + (netinfo->array_len + 16) * + sizeof(struct ip_devname_ifindex)); + if (!netinfo->devices) + return -ENOMEM; + + netinfo->array_len += 16; + } + netinfo->devices[netinfo->used_len].ifindex = ifinfo->ifi_index; + snprintf(netinfo->devices[netinfo->used_len].devname, + sizeof(netinfo->devices[netinfo->used_len].devname), + "%s", + tb[IFLA_IFNAME] + ? libbpf_nla_getattr_str(tb[IFLA_IFNAME]) + : ""); + netinfo->used_len++; + + return do_xdp_dump(ifinfo, tb); +} + +static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb) +{ + struct bpf_tcinfo_t *tcinfo = cookie; + struct tcmsg *info = msg; + + if (tcinfo->is_qdisc) { + /* skip clsact qdisc */ + if (tb[TCA_KIND] && + strcmp(libbpf_nla_data(tb[TCA_KIND]), "clsact") == 0) + return 0; + if (info->tcm_handle == 0) + return 0; + } + + if (tcinfo->used_len == tcinfo->array_len) { + tcinfo->handle_array = realloc(tcinfo->handle_array, + (tcinfo->array_len + 16) * sizeof(struct tc_kind_handle)); + if (!tcinfo->handle_array) + return -ENOMEM; + + tcinfo->array_len += 16; + } + tcinfo->handle_array[tcinfo->used_len].handle = info->tcm_handle; + snprintf(tcinfo->handle_array[tcinfo->used_len].kind, + sizeof(tcinfo->handle_array[tcinfo->used_len].kind), + "%s", + tb[TCA_KIND] + ? libbpf_nla_getattr_str(tb[TCA_KIND]) + : "unknown"); + tcinfo->used_len++; + + return 0; +} + +static int dump_filter_nlmsg(void *cookie, void *msg, struct nlattr **tb) +{ + const struct bpf_filter_t *filter_info = cookie; + + return do_filter_dump((struct tcmsg *)msg, tb, filter_info->kind, + filter_info->devname, filter_info->ifindex); +} + +static int show_dev_tc_bpf(int sock, unsigned int nl_pid, + struct ip_devname_ifindex *dev) +{ + struct bpf_filter_t filter_info; + struct bpf_tcinfo_t tcinfo; + int i, handle, ret = 0; + + tcinfo.handle_array = NULL; + tcinfo.used_len = 0; + tcinfo.array_len = 0; + + tcinfo.is_qdisc = false; + ret = libbpf_nl_get_class(sock, nl_pid, dev->ifindex, + dump_class_qdisc_nlmsg, &tcinfo); + if (ret) + goto out; + + tcinfo.is_qdisc = true; + ret = libbpf_nl_get_qdisc(sock, nl_pid, dev->ifindex, + dump_class_qdisc_nlmsg, &tcinfo); + if (ret) + goto out; + + filter_info.devname = dev->devname; + filter_info.ifindex = dev->ifindex; + for (i = 0; i < tcinfo.used_len; i++) { + filter_info.kind = tcinfo.handle_array[i].kind; + ret = libbpf_nl_get_filter(sock, nl_pid, dev->ifindex, + tcinfo.handle_array[i].handle, + dump_filter_nlmsg, &filter_info); + if (ret) + goto out; + } + + /* root, ingress and egress handle */ + handle = TC_H_ROOT; + filter_info.kind = "root"; + ret = libbpf_nl_get_filter(sock, nl_pid, dev->ifindex, handle, + dump_filter_nlmsg, &filter_info); + if (ret) + goto out; + + handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS); + filter_info.kind = "clsact/ingress"; + ret = libbpf_nl_get_filter(sock, nl_pid, dev->ifindex, handle, + dump_filter_nlmsg, &filter_info); + if (ret) + goto out; + + handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS); + filter_info.kind = "clsact/egress"; + ret = libbpf_nl_get_filter(sock, nl_pid, dev->ifindex, handle, + dump_filter_nlmsg, &filter_info); + if (ret) + goto out; + +out: + free(tcinfo.handle_array); + return 0; +} + +static int do_show(int argc, char **argv) +{ + int i, sock, ret, filter_idx = -1; + struct bpf_netdev_t dev_array; + unsigned int nl_pid; + char err_buf[256]; + + if (argc == 2) { + if (strcmp(argv[0], "dev") != 0) + usage(); + filter_idx = if_nametoindex(argv[1]); + if (filter_idx == 0) { + fprintf(stderr, "invalid dev name %s\n", argv[1]); + return -1; + } + } else if (argc != 0) { + usage(); + } + + sock = libbpf_netlink_open(&nl_pid); + if (sock < 0) { + fprintf(stderr, "failed to open netlink sock\n"); + return -1; + } + + dev_array.devices = NULL; + dev_array.used_len = 0; + dev_array.array_len = 0; + dev_array.filter_idx = filter_idx; + + if (json_output) + jsonw_start_array(json_wtr); + NET_START_OBJECT; + NET_START_ARRAY("xdp", "%s:\n"); + ret = libbpf_nl_get_link(sock, nl_pid, dump_link_nlmsg, &dev_array); + NET_END_ARRAY("\n"); + + if (!ret) { + NET_START_ARRAY("tc", "%s:\n"); + for (i = 0; i < dev_array.used_len; i++) { + ret = show_dev_tc_bpf(sock, nl_pid, + &dev_array.devices[i]); + if (ret) + break; + } + NET_END_ARRAY("\n"); + } + NET_END_OBJECT; + if (json_output) + jsonw_end_array(json_wtr); + + if (ret) { + if (json_output) + jsonw_null(json_wtr); + libbpf_strerror(ret, err_buf, sizeof(err_buf)); + fprintf(stderr, "Error: %s\n", err_buf); + } + free(dev_array.devices); + close(sock); + return ret; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %s %s { show | list } [dev ]\n" + " %s %s help\n" + "Note: Only xdp and tc attachments are supported now.\n" + " For progs attached to cgroups, use \"bpftool cgroup\"\n" + " to dump program attachments. For program types\n" + " sk_{filter,skb,msg,reuseport} and lwt/seg6, please\n" + " consult iproute2.\n", + bin_name, argv[-2], bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "help", do_help }, + { 0 } +}; + +int do_net(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/tools/bpf/bpftool/netlink_dumper.c b/tools/bpf/bpftool/netlink_dumper.c new file mode 100644 index 000000000000..4e9f4531269f --- /dev/null +++ b/tools/bpf/bpftool/netlink_dumper.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright (C) 2018 Facebook + +#include +#include +#include +#include +#include + +#include +#include "main.h" +#include "netlink_dumper.h" + +static void xdp_dump_prog_id(struct nlattr **tb, int attr, + const char *mode, + bool new_json_object) +{ + if (!tb[attr]) + return; + + if (new_json_object) + NET_START_OBJECT + NET_DUMP_STR("mode", " %s", mode); + NET_DUMP_UINT("id", " id %u", libbpf_nla_getattr_u32(tb[attr])) + if (new_json_object) + NET_END_OBJECT +} + +static int do_xdp_dump_one(struct nlattr *attr, unsigned int ifindex, + const char *name) +{ + struct nlattr *tb[IFLA_XDP_MAX + 1]; + unsigned char mode; + + if (libbpf_nla_parse_nested(tb, IFLA_XDP_MAX, attr, NULL) < 0) + return -1; + + if (!tb[IFLA_XDP_ATTACHED]) + return 0; + + mode = libbpf_nla_getattr_u8(tb[IFLA_XDP_ATTACHED]); + if (mode == XDP_ATTACHED_NONE) + return 0; + + NET_START_OBJECT; + if (name) + NET_DUMP_STR("devname", "%s", name); + NET_DUMP_UINT("ifindex", "(%d)", ifindex); + + if (mode == XDP_ATTACHED_MULTI) { + if (json_output) { + jsonw_name(json_wtr, "multi_attachments"); + jsonw_start_array(json_wtr); + } + xdp_dump_prog_id(tb, IFLA_XDP_SKB_PROG_ID, "generic", true); + xdp_dump_prog_id(tb, IFLA_XDP_DRV_PROG_ID, "driver", true); + xdp_dump_prog_id(tb, IFLA_XDP_HW_PROG_ID, "offload", true); + if (json_output) + jsonw_end_array(json_wtr); + } else if (mode == XDP_ATTACHED_DRV) { + xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "driver", false); + } else if (mode == XDP_ATTACHED_SKB) { + xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "generic", false); + } else if (mode == XDP_ATTACHED_HW) { + xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "offload", false); + } + + NET_END_OBJECT_FINAL; + return 0; +} + +int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb) +{ + if (!tb[IFLA_XDP]) + return 0; + + return do_xdp_dump_one(tb[IFLA_XDP], ifinfo->ifi_index, + libbpf_nla_getattr_str(tb[IFLA_IFNAME])); +} + +static int do_bpf_dump_one_act(struct nlattr *attr) +{ + struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; + + if (libbpf_nla_parse_nested(tb, TCA_ACT_BPF_MAX, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + if (!tb[TCA_ACT_BPF_PARMS]) + return -LIBBPF_ERRNO__NLPARSE; + + NET_START_OBJECT_NESTED2; + if (tb[TCA_ACT_BPF_NAME]) + NET_DUMP_STR("name", "%s", + libbpf_nla_getattr_str(tb[TCA_ACT_BPF_NAME])); + if (tb[TCA_ACT_BPF_ID]) + NET_DUMP_UINT("id", " id %u", + libbpf_nla_getattr_u32(tb[TCA_ACT_BPF_ID])); + NET_END_OBJECT_NESTED; + return 0; +} + +static int do_dump_one_act(struct nlattr *attr) +{ + struct nlattr *tb[TCA_ACT_MAX + 1]; + + if (!attr) + return 0; + + if (libbpf_nla_parse_nested(tb, TCA_ACT_MAX, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + if (tb[TCA_ACT_KIND] && + strcmp(libbpf_nla_data(tb[TCA_ACT_KIND]), "bpf") == 0) + return do_bpf_dump_one_act(tb[TCA_ACT_OPTIONS]); + + return 0; +} + +static int do_bpf_act_dump(struct nlattr *attr) +{ + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + int act, ret; + + if (libbpf_nla_parse_nested(tb, TCA_ACT_MAX_PRIO, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + NET_START_ARRAY("act", " %s ["); + for (act = 0; act <= TCA_ACT_MAX_PRIO; act++) { + ret = do_dump_one_act(tb[act]); + if (ret) + break; + } + NET_END_ARRAY("] "); + + return ret; +} + +static int do_bpf_filter_dump(struct nlattr *attr) +{ + struct nlattr *tb[TCA_BPF_MAX + 1]; + int ret; + + if (libbpf_nla_parse_nested(tb, TCA_BPF_MAX, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + if (tb[TCA_BPF_NAME]) + NET_DUMP_STR("name", " %s", + libbpf_nla_getattr_str(tb[TCA_BPF_NAME])); + if (tb[TCA_BPF_ID]) + NET_DUMP_UINT("id", " id %u", + libbpf_nla_getattr_u32(tb[TCA_BPF_ID])); + if (tb[TCA_BPF_ACT]) { + ret = do_bpf_act_dump(tb[TCA_BPF_ACT]); + if (ret) + return ret; + } + + return 0; +} + +int do_filter_dump(struct tcmsg *info, struct nlattr **tb, const char *kind, + const char *devname, int ifindex) +{ + int ret = 0; + + if (tb[TCA_OPTIONS] && + strcmp(libbpf_nla_data(tb[TCA_KIND]), "bpf") == 0) { + NET_START_OBJECT; + if (devname[0] != '\0') + NET_DUMP_STR("devname", "%s", devname); + NET_DUMP_UINT("ifindex", "(%u)", ifindex); + NET_DUMP_STR("kind", " %s", kind); + ret = do_bpf_filter_dump(tb[TCA_OPTIONS]); + NET_END_OBJECT_FINAL; + } + + return ret; +} diff --git a/tools/bpf/bpftool/netlink_dumper.h b/tools/bpf/bpftool/netlink_dumper.h new file mode 100644 index 000000000000..e3516b586a34 --- /dev/null +++ b/tools/bpf/bpftool/netlink_dumper.h @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright (C) 2018 Facebook + +#ifndef _NETLINK_DUMPER_H_ +#define _NETLINK_DUMPER_H_ + +#define NET_START_OBJECT \ +{ \ + if (json_output) \ + jsonw_start_object(json_wtr); \ +} + +#define NET_START_OBJECT_NESTED(name) \ +{ \ + if (json_output) { \ + jsonw_name(json_wtr, name); \ + jsonw_start_object(json_wtr); \ + } else { \ + fprintf(stdout, "%s {", name); \ + } \ +} + +#define NET_START_OBJECT_NESTED2 \ +{ \ + if (json_output) \ + jsonw_start_object(json_wtr); \ + else \ + fprintf(stdout, "{"); \ +} + +#define NET_END_OBJECT_NESTED \ +{ \ + if (json_output) \ + jsonw_end_object(json_wtr); \ + else \ + fprintf(stdout, "}"); \ +} + +#define NET_END_OBJECT \ +{ \ + if (json_output) \ + jsonw_end_object(json_wtr); \ +} + +#define NET_END_OBJECT_FINAL \ +{ \ + if (json_output) \ + jsonw_end_object(json_wtr); \ + else \ + fprintf(stdout, "\n"); \ +} + +#define NET_START_ARRAY(name, fmt_str) \ +{ \ + if (json_output) { \ + jsonw_name(json_wtr, name); \ + jsonw_start_array(json_wtr); \ + } else { \ + fprintf(stdout, fmt_str, name); \ + } \ +} + +#define NET_END_ARRAY(endstr) \ +{ \ + if (json_output) \ + jsonw_end_array(json_wtr); \ + else \ + fprintf(stdout, "%s", endstr); \ +} + +#define NET_DUMP_UINT(name, fmt_str, val) \ +{ \ + if (json_output) \ + jsonw_uint_field(json_wtr, name, val); \ + else \ + fprintf(stdout, fmt_str, val); \ +} + +#define NET_DUMP_STR(name, fmt_str, str) \ +{ \ + if (json_output) \ + jsonw_string_field(json_wtr, name, str);\ + else \ + fprintf(stdout, fmt_str, str); \ +} + +#define NET_DUMP_STR_ONLY(str) \ +{ \ + if (json_output) \ + jsonw_string(json_wtr, str); \ + else \ + fprintf(stdout, "%s ", str); \ +} + +#endif diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index dce960d22106..5302ee282409 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -74,8 +74,29 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr", [BPF_PROG_TYPE_LIRC_MODE2] = "lirc_mode2", + [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector", }; +static const char * const attach_type_strings[] = { + [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", + [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", + [BPF_SK_MSG_VERDICT] = "msg_verdict", + [__MAX_BPF_ATTACH_TYPE] = NULL, +}; + +enum bpf_attach_type parse_attach_type(const char *str) +{ + enum bpf_attach_type type; + + for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { + if (attach_type_strings[type] && + is_prefix(str, attach_type_strings[type])) + return type; + } + + return __MAX_BPF_ATTACH_TYPE; +} + static void print_boot_time(__u64 nsecs, char *buf, unsigned int size) { struct timespec real_time_ts, boot_time_ts; @@ -428,6 +449,7 @@ static int do_dump(int argc, char **argv) unsigned long *func_ksyms = NULL; struct bpf_prog_info info = {}; unsigned int *func_lens = NULL; + const char *disasm_opt = NULL; unsigned int nr_func_ksyms; unsigned int nr_func_lens; struct dump_data dd = {}; @@ -586,9 +608,10 @@ static int do_dump(int argc, char **argv) const char *name = NULL; if (info.ifindex) { - name = ifindex_to_bfd_name_ns(info.ifindex, - info.netns_dev, - info.netns_ino); + name = ifindex_to_bfd_params(info.ifindex, + info.netns_dev, + info.netns_ino, + &disasm_opt); if (!name) goto err_free; } @@ -630,7 +653,8 @@ static int do_dump(int argc, char **argv) printf("%s:\n", sym_name); } - disasm_print_insn(img, lens[i], opcodes, name); + disasm_print_insn(img, lens[i], opcodes, name, + disasm_opt); img += lens[i]; if (json_output) @@ -642,7 +666,8 @@ static int do_dump(int argc, char **argv) if (json_output) jsonw_end_array(json_wtr); } else { - disasm_print_insn(buf, *member_len, opcodes, name); + disasm_print_insn(buf, *member_len, opcodes, name, + disasm_opt); } } else if (visual) { if (json_output) @@ -696,6 +721,77 @@ int map_replace_compar(const void *p1, const void *p2) return a->idx - b->idx; } +static int do_attach(int argc, char **argv) +{ + enum bpf_attach_type attach_type; + int err, mapfd, progfd; + + if (!REQ_ARGS(5)) { + p_err("too few parameters for map attach"); + return -EINVAL; + } + + progfd = prog_parse_fd(&argc, &argv); + if (progfd < 0) + return progfd; + + attach_type = parse_attach_type(*argv); + if (attach_type == __MAX_BPF_ATTACH_TYPE) { + p_err("invalid attach type"); + return -EINVAL; + } + NEXT_ARG(); + + mapfd = map_parse_fd(&argc, &argv); + if (mapfd < 0) + return mapfd; + + err = bpf_prog_attach(progfd, mapfd, attach_type, 0); + if (err) { + p_err("failed prog attach to map"); + return -EINVAL; + } + + if (json_output) + jsonw_null(json_wtr); + return 0; +} + +static int do_detach(int argc, char **argv) +{ + enum bpf_attach_type attach_type; + int err, mapfd, progfd; + + if (!REQ_ARGS(5)) { + p_err("too few parameters for map detach"); + return -EINVAL; + } + + progfd = prog_parse_fd(&argc, &argv); + if (progfd < 0) + return progfd; + + attach_type = parse_attach_type(*argv); + if (attach_type == __MAX_BPF_ATTACH_TYPE) { + p_err("invalid attach type"); + return -EINVAL; + } + NEXT_ARG(); + + mapfd = map_parse_fd(&argc, &argv); + if (mapfd < 0) + return mapfd; + + err = bpf_prog_detach2(progfd, mapfd, attach_type); + if (err) { + p_err("failed prog detach from map"); + return -EINVAL; + } + + if (json_output) + jsonw_null(json_wtr); + return 0; +} static int do_load(int argc, char **argv) { enum bpf_attach_type expected_attach_type; @@ -816,7 +912,7 @@ static int do_load(int argc, char **argv) } } - obj = bpf_object__open_xattr(&attr); + obj = __bpf_object__open_xattr(&attr, bpf_flags); if (IS_ERR_OR_NULL(obj)) { p_err("failed to open object file"); goto err_free_reuse_maps; @@ -941,6 +1037,8 @@ static int do_help(int argc, char **argv) " %s %s pin PROG FILE\n" " %s %s load OBJ FILE [type TYPE] [dev NAME] \\\n" " [map { idx IDX | name NAME } MAP]\n" + " %s %s attach PROG ATTACH_TYPE MAP\n" + " %s %s detach PROG ATTACH_TYPE MAP\n" " %s %s help\n" "\n" " " HELP_SPEC_MAP "\n" @@ -952,10 +1050,12 @@ static int do_help(int argc, char **argv) " cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n" " cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n" " cgroup/sendmsg4 | cgroup/sendmsg6 }\n" + " ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]); + bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], + bin_name, argv[-2], bin_name, argv[-2]); return 0; } @@ -967,6 +1067,8 @@ static const struct cmd cmds[] = { { "dump", do_dump }, { "pin", do_pin }, { "load", do_load }, + { "attach", do_attach }, + { "detach", do_detach }, { 0 } }; diff --git a/tools/crypto/getstat.c b/tools/crypto/getstat.c new file mode 100644 index 000000000000..24115173a483 --- /dev/null +++ b/tools/crypto/getstat.c @@ -0,0 +1,294 @@ +/* Heavily copied from libkcapi 2015 - 2017, Stephan Mueller */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CR_RTA(x) ((struct rtattr *)(((char *)(x)) + NLMSG_ALIGN(sizeof(struct crypto_user_alg)))) + +static int get_stat(const char *drivername) +{ + struct { + struct nlmsghdr n; + struct crypto_user_alg cru; + } req; + struct sockaddr_nl nl; + int sd = 0, ret; + socklen_t addr_len; + struct iovec iov; + struct msghdr msg; + char buf[4096]; + struct nlmsghdr *res_n = (struct nlmsghdr *)buf; + struct crypto_user_alg *cru_res = NULL; + int res_len = 0; + struct rtattr *tb[CRYPTOCFGA_MAX + 1]; + struct rtattr *rta; + struct nlmsgerr *errmsg; + + memset(&req, 0, sizeof(req)); + memset(&buf, 0, sizeof(buf)); + memset(&msg, 0, sizeof(msg)); + + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(req.cru)); + req.n.nlmsg_flags = NLM_F_REQUEST; + req.n.nlmsg_type = CRYPTO_MSG_GETSTAT; + req.n.nlmsg_seq = time(NULL); + + strncpy(req.cru.cru_driver_name, drivername, strlen(drivername)); + + sd = socket(AF_NETLINK, SOCK_RAW, NETLINK_CRYPTO); + if (sd < 0) { + fprintf(stderr, "Netlink error: cannot open netlink socket"); + return -errno; + } + memset(&nl, 0, sizeof(nl)); + nl.nl_family = AF_NETLINK; + if (bind(sd, (struct sockaddr *)&nl, sizeof(nl)) < 0) { + ret = -errno; + fprintf(stderr, "Netlink error: cannot bind netlink socket"); + goto out; + } + + /* sanity check that netlink socket was successfully opened */ + addr_len = sizeof(nl); + if (getsockname(sd, (struct sockaddr *)&nl, &addr_len) < 0) { + ret = -errno; + printf("Netlink error: cannot getsockname"); + goto out; + } + if (addr_len != sizeof(nl)) { + ret = -errno; + printf("Netlink error: wrong address length %d", addr_len); + goto out; + } + if (nl.nl_family != AF_NETLINK) { + ret = -errno; + printf("Netlink error: wrong address family %d", + nl.nl_family); + goto out; + } + + memset(&nl, 0, sizeof(nl)); + nl.nl_family = AF_NETLINK; + iov.iov_base = (void *)&req.n; + iov.iov_len = req.n.nlmsg_len; + msg.msg_name = &nl; + msg.msg_namelen = sizeof(nl); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + if (sendmsg(sd, &msg, 0) < 0) { + ret = -errno; + printf("Netlink error: sendmsg failed"); + goto out; + } + memset(buf, 0, sizeof(buf)); + iov.iov_base = buf; + while (1) { + iov.iov_len = sizeof(buf); + ret = recvmsg(sd, &msg, 0); + if (ret < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + ret = -errno; + printf("Netlink error: netlink receive error"); + goto out; + } + if (ret == 0) { + ret = -errno; + printf("Netlink error: no data"); + goto out; + } + if (ret > sizeof(buf)) { + ret = -errno; + printf("Netlink error: received too much data"); + goto out; + } + break; + } + + ret = -EFAULT; + res_len = res_n->nlmsg_len; + if (res_n->nlmsg_type == NLMSG_ERROR) { + errmsg = NLMSG_DATA(res_n); + fprintf(stderr, "Fail with %d\n", errmsg->error); + ret = errmsg->error; + goto out; + } + + if (res_n->nlmsg_type == CRYPTO_MSG_GETSTAT) { + cru_res = NLMSG_DATA(res_n); + res_len -= NLMSG_SPACE(sizeof(*cru_res)); + } + if (res_len < 0) { + printf("Netlink error: nlmsg len %d\n", res_len); + goto out; + } + + if (!cru_res) { + ret = -EFAULT; + printf("Netlink error: no cru_res\n"); + goto out; + } + + rta = CR_RTA(cru_res); + memset(tb, 0, sizeof(struct rtattr *) * (CRYPTOCFGA_MAX + 1)); + while (RTA_OK(rta, res_len)) { + if ((rta->rta_type <= CRYPTOCFGA_MAX) && (!tb[rta->rta_type])) + tb[rta->rta_type] = rta; + rta = RTA_NEXT(rta, res_len); + } + if (res_len) { + printf("Netlink error: unprocessed data %d", + res_len); + goto out; + } + + if (tb[CRYPTOCFGA_STAT_HASH]) { + struct rtattr *rta = tb[CRYPTOCFGA_STAT_HASH]; + struct crypto_stat *rhash = + (struct crypto_stat *)RTA_DATA(rta); + printf("%s\tHash\n\tHash: %u bytes: %llu\n\tErrors: %u\n", + drivername, + rhash->stat_hash_cnt, rhash->stat_hash_tlen, + rhash->stat_hash_err_cnt); + } else if (tb[CRYPTOCFGA_STAT_COMPRESS]) { + struct rtattr *rta = tb[CRYPTOCFGA_STAT_COMPRESS]; + struct crypto_stat *rblk = + (struct crypto_stat *)RTA_DATA(rta); + printf("%s\tCompress\n\tCompress: %u bytes: %llu\n\tDecompress: %u bytes: %llu\n\tErrors: %u\n", + drivername, + rblk->stat_compress_cnt, rblk->stat_compress_tlen, + rblk->stat_decompress_cnt, rblk->stat_decompress_tlen, + rblk->stat_compress_err_cnt); + } else if (tb[CRYPTOCFGA_STAT_ACOMP]) { + struct rtattr *rta = tb[CRYPTOCFGA_STAT_ACOMP]; + struct crypto_stat *rcomp = + (struct crypto_stat *)RTA_DATA(rta); + printf("%s\tACompress\n\tCompress: %u bytes: %llu\n\tDecompress: %u bytes: %llu\n\tErrors: %u\n", + drivername, + rcomp->stat_compress_cnt, rcomp->stat_compress_tlen, + rcomp->stat_decompress_cnt, rcomp->stat_decompress_tlen, + rcomp->stat_compress_err_cnt); + } else if (tb[CRYPTOCFGA_STAT_AEAD]) { + struct rtattr *rta = tb[CRYPTOCFGA_STAT_AEAD]; + struct crypto_stat *raead = + (struct crypto_stat *)RTA_DATA(rta); + printf("%s\tAEAD\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tErrors: %u\n", + drivername, + raead->stat_encrypt_cnt, raead->stat_encrypt_tlen, + raead->stat_decrypt_cnt, raead->stat_decrypt_tlen, + raead->stat_aead_err_cnt); + } else if (tb[CRYPTOCFGA_STAT_BLKCIPHER]) { + struct rtattr *rta = tb[CRYPTOCFGA_STAT_BLKCIPHER]; + struct crypto_stat *rblk = + (struct crypto_stat *)RTA_DATA(rta); + printf("%s\tCipher\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tErrors: %u\n", + drivername, + rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen, + rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen, + rblk->stat_cipher_err_cnt); + } else if (tb[CRYPTOCFGA_STAT_AKCIPHER]) { + struct rtattr *rta = tb[CRYPTOCFGA_STAT_AKCIPHER]; + struct crypto_stat *rblk = + (struct crypto_stat *)RTA_DATA(rta); + printf("%s\tAkcipher\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tSign: %u\n\tVerify: %u\n\tErrors: %u\n", + drivername, + rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen, + rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen, + rblk->stat_sign_cnt, rblk->stat_verify_cnt, + rblk->stat_akcipher_err_cnt); + } else if (tb[CRYPTOCFGA_STAT_CIPHER]) { + struct rtattr *rta = tb[CRYPTOCFGA_STAT_CIPHER]; + struct crypto_stat *rblk = + (struct crypto_stat *)RTA_DATA(rta); + printf("%s\tcipher\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tErrors: %u\n", + drivername, + rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen, + rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen, + rblk->stat_cipher_err_cnt); + } else if (tb[CRYPTOCFGA_STAT_RNG]) { + struct rtattr *rta = tb[CRYPTOCFGA_STAT_RNG]; + struct crypto_stat *rrng = + (struct crypto_stat *)RTA_DATA(rta); + printf("%s\tRNG\n\tSeed: %u\n\tGenerate: %u bytes: %llu\n\tErrors: %u\n", + drivername, + rrng->stat_seed_cnt, + rrng->stat_generate_cnt, rrng->stat_generate_tlen, + rrng->stat_rng_err_cnt); + } else if (tb[CRYPTOCFGA_STAT_KPP]) { + struct rtattr *rta = tb[CRYPTOCFGA_STAT_KPP]; + struct crypto_stat *rkpp = + (struct crypto_stat *)RTA_DATA(rta); + printf("%s\tKPP\n\tSetsecret: %u\n\tGenerate public key: %u\n\tCompute_shared_secret: %u\n\tErrors: %u\n", + drivername, + rkpp->stat_setsecret_cnt, + rkpp->stat_generate_public_key_cnt, + rkpp->stat_compute_shared_secret_cnt, + rkpp->stat_kpp_err_cnt); + } else { + fprintf(stderr, "%s is of an unknown algorithm\n", drivername); + } + ret = 0; +out: + close(sd); + return ret; +} + +int main(int argc, const char *argv[]) +{ + char buf[4096]; + FILE *procfd; + int i, lastspace; + int ret; + + procfd = fopen("/proc/crypto", "r"); + if (!procfd) { + ret = errno; + fprintf(stderr, "Cannot open /proc/crypto %s\n", strerror(errno)); + return ret; + } + if (argc > 1) { + if (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) { + printf("Usage: %s [-h|--help] display this help\n", argv[0]); + printf("Usage: %s display all crypto statistics\n", argv[0]); + printf("Usage: %s drivername1 drivername2 ... = display crypto statistics about drivername1 ...\n", argv[0]); + return 0; + } + for (i = 1; i < argc; i++) { + ret = get_stat(argv[i]); + if (ret) { + fprintf(stderr, "Failed with %s\n", strerror(-ret)); + return ret; + } + } + return 0; + } + + while (fgets(buf, sizeof(buf), procfd)) { + if (!strncmp(buf, "driver", 6)) { + lastspace = 0; + i = 0; + while (i < strlen(buf)) { + i++; + if (buf[i] == ' ') + lastspace = i; + } + buf[strlen(buf) - 1] = '\0'; + ret = get_stat(buf + lastspace + 1); + if (ret) { + fprintf(stderr, "Failed with %s\n", strerror(-ret)); + goto out; + } + } + } +out: + fclose(procfd); + return ret; +} diff --git a/tools/hv/hv_fcopy_daemon.c b/tools/hv/hv_fcopy_daemon.c index d78aed86af09..8ff8cb1a11f4 100644 --- a/tools/hv/hv_fcopy_daemon.c +++ b/tools/hv/hv_fcopy_daemon.c @@ -234,6 +234,7 @@ int main(int argc, char *argv[]) break; default: + error = HV_E_FAIL; syslog(LOG_ERR, "Unknown operation: %d", buffer.hdr.operation); diff --git a/tools/include/asm-generic/bitops.h b/tools/include/asm-generic/bitops.h index 9bce3b56b5e7..5d2ab38965cc 100644 --- a/tools/include/asm-generic/bitops.h +++ b/tools/include/asm-generic/bitops.h @@ -27,5 +27,6 @@ #include #include +#include #endif /* __TOOLS_ASM_GENERIC_BITOPS_H */ diff --git a/tools/include/asm-generic/bitops/atomic.h b/tools/include/asm-generic/bitops/atomic.h index 21c41ccd1266..2f6ea28764a7 100644 --- a/tools/include/asm-generic/bitops/atomic.h +++ b/tools/include/asm-generic/bitops/atomic.h @@ -15,13 +15,4 @@ static inline void clear_bit(int nr, unsigned long *addr) addr[nr / __BITS_PER_LONG] &= ~(1UL << (nr % __BITS_PER_LONG)); } -static __always_inline int test_bit(unsigned int nr, const unsigned long *addr) -{ - return ((1UL << (nr % __BITS_PER_LONG)) & - (((unsigned long *)addr)[nr / __BITS_PER_LONG])) != 0; -} - -#define __set_bit(nr, addr) set_bit(nr, addr) -#define __clear_bit(nr, addr) clear_bit(nr, addr) - #endif /* _TOOLS_LINUX_ASM_GENERIC_BITOPS_ATOMIC_H_ */ diff --git a/tools/include/asm-generic/bitops/non-atomic.h b/tools/include/asm-generic/bitops/non-atomic.h new file mode 100644 index 000000000000..7e10c4b50c5d --- /dev/null +++ b/tools/include/asm-generic/bitops/non-atomic.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ +#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ + +#include + +/** + * __set_bit - Set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * Unlike set_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static inline void __set_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + + *p |= mask; +} + +static inline void __clear_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + + *p &= ~mask; +} + +/** + * __change_bit - Toggle a bit in memory + * @nr: the bit to change + * @addr: the address to start counting from + * + * Unlike change_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static inline void __change_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + + *p ^= mask; +} + +/** + * __test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is non-atomic and can be reordered. + * If two examples of this operation race, one can appear to succeed + * but actually fail. You must protect multiple accesses with a lock. + */ +static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + unsigned long old = *p; + + *p = old | mask; + return (old & mask) != 0; +} + +/** + * __test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is non-atomic and can be reordered. + * If two examples of this operation race, one can appear to succeed + * but actually fail. You must protect multiple accesses with a lock. + */ +static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + unsigned long old = *p; + + *p = old & ~mask; + return (old & mask) != 0; +} + +/* WARNING: non atomic and it can be reordered! */ +static inline int __test_and_change_bit(int nr, + volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + unsigned long old = *p; + + *p = old ^ mask; + return (old & mask) != 0; +} + +/** + * test_bit - Determine whether a bit is set + * @nr: bit number to test + * @addr: Address to start counting from + */ +static inline int test_bit(int nr, const volatile unsigned long *addr) +{ + return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); +} + +#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */ diff --git a/tools/include/asm/barrier.h b/tools/include/asm/barrier.h index 391d942536e5..8d378c57cb01 100644 --- a/tools/include/asm/barrier.h +++ b/tools/include/asm/barrier.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include #if defined(__i386__) || defined(__x86_64__) #include "../../arch/x86/include/asm/barrier.h" #elif defined(__arm__) @@ -26,3 +27,37 @@ #else #include #endif + +/* + * Generic fallback smp_*() definitions for archs that haven't + * been updated yet. + */ + +#ifndef smp_rmb +# define smp_rmb() rmb() +#endif + +#ifndef smp_wmb +# define smp_wmb() wmb() +#endif + +#ifndef smp_mb +# define smp_mb() mb() +#endif + +#ifndef smp_store_release +# define smp_store_release(p, v) \ +do { \ + smp_mb(); \ + WRITE_ONCE(*p, v); \ +} while (0) +#endif + +#ifndef smp_load_acquire +# define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + smp_mb(); \ + ___p1; \ +}) +#endif diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index e63662db131b..05dca5c203f3 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -15,6 +15,7 @@ void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits); +void bitmap_clear(unsigned long *map, unsigned int start, int len); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h index acc704bd3998..0b0ef3abc966 100644 --- a/tools/include/linux/bitops.h +++ b/tools/include/linux/bitops.h @@ -3,8 +3,6 @@ #define _TOOLS_LINUX_BITOPS_H_ #include -#include - #ifndef __WORDSIZE #define __WORDSIZE (__SIZEOF_LONG__ * 8) #endif @@ -12,10 +10,9 @@ #ifndef BITS_PER_LONG # define BITS_PER_LONG __WORDSIZE #endif +#include +#include -#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) -#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) -#define BITS_PER_BYTE 8 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) #define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64)) #define BITS_TO_U32(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u32)) diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h new file mode 100644 index 000000000000..2b7b532c1d51 --- /dev/null +++ b/tools/include/linux/bits.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_BITS_H +#define __LINUX_BITS_H +#include + +#define BIT(nr) (1UL << (nr)) +#define BIT_ULL(nr) (1ULL << (nr)) +#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) +#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG)) +#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) +#define BITS_PER_BYTE 8 + +/* + * Create a contiguous bitmask starting at bit position @l and ending at + * position @h. For example + * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. + */ +#define GENMASK(h, l) \ + (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) + +#define GENMASK_ULL(h, l) \ + (((~0ULL) - (1ULL << (l)) + 1) & \ + (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) + +#endif /* __LINUX_BITS_H */ diff --git a/tools/include/linux/err.h b/tools/include/linux/err.h index 7a8b61ad44cb..094649667bae 100644 --- a/tools/include/linux/err.h +++ b/tools/include/linux/err.h @@ -52,4 +52,11 @@ static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr); } +static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) +{ + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + else + return 0; +} #endif /* _LINUX_ERR_H */ diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h index 0ad884452c5c..6935ef94e77a 100644 --- a/tools/include/linux/kernel.h +++ b/tools/include/linux/kernel.h @@ -70,6 +70,7 @@ #define BUG_ON(cond) assert(!(cond)) #endif #endif +#define BUG() BUG_ON(1) #if __BYTE_ORDER == __BIG_ENDIAN #define cpu_to_le16 bswap_16 diff --git a/tools/include/linux/ring_buffer.h b/tools/include/linux/ring_buffer.h new file mode 100644 index 000000000000..9a083ae60473 --- /dev/null +++ b/tools/include/linux/ring_buffer.h @@ -0,0 +1,73 @@ +#ifndef _TOOLS_LINUX_RING_BUFFER_H_ +#define _TOOLS_LINUX_RING_BUFFER_H_ + +#include + +/* + * Contract with kernel for walking the perf ring buffer from + * user space requires the following barrier pairing (quote + * from kernel/events/ring_buffer.c): + * + * Since the mmap() consumer (userspace) can run on a + * different CPU: + * + * kernel user + * + * if (LOAD ->data_tail) { LOAD ->data_head + * (A) smp_rmb() (C) + * STORE $data LOAD $data + * smp_wmb() (B) smp_mb() (D) + * STORE ->data_head STORE ->data_tail + * } + * + * Where A pairs with D, and B pairs with C. + * + * In our case A is a control dependency that separates the + * load of the ->data_tail and the stores of $data. In case + * ->data_tail indicates there is no room in the buffer to + * store $data we do not. + * + * D needs to be a full barrier since it separates the data + * READ from the tail WRITE. + * + * For B a WMB is sufficient since it separates two WRITEs, + * and for C an RMB is sufficient since it separates two READs. + * + * Note, instead of B, C, D we could also use smp_store_release() + * in B and D as well as smp_load_acquire() in C. + * + * However, this optimization does not make sense for all kernel + * supported architectures since for a fair number it would + * resolve into READ_ONCE() + smp_mb() pair for smp_load_acquire(), + * and smp_mb() + WRITE_ONCE() pair for smp_store_release(). + * + * Thus for those smp_wmb() in B and smp_rmb() in C would still + * be less expensive. For the case of D this has either the same + * cost or is less expensive, for example, due to TSO x86 can + * avoid the CPU barrier entirely. + */ + +static inline u64 ring_buffer_read_head(struct perf_event_mmap_page *base) +{ +/* + * Architectures where smp_load_acquire() does not fallback to + * READ_ONCE() + smp_mb() pair. + */ +#if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__) || \ + defined(__ia64__) || defined(__sparc__) && defined(__arch64__) + return smp_load_acquire(&base->data_head); +#else + u64 head = READ_ONCE(base->data_head); + + smp_rmb(); + return head; +#endif +} + +static inline void ring_buffer_write_tail(struct perf_event_mmap_page *base, + u64 tail) +{ + smp_store_release(&base->data_tail, tail); +} + +#endif /* _TOOLS_LINUX_RING_BUFFER_H_ */ diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index 1738c0391da4..c934572d935c 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -8,8 +8,14 @@ #define spinlock_t pthread_mutex_t #define DEFINE_SPINLOCK(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER #define __SPIN_LOCK_UNLOCKED(x) (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER -#define spin_lock_init(x) pthread_mutex_init(x, NULL) - +#define spin_lock_init(x) pthread_mutex_init(x, NULL) + +#define spin_lock(x) pthread_mutex_lock(x) +#define spin_unlock(x) pthread_mutex_unlock(x) +#define spin_lock_bh(x) pthread_mutex_lock(x) +#define spin_unlock_bh(x) pthread_mutex_unlock(x) +#define spin_lock_irq(x) pthread_mutex_lock(x) +#define spin_unlock_irq(x) pthread_mutex_unlock(x) #define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x) #define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x) @@ -31,4 +37,6 @@ static inline bool arch_spin_is_locked(arch_spinlock_t *mutex) return true; } +#include + #endif diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index df4bedb9b01c..538546edbfbd 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -242,10 +242,12 @@ __SYSCALL(__NR_tee, sys_tee) /* fs/stat.c */ #define __NR_readlinkat 78 __SYSCALL(__NR_readlinkat, sys_readlinkat) +#if defined(__ARCH_WANT_NEW_STAT) || defined(__ARCH_WANT_STAT64) #define __NR3264_fstatat 79 __SC_3264(__NR3264_fstatat, sys_fstatat64, sys_newfstatat) #define __NR3264_fstat 80 __SC_3264(__NR3264_fstat, sys_fstat64, sys_newfstat) +#endif /* fs/sync.c */ #define __NR_sync 81 diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 66917a4eba27..852dc17ab47a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -103,6 +103,7 @@ enum bpf_cmd { BPF_BTF_LOAD, BPF_BTF_GET_FD_BY_ID, BPF_TASK_FD_QUERY, + BPF_MAP_LOOKUP_AND_DELETE_ELEM, }; enum bpf_map_type { @@ -127,6 +128,9 @@ enum bpf_map_type { BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_CGROUP_STORAGE, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + BPF_MAP_TYPE_QUEUE, + BPF_MAP_TYPE_STACK, }; enum bpf_prog_type { @@ -152,6 +156,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, + BPF_PROG_TYPE_FLOW_DISSECTOR, }; enum bpf_attach_type { @@ -172,6 +177,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP4_SENDMSG, BPF_CGROUP_UDP6_SENDMSG, BPF_LIRC_MODE2, + BPF_FLOW_DISSECTOR, __MAX_BPF_ATTACH_TYPE }; @@ -459,6 +465,28 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is removed to + * make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * Description + * Pop an element from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * Description + * Get an element from *map* without removing it. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_probe_read(void *dst, u32 size, const void *src) * Description * For tracing programs, safely attempt to read *size* bytes from @@ -1430,7 +1458,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags) + * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. @@ -2141,6 +2169,94 @@ union bpf_attr { * request in the skb. * Return * 0 on success, or a negative error in case of failure. + * + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this in the netns of the device in the skb. For socket hooks, + * this in the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns + * associated with the *ctx*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to *struct bpf_sock*, or NULL in case of failure. + * + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this in the netns of the device in the skb. For socket hooks, + * this in the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns + * associated with the *ctx*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to *struct bpf_sock*, or NULL in case of failure. + * + * int bpf_sk_release(struct bpf_sock *sk) + * Description + * Release the reference held by *sock*. *sock* must be a non-NULL + * pointer that was returned from bpf_sk_lookup_xxx\ (). + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags) + * Description + * For socket policies, insert *len* bytes into msg at offset + * *start*. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it may want to insert metadata or options into the msg. + * This can later be read and used by any of the lower layer BPF + * hooks. + * + * This helper may fail if under memory pressure (a malloc + * fails) in these cases BPF programs will get an appropriate + * error and BPF programs will need to handle them. + * + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2226,7 +2342,14 @@ union bpf_attr { FN(get_current_cgroup_id), \ FN(get_local_storage), \ FN(sk_select_reuseport), \ - FN(skb_ancestor_cgroup_id), + FN(skb_ancestor_cgroup_id), \ + FN(sk_lookup_tcp), \ + FN(sk_lookup_udp), \ + FN(sk_release), \ + FN(map_push_elem), \ + FN(map_pop_elem), \ + FN(map_peek_elem), \ + FN(msg_push_data), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2333,6 +2456,7 @@ struct __sk_buff { /* ... here. */ __u32 data_meta; + struct bpf_flow_keys *flow_keys; }; struct bpf_tunnel_key { @@ -2395,6 +2519,23 @@ struct bpf_sock { */ }; +struct bpf_sock_tuple { + union { + struct { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + } ipv4; + struct { + __be32 saddr[4]; + __be32 daddr[4]; + __be16 sport; + __be16 dport; + } ipv6; + }; +}; + #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. @@ -2778,4 +2919,27 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; +struct bpf_flow_keys { + __u16 nhoff; + __u16 thoff; + __u16 addr_proto; /* ETH_P_* of valid addrs */ + __u8 is_frag; + __u8 is_first_frag; + __u8 is_encap; + __u8 ip_proto; + __be16 n_proto; + __be16 sport; + __be16 dport; + union { + struct { + __be32 ipv4_src; + __be32 ipv4_dst; + }; + struct { + __u32 ipv6_src[4]; /* in6_addr; network order */ + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + }; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h new file mode 100644 index 000000000000..a441ea1bfe6d --- /dev/null +++ b/tools/include/uapi/linux/fs.h @@ -0,0 +1,393 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_FS_H +#define _UAPI_LINUX_FS_H + +/* + * This file has definitions for some important file table structures + * and constants and structures used by various generic file system + * ioctl's. Please do not make any changes in this file before + * sending patches for review to linux-fsdevel@vger.kernel.org and + * linux-api@vger.kernel.org. + */ + +#include +#include +#include + +/* + * It's silly to have NR_OPEN bigger than NR_FILE, but you can change + * the file limit at runtime and only root can increase the per-process + * nr_file rlimit, so it's safe to set up a ridiculously high absolute + * upper limit on files-per-process. + * + * Some programs (notably those using select()) may have to be + * recompiled to take full advantage of the new limits.. + */ + +/* Fixed constants first: */ +#undef NR_OPEN +#define INR_OPEN_CUR 1024 /* Initial setting for nfile rlimits */ +#define INR_OPEN_MAX 4096 /* Hard limit for nfile rlimits */ + +#define BLOCK_SIZE_BITS 10 +#define BLOCK_SIZE (1< + +/* TLS socket options */ +#define TLS_TX 1 /* Set transmit parameters */ +#define TLS_RX 2 /* Set receive parameters */ + +/* Supported versions */ +#define TLS_VERSION_MINOR(ver) ((ver) & 0xFF) +#define TLS_VERSION_MAJOR(ver) (((ver) >> 8) & 0xFF) + +#define TLS_VERSION_NUMBER(id) ((((id##_VERSION_MAJOR) & 0xFF) << 8) | \ + ((id##_VERSION_MINOR) & 0xFF)) + +#define TLS_1_2_VERSION_MAJOR 0x3 +#define TLS_1_2_VERSION_MINOR 0x3 +#define TLS_1_2_VERSION TLS_VERSION_NUMBER(TLS_1_2) + +/* Supported ciphers */ +#define TLS_CIPHER_AES_GCM_128 51 +#define TLS_CIPHER_AES_GCM_128_IV_SIZE 8 +#define TLS_CIPHER_AES_GCM_128_KEY_SIZE 16 +#define TLS_CIPHER_AES_GCM_128_SALT_SIZE 4 +#define TLS_CIPHER_AES_GCM_128_TAG_SIZE 16 +#define TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE 8 + +#define TLS_SET_RECORD_TYPE 1 +#define TLS_GET_RECORD_TYPE 2 + +struct tls_crypto_info { + __u16 version; + __u16 cipher_type; +}; + +struct tls12_crypto_info_aes_gcm_128 { + struct tls_crypto_info info; + unsigned char iv[TLS_CIPHER_AES_GCM_128_IV_SIZE]; + unsigned char key[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; + unsigned char salt[TLS_CIPHER_AES_GCM_128_SALT_SIZE]; + unsigned char rec_seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE]; +}; + +#endif /* _UAPI_LINUX_TLS_H */ diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h index ed0a120d4f08..404d4b9ffe76 100644 --- a/tools/include/uapi/sound/asound.h +++ b/tools/include/uapi/sound/asound.h @@ -752,7 +752,7 @@ struct snd_timer_info { #define SNDRV_TIMER_PSFLG_EARLY_EVENT (1<<2) /* write early event to the poll queue */ struct snd_timer_params { - unsigned int flags; /* flags - SNDRV_MIXER_PSFLG_* */ + unsigned int flags; /* flags - SNDRV_TIMER_PSFLG_* */ unsigned int ticks; /* requested resolution in ticks */ unsigned int queue_size; /* total size of queue (32-1024) */ unsigned int reserved0; /* reserved, was: failure locations */ diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 439b8a27488d..195ba486640f 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1325,7 +1325,7 @@ class Tui(object): msg = '' while True: self.screen.erase() - self.screen.addstr(0, 0, 'Set update interval (defaults to %fs).' % + self.screen.addstr(0, 0, 'Set update interval (defaults to %.1fs).' % DELAY_DEFAULT, curses.A_BOLD) self.screen.addstr(4, 0, msg) self.screen.addstr(2, 0, 'Change delay from %.1fs to ' % diff --git a/tools/lib/api/fs/tracing_path.c b/tools/lib/api/fs/tracing_path.c index 120037496f77..5afb11b30fca 100644 --- a/tools/lib/api/fs/tracing_path.c +++ b/tools/lib/api/fs/tracing_path.c @@ -36,7 +36,7 @@ static const char *tracing_path_tracefs_mount(void) __tracing_path_set("", mnt); - return mnt; + return tracing_path; } static const char *tracing_path_debugfs_mount(void) @@ -49,7 +49,7 @@ static const char *tracing_path_debugfs_mount(void) __tracing_path_set("tracing/", mnt); - return mnt; + return tracing_path; } const char *tracing_path_mount(void) diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 6eb9bacd1948..7bc31c905018 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1 +1 @@ -libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o +libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index d49902e818b5..425b480bda75 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -1,4 +1,4 @@ -# SPDX-License-Identifier: GPL-2.0 +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) # Most of this file is copied from tools/lib/traceevent/Makefile BPF_VERSION = 0 @@ -69,7 +69,7 @@ FEATURE_USER = .libbpf FEATURE_TESTS = libelf libelf-mmap bpf reallocarray FEATURE_DISPLAY = libelf bpf -INCLUDES = -I. -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(ARCH)/include/uapi -I$(srctree)/tools/include/uapi -I$(srctree)/tools/perf +INCLUDES = -I. -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(ARCH)/include/uapi -I$(srctree)/tools/include/uapi FEATURE_CHECK_CFLAGS-bpf = $(INCLUDES) check_feat := 1 @@ -125,6 +125,7 @@ override CFLAGS += $(EXTRA_WARNINGS) override CFLAGS += -Werror -Wall override CFLAGS += -fPIC override CFLAGS += $(INCLUDES) +override CFLAGS += -fvisibility=hidden ifeq ($(VERBOSE),1) Q = diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 60aa4ca8b2c5..03f9bcc4ef50 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: LGPL-2.1 +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* * common eBPF ELF operations. @@ -28,16 +28,8 @@ #include #include "bpf.h" #include "libbpf.h" -#include "nlattr.h" -#include -#include -#include #include -#ifndef SOL_NETLINK -#define SOL_NETLINK 270 -#endif - /* * When building perf, unistd.h is overridden. __NR_bpf is * required to be defined explicitly. @@ -286,6 +278,18 @@ int bpf_map_lookup_elem(int fd, const void *key, void *value) return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); } +int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value) +{ + union bpf_attr attr; + + bzero(&attr, sizeof(attr)); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + + return sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr)); +} + int bpf_map_delete_elem(int fd, const void *key) { union bpf_attr attr; @@ -499,127 +503,6 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd) return sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr)); } -int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) -{ - struct sockaddr_nl sa; - int sock, seq = 0, len, ret = -1; - char buf[4096]; - struct nlattr *nla, *nla_xdp; - struct { - struct nlmsghdr nh; - struct ifinfomsg ifinfo; - char attrbuf[64]; - } req; - struct nlmsghdr *nh; - struct nlmsgerr *err; - socklen_t addrlen; - int one = 1; - - memset(&sa, 0, sizeof(sa)); - sa.nl_family = AF_NETLINK; - - sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); - if (sock < 0) { - return -errno; - } - - if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK, - &one, sizeof(one)) < 0) { - fprintf(stderr, "Netlink error reporting not supported\n"); - } - - if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - ret = -errno; - goto cleanup; - } - - addrlen = sizeof(sa); - if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) { - ret = -errno; - goto cleanup; - } - - if (addrlen != sizeof(sa)) { - ret = -LIBBPF_ERRNO__INTERNAL; - goto cleanup; - } - - memset(&req, 0, sizeof(req)); - req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); - req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; - req.nh.nlmsg_type = RTM_SETLINK; - req.nh.nlmsg_pid = 0; - req.nh.nlmsg_seq = ++seq; - req.ifinfo.ifi_family = AF_UNSPEC; - req.ifinfo.ifi_index = ifindex; - - /* started nested attribute for XDP */ - nla = (struct nlattr *)(((char *)&req) - + NLMSG_ALIGN(req.nh.nlmsg_len)); - nla->nla_type = NLA_F_NESTED | IFLA_XDP; - nla->nla_len = NLA_HDRLEN; - - /* add XDP fd */ - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - nla_xdp->nla_type = IFLA_XDP_FD; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); - memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); - nla->nla_len += nla_xdp->nla_len; - - /* if user passed in any flags, add those too */ - if (flags) { - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - nla_xdp->nla_type = IFLA_XDP_FLAGS; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags); - memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags)); - nla->nla_len += nla_xdp->nla_len; - } - - req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); - - if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { - ret = -errno; - goto cleanup; - } - - len = recv(sock, buf, sizeof(buf), 0); - if (len < 0) { - ret = -errno; - goto cleanup; - } - - for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); - nh = NLMSG_NEXT(nh, len)) { - if (nh->nlmsg_pid != sa.nl_pid) { - ret = -LIBBPF_ERRNO__WRNGPID; - goto cleanup; - } - if (nh->nlmsg_seq != seq) { - ret = -LIBBPF_ERRNO__INVSEQ; - goto cleanup; - } - switch (nh->nlmsg_type) { - case NLMSG_ERROR: - err = (struct nlmsgerr *)NLMSG_DATA(nh); - if (!err->error) - continue; - ret = err->error; - nla_dump_errormsg(nh); - goto cleanup; - case NLMSG_DONE: - break; - default: - break; - } - } - - ret = 0; - -cleanup: - close(sock); - return ret; -} - int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size, bool do_log) { diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 6f38164b2618..26a51538213c 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: LGPL-2.1 */ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ /* * common eBPF ELF operations. @@ -20,13 +20,17 @@ * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see */ -#ifndef __BPF_BPF_H -#define __BPF_BPF_H +#ifndef __LIBBPF_BPF_H +#define __LIBBPF_BPF_H #include #include #include +#ifndef LIBBPF_API +#define LIBBPF_API __attribute__((visibility("default"))) +#endif + struct bpf_create_map_attr { const char *name; enum bpf_map_type map_type; @@ -42,21 +46,24 @@ struct bpf_create_map_attr { __u32 inner_map_fd; }; -int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr); -int bpf_create_map_node(enum bpf_map_type map_type, const char *name, - int key_size, int value_size, int max_entries, - __u32 map_flags, int node); -int bpf_create_map_name(enum bpf_map_type map_type, const char *name, - int key_size, int value_size, int max_entries, - __u32 map_flags); -int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, - int max_entries, __u32 map_flags); -int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, - int key_size, int inner_map_fd, int max_entries, - __u32 map_flags, int node); -int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name, - int key_size, int inner_map_fd, int max_entries, - __u32 map_flags); +LIBBPF_API int +bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr); +LIBBPF_API int bpf_create_map_node(enum bpf_map_type map_type, const char *name, + int key_size, int value_size, + int max_entries, __u32 map_flags, int node); +LIBBPF_API int bpf_create_map_name(enum bpf_map_type map_type, const char *name, + int key_size, int value_size, + int max_entries, __u32 map_flags); +LIBBPF_API int bpf_create_map(enum bpf_map_type map_type, int key_size, + int value_size, int max_entries, __u32 map_flags); +LIBBPF_API int bpf_create_map_in_map_node(enum bpf_map_type map_type, + const char *name, int key_size, + int inner_map_fd, int max_entries, + __u32 map_flags, int node); +LIBBPF_API int bpf_create_map_in_map(enum bpf_map_type map_type, + const char *name, int key_size, + int inner_map_fd, int max_entries, + __u32 map_flags); struct bpf_load_program_attr { enum bpf_prog_type prog_type; @@ -69,46 +76,56 @@ struct bpf_load_program_attr { __u32 prog_ifindex; }; +/* Flags to direct loading requirements */ +#define MAPS_RELAX_COMPAT 0x01 + /* Recommend log buffer size */ #define BPF_LOG_BUF_SIZE (256 * 1024) -int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, - char *log_buf, size_t log_buf_sz); -int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, - size_t insns_cnt, const char *license, - __u32 kern_version, char *log_buf, - size_t log_buf_sz); -int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, - size_t insns_cnt, int strict_alignment, - const char *license, __u32 kern_version, - char *log_buf, size_t log_buf_sz, int log_level); +LIBBPF_API int +bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, + char *log_buf, size_t log_buf_sz); +LIBBPF_API int bpf_load_program(enum bpf_prog_type type, + const struct bpf_insn *insns, size_t insns_cnt, + const char *license, __u32 kern_version, + char *log_buf, size_t log_buf_sz); +LIBBPF_API int bpf_verify_program(enum bpf_prog_type type, + const struct bpf_insn *insns, + size_t insns_cnt, int strict_alignment, + const char *license, __u32 kern_version, + char *log_buf, size_t log_buf_sz, + int log_level); -int bpf_map_update_elem(int fd, const void *key, const void *value, - __u64 flags); +LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value, + __u64 flags); -int bpf_map_lookup_elem(int fd, const void *key, void *value); -int bpf_map_delete_elem(int fd, const void *key); -int bpf_map_get_next_key(int fd, const void *key, void *next_key); -int bpf_obj_pin(int fd, const char *pathname); -int bpf_obj_get(const char *pathname); -int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, - unsigned int flags); -int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); -int bpf_prog_detach2(int prog_fd, int attachable_fd, enum bpf_attach_type type); -int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, - void *data_out, __u32 *size_out, __u32 *retval, - __u32 *duration); -int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id); -int bpf_map_get_next_id(__u32 start_id, __u32 *next_id); -int bpf_prog_get_fd_by_id(__u32 id); -int bpf_map_get_fd_by_id(__u32 id); -int bpf_btf_get_fd_by_id(__u32 id); -int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len); -int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, - __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt); -int bpf_raw_tracepoint_open(const char *name, int prog_fd); -int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size, - bool do_log); -int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, - __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, - __u64 *probe_addr); -#endif +LIBBPF_API int bpf_map_lookup_elem(int fd, const void *key, void *value); +LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key, + void *value); +LIBBPF_API int bpf_map_delete_elem(int fd, const void *key); +LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key); +LIBBPF_API int bpf_obj_pin(int fd, const char *pathname); +LIBBPF_API int bpf_obj_get(const char *pathname); +LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd, + enum bpf_attach_type type, unsigned int flags); +LIBBPF_API int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); +LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd, + enum bpf_attach_type type); +LIBBPF_API int bpf_prog_test_run(int prog_fd, int repeat, void *data, + __u32 size, void *data_out, __u32 *size_out, + __u32 *retval, __u32 *duration); +LIBBPF_API int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id); +LIBBPF_API int bpf_map_get_next_id(__u32 start_id, __u32 *next_id); +LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_map_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_btf_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len); +LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, + __u32 query_flags, __u32 *attach_flags, + __u32 *prog_ids, __u32 *prog_cnt); +LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd); +LIBBPF_API int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, + __u32 log_buf_size, bool do_log); +LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, + __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, + __u64 *probe_offset, __u64 *probe_addr); +#endif /* __LIBBPF_BPF_H */ diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index cf94b0770522..449591aa9900 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: LGPL-2.1 +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* Copyright (c) 2018 Facebook */ #include diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 4897e0724d4e..b77e7080f7e7 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -1,11 +1,15 @@ -/* SPDX-License-Identifier: LGPL-2.1 */ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ /* Copyright (c) 2018 Facebook */ -#ifndef __BPF_BTF_H -#define __BPF_BTF_H +#ifndef __LIBBPF_BTF_H +#define __LIBBPF_BTF_H #include +#ifndef LIBBPF_API +#define LIBBPF_API __attribute__((visibility("default"))) +#endif + #define BTF_ELF_SEC ".BTF" struct btf; @@ -14,13 +18,15 @@ struct btf_type; typedef int (*btf_print_fn_t)(const char *, ...) __attribute__((format(printf, 1, 2))); -void btf__free(struct btf *btf); -struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log); -__s32 btf__find_by_name(const struct btf *btf, const char *type_name); -const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id); -__s64 btf__resolve_size(const struct btf *btf, __u32 type_id); -int btf__resolve_type(const struct btf *btf, __u32 type_id); -int btf__fd(const struct btf *btf); -const char *btf__name_by_offset(const struct btf *btf, __u32 offset); +LIBBPF_API void btf__free(struct btf *btf); +LIBBPF_API struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log); +LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, + const char *type_name); +LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, + __u32 id); +LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id); +LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id); +LIBBPF_API int btf__fd(const struct btf *btf); +LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset); -#endif +#endif /* __LIBBPF_BTF_H */ diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index bdb94939fd60..d6e62e90e8d4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: LGPL-2.1 +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* * Common eBPF ELF object loading operations. @@ -7,19 +7,6 @@ * Copyright (C) 2015 Wang Nan * Copyright (C) 2015 Huawei Inc. * Copyright (C) 2017 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License (not later!) - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see */ #define _GNU_SOURCE @@ -32,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -40,6 +26,8 @@ #include #include #include +#include +#include #include #include #include @@ -182,7 +170,7 @@ static LIST_HEAD(bpf_objects_list); struct bpf_object { char license[64]; - u32 kern_version; + __u32 kern_version; struct bpf_program *programs; size_t nr_programs; @@ -228,7 +216,7 @@ struct bpf_object { }; #define obj_elf_valid(o) ((o)->efile.elf) -static void bpf_program__unload(struct bpf_program *prog) +void bpf_program__unload(struct bpf_program *prog) { int i; @@ -470,7 +458,8 @@ static int bpf_object__elf_init(struct bpf_object *obj) obj->efile.fd = open(obj->path, O_RDONLY); if (obj->efile.fd < 0) { char errmsg[STRERR_BUFSIZE]; - char *cp = str_error(errno, errmsg, sizeof(errmsg)); + char *cp = libbpf_strerror_r(errno, errmsg, + sizeof(errmsg)); pr_warning("failed to open %s: %s\n", obj->path, cp); return -errno; @@ -552,7 +541,7 @@ static int bpf_object__init_kversion(struct bpf_object *obj, void *data, size_t size) { - u32 kver; + __u32 kver; if (size != sizeof(kver)) { pr_warning("invalid kver section in %s\n", obj->path); @@ -574,8 +563,9 @@ static int compare_bpf_map(const void *_a, const void *_b) } static int -bpf_object__init_maps(struct bpf_object *obj) +bpf_object__init_maps(struct bpf_object *obj, int flags) { + bool strict = !(flags & MAPS_RELAX_COMPAT); int i, map_idx, map_def_sz, nr_maps = 0; Elf_Scn *scn; Elf_Data *data; @@ -697,7 +687,8 @@ bpf_object__init_maps(struct bpf_object *obj) "has unrecognized, non-zero " "options\n", obj->path, map_name); - return -EINVAL; + if (strict) + return -EINVAL; } } memcpy(&obj->maps[map_idx].def, def, @@ -728,7 +719,7 @@ static bool section_have_execinstr(struct bpf_object *obj, int idx) return false; } -static int bpf_object__elf_collect(struct bpf_object *obj) +static int bpf_object__elf_collect(struct bpf_object *obj, int flags) { Elf *elf = obj->efile.elf; GElf_Ehdr *ep = &obj->efile.ehdr; @@ -811,7 +802,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj) data->d_size, name, idx); if (err) { char errmsg[STRERR_BUFSIZE]; - char *cp = str_error(-err, errmsg, sizeof(errmsg)); + char *cp = libbpf_strerror_r(-err, errmsg, + sizeof(errmsg)); pr_warning("failed to alloc program %s (%s): %s", name, obj->path, cp); @@ -854,7 +846,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj) return LIBBPF_ERRNO__FORMAT; } if (obj->efile.maps_shndx >= 0) { - err = bpf_object__init_maps(obj); + err = bpf_object__init_maps(obj, flags); if (err) goto out; } @@ -1140,7 +1132,7 @@ bpf_object__create_maps(struct bpf_object *obj) *pfd = bpf_create_map_xattr(&create_attr); if (*pfd < 0 && create_attr.btf_key_type_id) { - cp = str_error(errno, errmsg, sizeof(errmsg)); + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", map->name, cp, errno); create_attr.btf_fd = 0; @@ -1155,7 +1147,7 @@ bpf_object__create_maps(struct bpf_object *obj) size_t j; err = *pfd; - cp = str_error(errno, errmsg, sizeof(errmsg)); + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); pr_warning("failed to create map (name: '%s'): %s\n", map->name, cp); for (j = 0; j < i; j++) @@ -1306,7 +1298,7 @@ static int bpf_object__collect_reloc(struct bpf_object *obj) static int load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type, const char *name, struct bpf_insn *insns, int insns_cnt, - char *license, u32 kern_version, int *pfd, int prog_ifindex) + char *license, __u32 kern_version, int *pfd, int prog_ifindex) { struct bpf_load_program_attr load_attr; char *cp, errmsg[STRERR_BUFSIZE]; @@ -1339,7 +1331,7 @@ load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type, } ret = -LIBBPF_ERRNO__LOAD; - cp = str_error(errno, errmsg, sizeof(errmsg)); + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); pr_warning("load bpf program failed: %s\n", cp); if (log_buf && log_buf[0] != '\0') { @@ -1375,9 +1367,9 @@ out: return ret; } -static int +int bpf_program__load(struct bpf_program *prog, - char *license, u32 kern_version) + char *license, __u32 kern_version) { int err = 0, fd, i; @@ -1502,6 +1494,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_LIRC_MODE2: case BPF_PROG_TYPE_SK_REUSEPORT: + case BPF_PROG_TYPE_FLOW_DISSECTOR: return false; case BPF_PROG_TYPE_UNSPEC: case BPF_PROG_TYPE_KPROBE: @@ -1525,7 +1518,7 @@ static int bpf_object__validate(struct bpf_object *obj, bool needs_kver) static struct bpf_object * __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz, - bool needs_kver) + bool needs_kver, int flags) { struct bpf_object *obj; int err; @@ -1541,7 +1534,7 @@ __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz, CHECK_ERR(bpf_object__elf_init(obj), err, out); CHECK_ERR(bpf_object__check_endianness(obj), err, out); - CHECK_ERR(bpf_object__elf_collect(obj), err, out); + CHECK_ERR(bpf_object__elf_collect(obj, flags), err, out); CHECK_ERR(bpf_object__collect_reloc(obj), err, out); CHECK_ERR(bpf_object__validate(obj, needs_kver), err, out); @@ -1552,7 +1545,8 @@ out: return ERR_PTR(err); } -struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr) +struct bpf_object *__bpf_object__open_xattr(struct bpf_object_open_attr *attr, + int flags) { /* param validation */ if (!attr->file) @@ -1561,7 +1555,13 @@ struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr) pr_debug("loading %s\n", attr->file); return __bpf_object__open(attr->file, NULL, 0, - bpf_prog_type__needs_kver(attr->prog_type)); + bpf_prog_type__needs_kver(attr->prog_type), + flags); +} + +struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr) +{ + return __bpf_object__open_xattr(attr, 0); } struct bpf_object *bpf_object__open(const char *path) @@ -1594,7 +1594,7 @@ struct bpf_object *bpf_object__open_buffer(void *obj_buf, pr_debug("loading object '%s' from buffer\n", name); - return __bpf_object__open(name, obj_buf, obj_buf_sz, true); + return __bpf_object__open(name, obj_buf, obj_buf_sz, true, true); } int bpf_object__unload(struct bpf_object *obj) @@ -1654,7 +1654,7 @@ static int check_path(const char *path) dir = dirname(dname); if (statfs(dir, &st_fs)) { - cp = str_error(errno, errmsg, sizeof(errmsg)); + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); pr_warning("failed to statfs %s: %s\n", dir, cp); err = -errno; } @@ -1690,7 +1690,7 @@ int bpf_program__pin_instance(struct bpf_program *prog, const char *path, } if (bpf_obj_pin(prog->instances.fds[instance], path)) { - cp = str_error(errno, errmsg, sizeof(errmsg)); + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); pr_warning("failed to pin program: %s\n", cp); return -errno; } @@ -1708,7 +1708,7 @@ static int make_dir(const char *path) err = -errno; if (err) { - cp = str_error(-err, errmsg, sizeof(errmsg)); + cp = libbpf_strerror_r(-err, errmsg, sizeof(errmsg)); pr_warning("failed to mkdir %s: %s\n", path, cp); } return err; @@ -1770,7 +1770,7 @@ int bpf_map__pin(struct bpf_map *map, const char *path) } if (bpf_obj_pin(map->fd, path)) { - cp = str_error(errno, errmsg, sizeof(errmsg)); + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); pr_warning("failed to pin map: %s\n", cp); return -errno; } @@ -2084,57 +2084,91 @@ void bpf_program__set_expected_attach_type(struct bpf_program *prog, prog->expected_attach_type = type; } -#define BPF_PROG_SEC_FULL(string, ptype, atype) \ - { string, sizeof(string) - 1, ptype, atype } +#define BPF_PROG_SEC_IMPL(string, ptype, eatype, is_attachable, atype) \ + { string, sizeof(string) - 1, ptype, eatype, is_attachable, atype } -#define BPF_PROG_SEC(string, ptype) BPF_PROG_SEC_FULL(string, ptype, 0) +/* Programs that can NOT be attached. */ +#define BPF_PROG_SEC(string, ptype) BPF_PROG_SEC_IMPL(string, ptype, 0, 0, 0) -#define BPF_S_PROG_SEC(string, ptype) \ - BPF_PROG_SEC_FULL(string, BPF_PROG_TYPE_CGROUP_SOCK, ptype) +/* Programs that can be attached. */ +#define BPF_APROG_SEC(string, ptype, atype) \ + BPF_PROG_SEC_IMPL(string, ptype, 0, 1, atype) -#define BPF_SA_PROG_SEC(string, ptype) \ - BPF_PROG_SEC_FULL(string, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, ptype) +/* Programs that must specify expected attach type at load time. */ +#define BPF_EAPROG_SEC(string, ptype, eatype) \ + BPF_PROG_SEC_IMPL(string, ptype, eatype, 1, eatype) + +/* Programs that can be attached but attach type can't be identified by section + * name. Kept for backward compatibility. + */ +#define BPF_APROG_COMPAT(string, ptype) BPF_PROG_SEC(string, ptype) static const struct { const char *sec; size_t len; enum bpf_prog_type prog_type; enum bpf_attach_type expected_attach_type; + int is_attachable; + enum bpf_attach_type attach_type; } section_names[] = { - BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), - BPF_PROG_SEC("kprobe/", BPF_PROG_TYPE_KPROBE), - BPF_PROG_SEC("kretprobe/", BPF_PROG_TYPE_KPROBE), - BPF_PROG_SEC("classifier", BPF_PROG_TYPE_SCHED_CLS), - BPF_PROG_SEC("action", BPF_PROG_TYPE_SCHED_ACT), - BPF_PROG_SEC("tracepoint/", BPF_PROG_TYPE_TRACEPOINT), - BPF_PROG_SEC("raw_tracepoint/", BPF_PROG_TYPE_RAW_TRACEPOINT), - BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), - BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), - BPF_PROG_SEC("cgroup/skb", BPF_PROG_TYPE_CGROUP_SKB), - BPF_PROG_SEC("cgroup/sock", BPF_PROG_TYPE_CGROUP_SOCK), - BPF_PROG_SEC("cgroup/dev", BPF_PROG_TYPE_CGROUP_DEVICE), - BPF_PROG_SEC("lwt_in", BPF_PROG_TYPE_LWT_IN), - BPF_PROG_SEC("lwt_out", BPF_PROG_TYPE_LWT_OUT), - BPF_PROG_SEC("lwt_xmit", BPF_PROG_TYPE_LWT_XMIT), - BPF_PROG_SEC("lwt_seg6local", BPF_PROG_TYPE_LWT_SEG6LOCAL), - BPF_PROG_SEC("sockops", BPF_PROG_TYPE_SOCK_OPS), - BPF_PROG_SEC("sk_skb", BPF_PROG_TYPE_SK_SKB), - BPF_PROG_SEC("sk_msg", BPF_PROG_TYPE_SK_MSG), - BPF_PROG_SEC("lirc_mode2", BPF_PROG_TYPE_LIRC_MODE2), - BPF_SA_PROG_SEC("cgroup/bind4", BPF_CGROUP_INET4_BIND), - BPF_SA_PROG_SEC("cgroup/bind6", BPF_CGROUP_INET6_BIND), - BPF_SA_PROG_SEC("cgroup/connect4", BPF_CGROUP_INET4_CONNECT), - BPF_SA_PROG_SEC("cgroup/connect6", BPF_CGROUP_INET6_CONNECT), - BPF_SA_PROG_SEC("cgroup/sendmsg4", BPF_CGROUP_UDP4_SENDMSG), - BPF_SA_PROG_SEC("cgroup/sendmsg6", BPF_CGROUP_UDP6_SENDMSG), - BPF_S_PROG_SEC("cgroup/post_bind4", BPF_CGROUP_INET4_POST_BIND), - BPF_S_PROG_SEC("cgroup/post_bind6", BPF_CGROUP_INET6_POST_BIND), + BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), + BPF_PROG_SEC("kprobe/", BPF_PROG_TYPE_KPROBE), + BPF_PROG_SEC("kretprobe/", BPF_PROG_TYPE_KPROBE), + BPF_PROG_SEC("classifier", BPF_PROG_TYPE_SCHED_CLS), + BPF_PROG_SEC("action", BPF_PROG_TYPE_SCHED_ACT), + BPF_PROG_SEC("tracepoint/", BPF_PROG_TYPE_TRACEPOINT), + BPF_PROG_SEC("raw_tracepoint/", BPF_PROG_TYPE_RAW_TRACEPOINT), + BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), + BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), + BPF_PROG_SEC("lwt_in", BPF_PROG_TYPE_LWT_IN), + BPF_PROG_SEC("lwt_out", BPF_PROG_TYPE_LWT_OUT), + BPF_PROG_SEC("lwt_xmit", BPF_PROG_TYPE_LWT_XMIT), + BPF_PROG_SEC("lwt_seg6local", BPF_PROG_TYPE_LWT_SEG6LOCAL), + BPF_APROG_SEC("cgroup_skb/ingress", BPF_PROG_TYPE_CGROUP_SKB, + BPF_CGROUP_INET_INGRESS), + BPF_APROG_SEC("cgroup_skb/egress", BPF_PROG_TYPE_CGROUP_SKB, + BPF_CGROUP_INET_EGRESS), + BPF_APROG_COMPAT("cgroup/skb", BPF_PROG_TYPE_CGROUP_SKB), + BPF_APROG_SEC("cgroup/sock", BPF_PROG_TYPE_CGROUP_SOCK, + BPF_CGROUP_INET_SOCK_CREATE), + BPF_EAPROG_SEC("cgroup/post_bind4", BPF_PROG_TYPE_CGROUP_SOCK, + BPF_CGROUP_INET4_POST_BIND), + BPF_EAPROG_SEC("cgroup/post_bind6", BPF_PROG_TYPE_CGROUP_SOCK, + BPF_CGROUP_INET6_POST_BIND), + BPF_APROG_SEC("cgroup/dev", BPF_PROG_TYPE_CGROUP_DEVICE, + BPF_CGROUP_DEVICE), + BPF_APROG_SEC("sockops", BPF_PROG_TYPE_SOCK_OPS, + BPF_CGROUP_SOCK_OPS), + BPF_APROG_SEC("sk_skb/stream_parser", BPF_PROG_TYPE_SK_SKB, + BPF_SK_SKB_STREAM_PARSER), + BPF_APROG_SEC("sk_skb/stream_verdict", BPF_PROG_TYPE_SK_SKB, + BPF_SK_SKB_STREAM_VERDICT), + BPF_APROG_COMPAT("sk_skb", BPF_PROG_TYPE_SK_SKB), + BPF_APROG_SEC("sk_msg", BPF_PROG_TYPE_SK_MSG, + BPF_SK_MSG_VERDICT), + BPF_APROG_SEC("lirc_mode2", BPF_PROG_TYPE_LIRC_MODE2, + BPF_LIRC_MODE2), + BPF_APROG_SEC("flow_dissector", BPF_PROG_TYPE_FLOW_DISSECTOR, + BPF_FLOW_DISSECTOR), + BPF_EAPROG_SEC("cgroup/bind4", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET4_BIND), + BPF_EAPROG_SEC("cgroup/bind6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET6_BIND), + BPF_EAPROG_SEC("cgroup/connect4", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET4_CONNECT), + BPF_EAPROG_SEC("cgroup/connect6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET6_CONNECT), + BPF_EAPROG_SEC("cgroup/sendmsg4", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_UDP4_SENDMSG), + BPF_EAPROG_SEC("cgroup/sendmsg6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_UDP6_SENDMSG), }; +#undef BPF_PROG_SEC_IMPL #undef BPF_PROG_SEC -#undef BPF_PROG_SEC_FULL -#undef BPF_S_PROG_SEC -#undef BPF_SA_PROG_SEC +#undef BPF_APROG_SEC +#undef BPF_EAPROG_SEC +#undef BPF_APROG_COMPAT int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, enum bpf_attach_type *expected_attach_type) @@ -2154,6 +2188,25 @@ int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, return -EINVAL; } +int libbpf_attach_type_by_name(const char *name, + enum bpf_attach_type *attach_type) +{ + int i; + + if (!name) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(section_names); i++) { + if (strncmp(name, section_names[i].sec, section_names[i].len)) + continue; + if (!section_names[i].is_attachable) + return -EINVAL; + *attach_type = section_names[i].attach_type; + return 0; + } + return -EINVAL; +} + static int bpf_program__identify_section(struct bpf_program *prog, enum bpf_prog_type *prog_type, @@ -2336,7 +2389,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, bpf_program__set_expected_attach_type(prog, expected_attach_type); - if (!bpf_program__is_function_storage(prog, obj) && !first_prog) + if (!first_prog) first_prog = prog; } @@ -2363,61 +2416,49 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, } enum bpf_perf_event_ret -bpf_perf_event_read_simple(void *mem, unsigned long size, - unsigned long page_size, void **buf, size_t *buf_len, - bpf_perf_event_print_t fn, void *priv) +bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, + void **copy_mem, size_t *copy_size, + bpf_perf_event_print_t fn, void *private_data) { - volatile struct perf_event_mmap_page *header = mem; + struct perf_event_mmap_page *header = mmap_mem; + __u64 data_head = ring_buffer_read_head(header); __u64 data_tail = header->data_tail; - __u64 data_head = header->data_head; - int ret = LIBBPF_PERF_EVENT_ERROR; - void *base, *begin, *end; - - asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */ - if (data_head == data_tail) - return LIBBPF_PERF_EVENT_CONT; - - base = ((char *)header) + page_size; - - begin = base + data_tail % size; - end = base + data_head % size; - - while (begin != end) { - struct perf_event_header *ehdr; - - ehdr = begin; - if (begin + ehdr->size > base + size) { - long len = base + size - begin; - - if (*buf_len < ehdr->size) { - free(*buf); - *buf = malloc(ehdr->size); - if (!*buf) { + void *base = ((__u8 *)header) + page_size; + int ret = LIBBPF_PERF_EVENT_CONT; + struct perf_event_header *ehdr; + size_t ehdr_size; + + while (data_head != data_tail) { + ehdr = base + (data_tail & (mmap_size - 1)); + ehdr_size = ehdr->size; + + if (((void *)ehdr) + ehdr_size > base + mmap_size) { + void *copy_start = ehdr; + size_t len_first = base + mmap_size - copy_start; + size_t len_secnd = ehdr_size - len_first; + + if (*copy_size < ehdr_size) { + free(*copy_mem); + *copy_mem = malloc(ehdr_size); + if (!*copy_mem) { + *copy_size = 0; ret = LIBBPF_PERF_EVENT_ERROR; break; } - *buf_len = ehdr->size; + *copy_size = ehdr_size; } - memcpy(*buf, begin, len); - memcpy(*buf + len, base, ehdr->size - len); - ehdr = (void *)*buf; - begin = base + ehdr->size - len; - } else if (begin + ehdr->size == base + size) { - begin = base; - } else { - begin += ehdr->size; + memcpy(*copy_mem, copy_start, len_first); + memcpy(*copy_mem + len_first, base, len_secnd); + ehdr = *copy_mem; } - ret = fn(ehdr, priv); + ret = fn(ehdr, private_data); + data_tail += ehdr_size; if (ret != LIBBPF_PERF_EVENT_CONT) break; - - data_tail += ehdr->size; } - __sync_synchronize(); /* smp_mb() */ - header->data_tail = data_tail; - + ring_buffer_write_tail(header, data_tail); return ret; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 96c55fac54c3..1f3468dad8b2 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: LGPL-2.1 */ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ /* * Common eBPF ELF object loading operations. @@ -6,22 +6,9 @@ * Copyright (C) 2013-2015 Alexei Starovoitov * Copyright (C) 2015 Wang Nan * Copyright (C) 2015 Huawei Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License (not later!) - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see */ -#ifndef __BPF_LIBBPF_H -#define __BPF_LIBBPF_H +#ifndef __LIBBPF_LIBBPF_H +#define __LIBBPF_LIBBPF_H #include #include @@ -29,6 +16,10 @@ #include // for size_t #include +#ifndef LIBBPF_API +#define LIBBPF_API __attribute__((visibility("default"))) +#endif + enum libbpf_errno { __LIBBPF_ERRNO__START = 4000, @@ -46,10 +37,11 @@ enum libbpf_errno { LIBBPF_ERRNO__PROGTYPE, /* Kernel doesn't support this program type */ LIBBPF_ERRNO__WRNGPID, /* Wrong pid in netlink message */ LIBBPF_ERRNO__INVSEQ, /* Invalid netlink sequence */ + LIBBPF_ERRNO__NLPARSE, /* netlink parsing error */ __LIBBPF_ERRNO__END, }; -int libbpf_strerror(int err, char *buf, size_t size); +LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size); /* * __printf is defined in include/linux/compiler-gcc.h. However, @@ -59,9 +51,9 @@ int libbpf_strerror(int err, char *buf, size_t size); typedef int (*libbpf_print_fn_t)(const char *, ...) __attribute__((format(printf, 1, 2))); -void libbpf_set_print(libbpf_print_fn_t warn, - libbpf_print_fn_t info, - libbpf_print_fn_t debug); +LIBBPF_API void libbpf_set_print(libbpf_print_fn_t warn, + libbpf_print_fn_t info, + libbpf_print_fn_t debug); /* Hide internal to user */ struct bpf_object; @@ -71,25 +63,28 @@ struct bpf_object_open_attr { enum bpf_prog_type prog_type; }; -struct bpf_object *bpf_object__open(const char *path); -struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr); -struct bpf_object *bpf_object__open_buffer(void *obj_buf, - size_t obj_buf_sz, - const char *name); -int bpf_object__pin(struct bpf_object *object, const char *path); -void bpf_object__close(struct bpf_object *object); +LIBBPF_API struct bpf_object *bpf_object__open(const char *path); +LIBBPF_API struct bpf_object * +bpf_object__open_xattr(struct bpf_object_open_attr *attr); +struct bpf_object *__bpf_object__open_xattr(struct bpf_object_open_attr *attr, + int flags); +LIBBPF_API struct bpf_object *bpf_object__open_buffer(void *obj_buf, + size_t obj_buf_sz, + const char *name); +LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char *path); +LIBBPF_API void bpf_object__close(struct bpf_object *object); /* Load/unload object into/from kernel */ -int bpf_object__load(struct bpf_object *obj); -int bpf_object__unload(struct bpf_object *obj); -const char *bpf_object__name(struct bpf_object *obj); -unsigned int bpf_object__kversion(struct bpf_object *obj); -int bpf_object__btf_fd(const struct bpf_object *obj); +LIBBPF_API int bpf_object__load(struct bpf_object *obj); +LIBBPF_API int bpf_object__unload(struct bpf_object *obj); +LIBBPF_API const char *bpf_object__name(struct bpf_object *obj); +LIBBPF_API unsigned int bpf_object__kversion(struct bpf_object *obj); +LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj); -struct bpf_program * +LIBBPF_API struct bpf_program * bpf_object__find_program_by_title(struct bpf_object *obj, const char *title); -struct bpf_object *bpf_object__next(struct bpf_object *prev); +LIBBPF_API struct bpf_object *bpf_object__next(struct bpf_object *prev); #define bpf_object__for_each_safe(pos, tmp) \ for ((pos) = bpf_object__next(NULL), \ (tmp) = bpf_object__next(pos); \ @@ -97,17 +92,20 @@ struct bpf_object *bpf_object__next(struct bpf_object *prev); (pos) = (tmp), (tmp) = bpf_object__next(tmp)) typedef void (*bpf_object_clear_priv_t)(struct bpf_object *, void *); -int bpf_object__set_priv(struct bpf_object *obj, void *priv, - bpf_object_clear_priv_t clear_priv); -void *bpf_object__priv(struct bpf_object *prog); +LIBBPF_API int bpf_object__set_priv(struct bpf_object *obj, void *priv, + bpf_object_clear_priv_t clear_priv); +LIBBPF_API void *bpf_object__priv(struct bpf_object *prog); -int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, - enum bpf_attach_type *expected_attach_type); +LIBBPF_API int +libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, + enum bpf_attach_type *expected_attach_type); +LIBBPF_API int libbpf_attach_type_by_name(const char *name, + enum bpf_attach_type *attach_type); /* Accessors of bpf_program */ struct bpf_program; -struct bpf_program *bpf_program__next(struct bpf_program *prog, - struct bpf_object *obj); +LIBBPF_API struct bpf_program *bpf_program__next(struct bpf_program *prog, + struct bpf_object *obj); #define bpf_object__for_each_program(pos, obj) \ for ((pos) = bpf_program__next(NULL, (obj)); \ @@ -117,18 +115,24 @@ struct bpf_program *bpf_program__next(struct bpf_program *prog, typedef void (*bpf_program_clear_priv_t)(struct bpf_program *, void *); -int bpf_program__set_priv(struct bpf_program *prog, void *priv, - bpf_program_clear_priv_t clear_priv); +LIBBPF_API int bpf_program__set_priv(struct bpf_program *prog, void *priv, + bpf_program_clear_priv_t clear_priv); -void *bpf_program__priv(struct bpf_program *prog); -void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex); +LIBBPF_API void *bpf_program__priv(struct bpf_program *prog); +LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog, + __u32 ifindex); -const char *bpf_program__title(struct bpf_program *prog, bool needs_copy); +LIBBPF_API const char *bpf_program__title(struct bpf_program *prog, + bool needs_copy); -int bpf_program__fd(struct bpf_program *prog); -int bpf_program__pin_instance(struct bpf_program *prog, const char *path, - int instance); -int bpf_program__pin(struct bpf_program *prog, const char *path); +LIBBPF_API int bpf_program__load(struct bpf_program *prog, char *license, + __u32 kern_version); +LIBBPF_API int bpf_program__fd(struct bpf_program *prog); +LIBBPF_API int bpf_program__pin_instance(struct bpf_program *prog, + const char *path, + int instance); +LIBBPF_API int bpf_program__pin(struct bpf_program *prog, const char *path); +LIBBPF_API void bpf_program__unload(struct bpf_program *prog); struct bpf_insn; @@ -189,34 +193,36 @@ typedef int (*bpf_program_prep_t)(struct bpf_program *prog, int n, struct bpf_insn *insns, int insns_cnt, struct bpf_prog_prep_result *res); -int bpf_program__set_prep(struct bpf_program *prog, int nr_instance, - bpf_program_prep_t prep); +LIBBPF_API int bpf_program__set_prep(struct bpf_program *prog, int nr_instance, + bpf_program_prep_t prep); -int bpf_program__nth_fd(struct bpf_program *prog, int n); +LIBBPF_API int bpf_program__nth_fd(struct bpf_program *prog, int n); /* * Adjust type of BPF program. Default is kprobe. */ -int bpf_program__set_socket_filter(struct bpf_program *prog); -int bpf_program__set_tracepoint(struct bpf_program *prog); -int bpf_program__set_raw_tracepoint(struct bpf_program *prog); -int bpf_program__set_kprobe(struct bpf_program *prog); -int bpf_program__set_sched_cls(struct bpf_program *prog); -int bpf_program__set_sched_act(struct bpf_program *prog); -int bpf_program__set_xdp(struct bpf_program *prog); -int bpf_program__set_perf_event(struct bpf_program *prog); -void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type); -void bpf_program__set_expected_attach_type(struct bpf_program *prog, - enum bpf_attach_type type); - -bool bpf_program__is_socket_filter(struct bpf_program *prog); -bool bpf_program__is_tracepoint(struct bpf_program *prog); -bool bpf_program__is_raw_tracepoint(struct bpf_program *prog); -bool bpf_program__is_kprobe(struct bpf_program *prog); -bool bpf_program__is_sched_cls(struct bpf_program *prog); -bool bpf_program__is_sched_act(struct bpf_program *prog); -bool bpf_program__is_xdp(struct bpf_program *prog); -bool bpf_program__is_perf_event(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_socket_filter(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_tracepoint(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_raw_tracepoint(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_kprobe(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_sched_cls(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_sched_act(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_xdp(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_perf_event(struct bpf_program *prog); +LIBBPF_API void bpf_program__set_type(struct bpf_program *prog, + enum bpf_prog_type type); +LIBBPF_API void +bpf_program__set_expected_attach_type(struct bpf_program *prog, + enum bpf_attach_type type); + +LIBBPF_API bool bpf_program__is_socket_filter(struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_tracepoint(struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_raw_tracepoint(struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_kprobe(struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_sched_cls(struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_sched_act(struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_xdp(struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_perf_event(struct bpf_program *prog); /* * No need for __attribute__((packed)), all members of 'bpf_map_def' @@ -237,39 +243,39 @@ struct bpf_map_def { * so no need to worry about a name clash. */ struct bpf_map; -struct bpf_map * +LIBBPF_API struct bpf_map * bpf_object__find_map_by_name(struct bpf_object *obj, const char *name); /* * Get bpf_map through the offset of corresponding struct bpf_map_def * in the BPF object file. */ -struct bpf_map * +LIBBPF_API struct bpf_map * bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset); -struct bpf_map * +LIBBPF_API struct bpf_map * bpf_map__next(struct bpf_map *map, struct bpf_object *obj); #define bpf_map__for_each(pos, obj) \ for ((pos) = bpf_map__next(NULL, (obj)); \ (pos) != NULL; \ (pos) = bpf_map__next((pos), (obj))) -int bpf_map__fd(struct bpf_map *map); -const struct bpf_map_def *bpf_map__def(struct bpf_map *map); -const char *bpf_map__name(struct bpf_map *map); -__u32 bpf_map__btf_key_type_id(const struct bpf_map *map); -__u32 bpf_map__btf_value_type_id(const struct bpf_map *map); +LIBBPF_API int bpf_map__fd(struct bpf_map *map); +LIBBPF_API const struct bpf_map_def *bpf_map__def(struct bpf_map *map); +LIBBPF_API const char *bpf_map__name(struct bpf_map *map); +LIBBPF_API __u32 bpf_map__btf_key_type_id(const struct bpf_map *map); +LIBBPF_API __u32 bpf_map__btf_value_type_id(const struct bpf_map *map); typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *); -int bpf_map__set_priv(struct bpf_map *map, void *priv, - bpf_map_clear_priv_t clear_priv); -void *bpf_map__priv(struct bpf_map *map); -int bpf_map__reuse_fd(struct bpf_map *map, int fd); -bool bpf_map__is_offload_neutral(struct bpf_map *map); -void bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex); -int bpf_map__pin(struct bpf_map *map, const char *path); +LIBBPF_API int bpf_map__set_priv(struct bpf_map *map, void *priv, + bpf_map_clear_priv_t clear_priv); +LIBBPF_API void *bpf_map__priv(struct bpf_map *map); +LIBBPF_API int bpf_map__reuse_fd(struct bpf_map *map, int fd); +LIBBPF_API bool bpf_map__is_offload_neutral(struct bpf_map *map); +LIBBPF_API void bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex); +LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); -long libbpf_get_error(const void *ptr); +LIBBPF_API long libbpf_get_error(const void *ptr); struct bpf_prog_load_attr { const char *file; @@ -278,12 +284,12 @@ struct bpf_prog_load_attr { int ifindex; }; -int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, - struct bpf_object **pobj, int *prog_fd); -int bpf_prog_load(const char *file, enum bpf_prog_type type, - struct bpf_object **pobj, int *prog_fd); +LIBBPF_API int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, + struct bpf_object **pobj, int *prog_fd); +LIBBPF_API int bpf_prog_load(const char *file, enum bpf_prog_type type, + struct bpf_object **pobj, int *prog_fd); -int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); +LIBBPF_API int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); enum bpf_perf_event_ret { LIBBPF_PERF_EVENT_DONE = 0, @@ -291,10 +297,24 @@ enum bpf_perf_event_ret { LIBBPF_PERF_EVENT_CONT = -2, }; -typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(void *event, - void *priv); -int bpf_perf_event_read_simple(void *mem, unsigned long size, - unsigned long page_size, - void **buf, size_t *buf_len, - bpf_perf_event_print_t fn, void *priv); -#endif +struct perf_event_header; +typedef enum bpf_perf_event_ret + (*bpf_perf_event_print_t)(struct perf_event_header *hdr, + void *private_data); +LIBBPF_API enum bpf_perf_event_ret +bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, + void **copy_mem, size_t *copy_size, + bpf_perf_event_print_t fn, void *private_data); + +struct nlattr; +typedef int (*libbpf_dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); +int libbpf_netlink_open(unsigned int *nl_pid); +int libbpf_nl_get_link(int sock, unsigned int nl_pid, + libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie); +int libbpf_nl_get_class(int sock, unsigned int nl_pid, int ifindex, + libbpf_dump_nlmsg_t dump_class_nlmsg, void *cookie); +int libbpf_nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex, + libbpf_dump_nlmsg_t dump_qdisc_nlmsg, void *cookie); +int libbpf_nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle, + libbpf_dump_nlmsg_t dump_filter_nlmsg, void *cookie); +#endif /* __LIBBPF_LIBBPF_H */ diff --git a/tools/lib/bpf/libbpf_errno.c b/tools/lib/bpf/libbpf_errno.c index d9ba851bd7f9..d83b17f8435c 100644 --- a/tools/lib/bpf/libbpf_errno.c +++ b/tools/lib/bpf/libbpf_errno.c @@ -1,23 +1,10 @@ -// SPDX-License-Identifier: LGPL-2.1 +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* * Copyright (C) 2013-2015 Alexei Starovoitov * Copyright (C) 2015 Wang Nan * Copyright (C) 2015 Huawei Inc. * Copyright (C) 2017 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License (not later!) - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see */ #include @@ -42,6 +29,7 @@ static const char *libbpf_strerror_table[NR_ERRNO] = { [ERRCODE_OFFSET(PROGTYPE)] = "Kernel doesn't support this program type", [ERRCODE_OFFSET(WRNGPID)] = "Wrong pid in netlink message", [ERRCODE_OFFSET(INVSEQ)] = "Invalid netlink sequence", + [ERRCODE_OFFSET(NLPARSE)] = "Incorrect netlink message parsing", }; int libbpf_strerror(int err, char *buf, size_t size) diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c new file mode 100644 index 000000000000..0ce67aea8f3b --- /dev/null +++ b/tools/lib/bpf/netlink.c @@ -0,0 +1,337 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2018 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf.h" +#include "libbpf.h" +#include "nlattr.h" + +#ifndef SOL_NETLINK +#define SOL_NETLINK 270 +#endif + +typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, libbpf_dump_nlmsg_t, + void *cookie); + +int libbpf_netlink_open(__u32 *nl_pid) +{ + struct sockaddr_nl sa; + socklen_t addrlen; + int one = 1, ret; + int sock; + + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) + return -errno; + + if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK, + &one, sizeof(one)) < 0) { + fprintf(stderr, "Netlink error reporting not supported\n"); + } + + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + ret = -errno; + goto cleanup; + } + + addrlen = sizeof(sa); + if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) { + ret = -errno; + goto cleanup; + } + + if (addrlen != sizeof(sa)) { + ret = -LIBBPF_ERRNO__INTERNAL; + goto cleanup; + } + + *nl_pid = sa.nl_pid; + return sock; + +cleanup: + close(sock); + return ret; +} + +static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq, + __dump_nlmsg_t _fn, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + bool multipart = true; + struct nlmsgerr *err; + struct nlmsghdr *nh; + char buf[4096]; + int len, ret; + + while (multipart) { + multipart = false; + len = recv(sock, buf, sizeof(buf), 0); + if (len < 0) { + ret = -errno; + goto done; + } + + if (len == 0) + break; + + for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); + nh = NLMSG_NEXT(nh, len)) { + if (nh->nlmsg_pid != nl_pid) { + ret = -LIBBPF_ERRNO__WRNGPID; + goto done; + } + if (nh->nlmsg_seq != seq) { + ret = -LIBBPF_ERRNO__INVSEQ; + goto done; + } + if (nh->nlmsg_flags & NLM_F_MULTI) + multipart = true; + switch (nh->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(nh); + if (!err->error) + continue; + ret = err->error; + libbpf_nla_dump_errormsg(nh); + goto done; + case NLMSG_DONE: + return 0; + default: + break; + } + if (_fn) { + ret = _fn(nh, fn, cookie); + if (ret) + return ret; + } + } + } + ret = 0; +done: + return ret; +} + +int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) +{ + int sock, seq = 0, ret; + struct nlattr *nla, *nla_xdp; + struct { + struct nlmsghdr nh; + struct ifinfomsg ifinfo; + char attrbuf[64]; + } req; + __u32 nl_pid; + + sock = libbpf_netlink_open(&nl_pid); + if (sock < 0) + return sock; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_SETLINK; + req.nh.nlmsg_pid = 0; + req.nh.nlmsg_seq = ++seq; + req.ifinfo.ifi_family = AF_UNSPEC; + req.ifinfo.ifi_index = ifindex; + + /* started nested attribute for XDP */ + nla = (struct nlattr *)(((char *)&req) + + NLMSG_ALIGN(req.nh.nlmsg_len)); + nla->nla_type = NLA_F_NESTED | IFLA_XDP; + nla->nla_len = NLA_HDRLEN; + + /* add XDP fd */ + nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); + nla_xdp->nla_type = IFLA_XDP_FD; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); + memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); + nla->nla_len += nla_xdp->nla_len; + + /* if user passed in any flags, add those too */ + if (flags) { + nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); + nla_xdp->nla_type = IFLA_XDP_FLAGS; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags); + memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags)); + nla->nla_len += nla_xdp->nla_len; + } + + req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); + + if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { + ret = -errno; + goto cleanup; + } + ret = bpf_netlink_recv(sock, nl_pid, seq, NULL, NULL, NULL); + +cleanup: + close(sock); + return ret; +} + +static int __dump_link_nlmsg(struct nlmsghdr *nlh, + libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie) +{ + struct nlattr *tb[IFLA_MAX + 1], *attr; + struct ifinfomsg *ifi = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi)); + attr = (struct nlattr *) ((void *) ifi + NLMSG_ALIGN(sizeof(*ifi))); + if (libbpf_nla_parse(tb, IFLA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_link_nlmsg(cookie, ifi, tb); +} + +int libbpf_nl_get_link(int sock, unsigned int nl_pid, + libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct ifinfomsg ifm; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nlh.nlmsg_type = RTM_GETLINK, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .ifm.ifi_family = AF_PACKET, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return bpf_netlink_recv(sock, nl_pid, seq, __dump_link_nlmsg, + dump_link_nlmsg, cookie); +} + +static int __dump_class_nlmsg(struct nlmsghdr *nlh, + libbpf_dump_nlmsg_t dump_class_nlmsg, + void *cookie) +{ + struct nlattr *tb[TCA_MAX + 1], *attr; + struct tcmsg *t = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t)); + attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t))); + if (libbpf_nla_parse(tb, TCA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_class_nlmsg(cookie, t, tb); +} + +int libbpf_nl_get_class(int sock, unsigned int nl_pid, int ifindex, + libbpf_dump_nlmsg_t dump_class_nlmsg, void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct tcmsg t; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)), + .nlh.nlmsg_type = RTM_GETTCLASS, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .t.tcm_family = AF_UNSPEC, + .t.tcm_ifindex = ifindex, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return bpf_netlink_recv(sock, nl_pid, seq, __dump_class_nlmsg, + dump_class_nlmsg, cookie); +} + +static int __dump_qdisc_nlmsg(struct nlmsghdr *nlh, + libbpf_dump_nlmsg_t dump_qdisc_nlmsg, + void *cookie) +{ + struct nlattr *tb[TCA_MAX + 1], *attr; + struct tcmsg *t = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t)); + attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t))); + if (libbpf_nla_parse(tb, TCA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_qdisc_nlmsg(cookie, t, tb); +} + +int libbpf_nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex, + libbpf_dump_nlmsg_t dump_qdisc_nlmsg, void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct tcmsg t; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)), + .nlh.nlmsg_type = RTM_GETQDISC, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .t.tcm_family = AF_UNSPEC, + .t.tcm_ifindex = ifindex, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return bpf_netlink_recv(sock, nl_pid, seq, __dump_qdisc_nlmsg, + dump_qdisc_nlmsg, cookie); +} + +static int __dump_filter_nlmsg(struct nlmsghdr *nlh, + libbpf_dump_nlmsg_t dump_filter_nlmsg, + void *cookie) +{ + struct nlattr *tb[TCA_MAX + 1], *attr; + struct tcmsg *t = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t)); + attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t))); + if (libbpf_nla_parse(tb, TCA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_filter_nlmsg(cookie, t, tb); +} + +int libbpf_nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle, + libbpf_dump_nlmsg_t dump_filter_nlmsg, void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct tcmsg t; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)), + .nlh.nlmsg_type = RTM_GETTFILTER, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .t.tcm_family = AF_UNSPEC, + .t.tcm_ifindex = ifindex, + .t.tcm_parent = handle, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return bpf_netlink_recv(sock, nl_pid, seq, __dump_filter_nlmsg, + dump_filter_nlmsg, cookie); +} diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c index 4719434278b2..1e69c0c8d413 100644 --- a/tools/lib/bpf/nlattr.c +++ b/tools/lib/bpf/nlattr.c @@ -1,13 +1,8 @@ -// SPDX-License-Identifier: LGPL-2.1 +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* * NETLINK Netlink attributes * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation version 2.1 - * of the License. - * * Copyright (c) 2003-2013 Thomas Graf */ @@ -17,20 +12,15 @@ #include #include -static uint16_t nla_attr_minlen[NLA_TYPE_MAX+1] = { - [NLA_U8] = sizeof(uint8_t), - [NLA_U16] = sizeof(uint16_t), - [NLA_U32] = sizeof(uint32_t), - [NLA_U64] = sizeof(uint64_t), - [NLA_STRING] = 1, - [NLA_FLAG] = 0, +static uint16_t nla_attr_minlen[LIBBPF_NLA_TYPE_MAX+1] = { + [LIBBPF_NLA_U8] = sizeof(uint8_t), + [LIBBPF_NLA_U16] = sizeof(uint16_t), + [LIBBPF_NLA_U32] = sizeof(uint32_t), + [LIBBPF_NLA_U64] = sizeof(uint64_t), + [LIBBPF_NLA_STRING] = 1, + [LIBBPF_NLA_FLAG] = 0, }; -static int nla_len(const struct nlattr *nla) -{ - return nla->nla_len - NLA_HDRLEN; -} - static struct nlattr *nla_next(const struct nlattr *nla, int *remaining) { int totlen = NLA_ALIGN(nla->nla_len); @@ -46,20 +36,15 @@ static int nla_ok(const struct nlattr *nla, int remaining) nla->nla_len <= remaining; } -static void *nla_data(const struct nlattr *nla) -{ - return (char *) nla + NLA_HDRLEN; -} - static int nla_type(const struct nlattr *nla) { return nla->nla_type & NLA_TYPE_MASK; } static int validate_nla(struct nlattr *nla, int maxtype, - struct nla_policy *policy) + struct libbpf_nla_policy *policy) { - struct nla_policy *pt; + struct libbpf_nla_policy *pt; unsigned int minlen = 0; int type = nla_type(nla); @@ -68,23 +53,24 @@ static int validate_nla(struct nlattr *nla, int maxtype, pt = &policy[type]; - if (pt->type > NLA_TYPE_MAX) + if (pt->type > LIBBPF_NLA_TYPE_MAX) return 0; if (pt->minlen) minlen = pt->minlen; - else if (pt->type != NLA_UNSPEC) + else if (pt->type != LIBBPF_NLA_UNSPEC) minlen = nla_attr_minlen[pt->type]; - if (nla_len(nla) < minlen) + if (libbpf_nla_len(nla) < minlen) return -1; - if (pt->maxlen && nla_len(nla) > pt->maxlen) + if (pt->maxlen && libbpf_nla_len(nla) > pt->maxlen) return -1; - if (pt->type == NLA_STRING) { - char *data = nla_data(nla); - if (data[nla_len(nla) - 1] != '\0') + if (pt->type == LIBBPF_NLA_STRING) { + char *data = libbpf_nla_data(nla); + + if (data[libbpf_nla_len(nla) - 1] != '\0') return -1; } @@ -114,15 +100,15 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh) * @see nla_validate * @return 0 on success or a negative error code. */ -static int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, - struct nla_policy *policy) +int libbpf_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, + int len, struct libbpf_nla_policy *policy) { struct nlattr *nla; int rem, err; memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); - nla_for_each_attr(nla, head, len, rem) { + libbpf_nla_for_each_attr(nla, head, len, rem) { int type = nla_type(nla); if (type > maxtype) @@ -146,12 +132,33 @@ errout: return err; } +/** + * Create attribute index based on nested attribute + * @arg tb Index array to be filled (maxtype+1 elements). + * @arg maxtype Maximum attribute type expected and accepted. + * @arg nla Nested Attribute. + * @arg policy Attribute validation policy. + * + * Feeds the stream of attributes nested into the specified attribute + * to libbpf_nla_parse(). + * + * @see libbpf_nla_parse + * @return 0 on success or a negative error code. + */ +int libbpf_nla_parse_nested(struct nlattr *tb[], int maxtype, + struct nlattr *nla, + struct libbpf_nla_policy *policy) +{ + return libbpf_nla_parse(tb, maxtype, libbpf_nla_data(nla), + libbpf_nla_len(nla), policy); +} + /* dump netlink extended ack error message */ -int nla_dump_errormsg(struct nlmsghdr *nlh) +int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh) { - struct nla_policy extack_policy[NLMSGERR_ATTR_MAX + 1] = { - [NLMSGERR_ATTR_MSG] = { .type = NLA_STRING }, - [NLMSGERR_ATTR_OFFS] = { .type = NLA_U32 }, + struct libbpf_nla_policy extack_policy[NLMSGERR_ATTR_MAX + 1] = { + [NLMSGERR_ATTR_MSG] = { .type = LIBBPF_NLA_STRING }, + [NLMSGERR_ATTR_OFFS] = { .type = LIBBPF_NLA_U32 }, }; struct nlattr *tb[NLMSGERR_ATTR_MAX + 1], *attr; struct nlmsgerr *err; @@ -172,14 +179,15 @@ int nla_dump_errormsg(struct nlmsghdr *nlh) attr = (struct nlattr *) ((void *) err + hlen); alen = nlh->nlmsg_len - hlen; - if (nla_parse(tb, NLMSGERR_ATTR_MAX, attr, alen, extack_policy) != 0) { + if (libbpf_nla_parse(tb, NLMSGERR_ATTR_MAX, attr, alen, + extack_policy) != 0) { fprintf(stderr, "Failed to parse extended error attributes\n"); return 0; } if (tb[NLMSGERR_ATTR_MSG]) - errmsg = (char *) nla_data(tb[NLMSGERR_ATTR_MSG]); + errmsg = (char *) libbpf_nla_data(tb[NLMSGERR_ATTR_MSG]); fprintf(stderr, "Kernel error message: %s\n", errmsg); diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h index 931a71f68f93..6cc3ac91690f 100644 --- a/tools/lib/bpf/nlattr.h +++ b/tools/lib/bpf/nlattr.h @@ -1,18 +1,13 @@ -/* SPDX-License-Identifier: LGPL-2.1 */ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ /* * NETLINK Netlink attributes * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation version 2.1 - * of the License. - * * Copyright (c) 2003-2013 Thomas Graf */ -#ifndef __NLATTR_H -#define __NLATTR_H +#ifndef __LIBBPF_NLATTR_H +#define __LIBBPF_NLATTR_H #include #include @@ -23,19 +18,19 @@ * Standard attribute types to specify validation policy */ enum { - NLA_UNSPEC, /**< Unspecified type, binary data chunk */ - NLA_U8, /**< 8 bit integer */ - NLA_U16, /**< 16 bit integer */ - NLA_U32, /**< 32 bit integer */ - NLA_U64, /**< 64 bit integer */ - NLA_STRING, /**< NUL terminated character string */ - NLA_FLAG, /**< Flag */ - NLA_MSECS, /**< Micro seconds (64bit) */ - NLA_NESTED, /**< Nested attributes */ - __NLA_TYPE_MAX, + LIBBPF_NLA_UNSPEC, /**< Unspecified type, binary data chunk */ + LIBBPF_NLA_U8, /**< 8 bit integer */ + LIBBPF_NLA_U16, /**< 16 bit integer */ + LIBBPF_NLA_U32, /**< 32 bit integer */ + LIBBPF_NLA_U64, /**< 64 bit integer */ + LIBBPF_NLA_STRING, /**< NUL terminated character string */ + LIBBPF_NLA_FLAG, /**< Flag */ + LIBBPF_NLA_MSECS, /**< Micro seconds (64bit) */ + LIBBPF_NLA_NESTED, /**< Nested attributes */ + __LIBBPF_NLA_TYPE_MAX, }; -#define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1) +#define LIBBPF_NLA_TYPE_MAX (__LIBBPF_NLA_TYPE_MAX - 1) /** * @ingroup attr @@ -43,8 +38,8 @@ enum { * * See section @core_doc{core_attr_parse,Attribute Parsing} for more details. */ -struct nla_policy { - /** Type of attribute or NLA_UNSPEC */ +struct libbpf_nla_policy { + /** Type of attribute or LIBBPF_NLA_UNSPEC */ uint16_t type; /** Minimal length of payload required */ @@ -62,11 +57,50 @@ struct nla_policy { * @arg len length of attribute stream * @arg rem initialized to len, holds bytes currently remaining in stream */ -#define nla_for_each_attr(pos, head, len, rem) \ +#define libbpf_nla_for_each_attr(pos, head, len, rem) \ for (pos = head, rem = len; \ nla_ok(pos, rem); \ pos = nla_next(pos, &(rem))) -int nla_dump_errormsg(struct nlmsghdr *nlh); +/** + * libbpf_nla_data - head of payload + * @nla: netlink attribute + */ +static inline void *libbpf_nla_data(const struct nlattr *nla) +{ + return (char *) nla + NLA_HDRLEN; +} + +static inline uint8_t libbpf_nla_getattr_u8(const struct nlattr *nla) +{ + return *(uint8_t *)libbpf_nla_data(nla); +} + +static inline uint32_t libbpf_nla_getattr_u32(const struct nlattr *nla) +{ + return *(uint32_t *)libbpf_nla_data(nla); +} + +static inline const char *libbpf_nla_getattr_str(const struct nlattr *nla) +{ + return (const char *)libbpf_nla_data(nla); +} + +/** + * libbpf_nla_len - length of payload + * @nla: netlink attribute + */ +static inline int libbpf_nla_len(const struct nlattr *nla) +{ + return nla->nla_len - NLA_HDRLEN; +} + +int libbpf_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, + int len, struct libbpf_nla_policy *policy); +int libbpf_nla_parse_nested(struct nlattr *tb[], int maxtype, + struct nlattr *nla, + struct libbpf_nla_policy *policy); + +int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh); -#endif /* __NLATTR_H */ +#endif /* __LIBBPF_NLATTR_H */ diff --git a/tools/lib/bpf/str_error.c b/tools/lib/bpf/str_error.c index b8798114a357..00e48ac5b806 100644 --- a/tools/lib/bpf/str_error.c +++ b/tools/lib/bpf/str_error.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: LGPL-2.1 +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) #undef _GNU_SOURCE #include #include @@ -9,7 +9,7 @@ * libc, while checking strerror_r() return to avoid having to check this in * all places calling it. */ -char *str_error(int err, char *dst, int len) +char *libbpf_strerror_r(int err, char *dst, int len) { int ret = strerror_r(err, dst, len); if (ret) diff --git a/tools/lib/bpf/str_error.h b/tools/lib/bpf/str_error.h index 355b1db571d1..a139334d57b6 100644 --- a/tools/lib/bpf/str_error.h +++ b/tools/lib/bpf/str_error.h @@ -1,6 +1,6 @@ -// SPDX-License-Identifier: LGPL-2.1 -#ifndef BPF_STR_ERROR -#define BPF_STR_ERROR +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __LIBBPF_STR_ERROR_H +#define __LIBBPF_STR_ERROR_H -char *str_error(int err, char *dst, int len); -#endif // BPF_STR_ERROR +char *libbpf_strerror_r(int err, char *dst, int len); +#endif /* __LIBBPF_STR_ERROR_H */ diff --git a/tools/lib/subcmd/pager.c b/tools/lib/subcmd/pager.c index 9997a8805a82..e3d47b59b14d 100644 --- a/tools/lib/subcmd/pager.c +++ b/tools/lib/subcmd/pager.c @@ -23,6 +23,13 @@ void pager_init(const char *pager_env) subcmd_config.pager_env = pager_env; } +static const char *forced_pager; + +void force_pager(const char *pager) +{ + forced_pager = pager; +} + static void pager_preexec(void) { /* @@ -66,7 +73,9 @@ void setup_pager(void) const char *pager = getenv(subcmd_config.pager_env); struct winsize sz; - if (!isatty(1)) + if (forced_pager) + pager = forced_pager; + if (!isatty(1) && !forced_pager) return; if (ioctl(1, TIOCGWINSZ, &sz) == 0) pager_columns = sz.ws_col; diff --git a/tools/lib/subcmd/pager.h b/tools/lib/subcmd/pager.h index f1a53cf29880..a818964693ab 100644 --- a/tools/lib/subcmd/pager.h +++ b/tools/lib/subcmd/pager.h @@ -7,5 +7,6 @@ extern void pager_init(const char *pager_env); extern void setup_pager(void); extern int pager_in_use(void); extern int pager_get_columns(void); +extern void force_pager(const char *); #endif /* __SUBCMD_PAGER_H */ diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c index cb7154eccbdc..dbb9efbf718a 100644 --- a/tools/lib/subcmd/parse-options.c +++ b/tools/lib/subcmd/parse-options.c @@ -116,6 +116,7 @@ static int get_value(struct parse_opt_ctx_t *p, case OPTION_INTEGER: case OPTION_UINTEGER: case OPTION_LONG: + case OPTION_ULONG: case OPTION_U64: default: break; @@ -166,6 +167,7 @@ static int get_value(struct parse_opt_ctx_t *p, case OPTION_INTEGER: case OPTION_UINTEGER: case OPTION_LONG: + case OPTION_ULONG: case OPTION_U64: default: break; @@ -295,6 +297,22 @@ static int get_value(struct parse_opt_ctx_t *p, return opterror(opt, "expects a numerical value", flags); return 0; + case OPTION_ULONG: + if (unset) { + *(unsigned long *)opt->value = 0; + return 0; + } + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { + *(unsigned long *)opt->value = opt->defval; + return 0; + } + if (get_arg(p, opt, flags, &arg)) + return -1; + *(unsigned long *)opt->value = strtoul(arg, (char **)&s, 10); + if (*s) + return opterror(opt, "expects a numerical value", flags); + return 0; + case OPTION_U64: if (unset) { *(u64 *)opt->value = 0; @@ -703,6 +721,7 @@ static void print_option_help(const struct option *opts, int full) case OPTION_ARGUMENT: break; case OPTION_LONG: + case OPTION_ULONG: case OPTION_U64: case OPTION_INTEGER: case OPTION_UINTEGER: diff --git a/tools/lib/subcmd/parse-options.h b/tools/lib/subcmd/parse-options.h index 92fdbe1519f6..6ca2a8bfe716 100644 --- a/tools/lib/subcmd/parse-options.h +++ b/tools/lib/subcmd/parse-options.h @@ -25,6 +25,7 @@ enum parse_opt_type { OPTION_STRING, OPTION_INTEGER, OPTION_LONG, + OPTION_ULONG, OPTION_CALLBACK, OPTION_U64, OPTION_UINTEGER, @@ -133,6 +134,7 @@ struct option { #define OPT_INTEGER(s, l, v, h) { .type = OPTION_INTEGER, .short_name = (s), .long_name = (l), .value = check_vtype(v, int *), .help = (h) } #define OPT_UINTEGER(s, l, v, h) { .type = OPTION_UINTEGER, .short_name = (s), .long_name = (l), .value = check_vtype(v, unsigned int *), .help = (h) } #define OPT_LONG(s, l, v, h) { .type = OPTION_LONG, .short_name = (s), .long_name = (l), .value = check_vtype(v, long *), .help = (h) } +#define OPT_ULONG(s, l, v, h) { .type = OPTION_ULONG, .short_name = (s), .long_name = (l), .value = check_vtype(v, unsigned long *), .help = (h) } #define OPT_U64(s, l, v, h) { .type = OPTION_U64, .short_name = (s), .long_name = (l), .value = check_vtype(v, u64 *), .help = (h) } #define OPT_STRING(s, l, v, a, h) { .type = OPTION_STRING, .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), .argh = (a), .help = (h) } #define OPT_STRING_OPTARG(s, l, v, a, h, d) \ diff --git a/tools/lib/traceevent/Build b/tools/lib/traceevent/Build index c681d0575d16..ba54bfce0b0b 100644 --- a/tools/lib/traceevent/Build +++ b/tools/lib/traceevent/Build @@ -4,6 +4,8 @@ libtraceevent-y += trace-seq.o libtraceevent-y += parse-filter.o libtraceevent-y += parse-utils.o libtraceevent-y += kbuffer-parse.o +libtraceevent-y += tep_strerror.o +libtraceevent-y += event-parse-api.o plugin_jbd2-y += plugin_jbd2.o plugin_hrtimer-y += plugin_hrtimer.o diff --git a/tools/lib/traceevent/event-parse-api.c b/tools/lib/traceevent/event-parse-api.c new file mode 100644 index 000000000000..61f7149085ee --- /dev/null +++ b/tools/lib/traceevent/event-parse-api.c @@ -0,0 +1,275 @@ +// SPDX-License-Identifier: LGPL-2.1 +/* + * Copyright (C) 2009, 2010 Red Hat Inc, Steven Rostedt + * + */ + +#include "event-parse.h" +#include "event-parse-local.h" +#include "event-utils.h" + +/** + * tep_get_first_event - returns the first event in the events array + * @tep: a handle to the tep_handle + * + * This returns pointer to the first element of the events array + * If @tep is NULL, NULL is returned. + */ +struct tep_event_format *tep_get_first_event(struct tep_handle *tep) +{ + if (tep && tep->events) + return tep->events[0]; + + return NULL; +} + +/** + * tep_get_events_count - get the number of defined events + * @tep: a handle to the tep_handle + * + * This returns number of elements in event array + * If @tep is NULL, 0 is returned. + */ +int tep_get_events_count(struct tep_handle *tep) +{ + if(tep) + return tep->nr_events; + return 0; +} + +/** + * tep_set_flag - set event parser flag + * @tep: a handle to the tep_handle + * @flag: flag, or combination of flags to be set + * can be any combination from enum tep_flag + * + * This sets a flag or mbination of flags from enum tep_flag + */ +void tep_set_flag(struct tep_handle *tep, int flag) +{ + if(tep) + tep->flags |= flag; +} + +unsigned short __tep_data2host2(struct tep_handle *pevent, unsigned short data) +{ + unsigned short swap; + + if (!pevent || pevent->host_bigendian == pevent->file_bigendian) + return data; + + swap = ((data & 0xffULL) << 8) | + ((data & (0xffULL << 8)) >> 8); + + return swap; +} + +unsigned int __tep_data2host4(struct tep_handle *pevent, unsigned int data) +{ + unsigned int swap; + + if (!pevent || pevent->host_bigendian == pevent->file_bigendian) + return data; + + swap = ((data & 0xffULL) << 24) | + ((data & (0xffULL << 8)) << 8) | + ((data & (0xffULL << 16)) >> 8) | + ((data & (0xffULL << 24)) >> 24); + + return swap; +} + +unsigned long long +__tep_data2host8(struct tep_handle *pevent, unsigned long long data) +{ + unsigned long long swap; + + if (!pevent || pevent->host_bigendian == pevent->file_bigendian) + return data; + + swap = ((data & 0xffULL) << 56) | + ((data & (0xffULL << 8)) << 40) | + ((data & (0xffULL << 16)) << 24) | + ((data & (0xffULL << 24)) << 8) | + ((data & (0xffULL << 32)) >> 8) | + ((data & (0xffULL << 40)) >> 24) | + ((data & (0xffULL << 48)) >> 40) | + ((data & (0xffULL << 56)) >> 56); + + return swap; +} + +/** + * tep_get_header_page_size - get size of the header page + * @pevent: a handle to the tep_handle + * + * This returns size of the header page + * If @pevent is NULL, 0 is returned. + */ +int tep_get_header_page_size(struct tep_handle *pevent) +{ + if(pevent) + return pevent->header_page_size_size; + return 0; +} + +/** + * tep_get_cpus - get the number of CPUs + * @pevent: a handle to the tep_handle + * + * This returns the number of CPUs + * If @pevent is NULL, 0 is returned. + */ +int tep_get_cpus(struct tep_handle *pevent) +{ + if(pevent) + return pevent->cpus; + return 0; +} + +/** + * tep_set_cpus - set the number of CPUs + * @pevent: a handle to the tep_handle + * + * This sets the number of CPUs + */ +void tep_set_cpus(struct tep_handle *pevent, int cpus) +{ + if(pevent) + pevent->cpus = cpus; +} + +/** + * tep_get_long_size - get the size of a long integer on the current machine + * @pevent: a handle to the tep_handle + * + * This returns the size of a long integer on the current machine + * If @pevent is NULL, 0 is returned. + */ +int tep_get_long_size(struct tep_handle *pevent) +{ + if(pevent) + return pevent->long_size; + return 0; +} + +/** + * tep_set_long_size - set the size of a long integer on the current machine + * @pevent: a handle to the tep_handle + * @size: size, in bytes, of a long integer + * + * This sets the size of a long integer on the current machine + */ +void tep_set_long_size(struct tep_handle *pevent, int long_size) +{ + if(pevent) + pevent->long_size = long_size; +} + +/** + * tep_get_page_size - get the size of a memory page on the current machine + * @pevent: a handle to the tep_handle + * + * This returns the size of a memory page on the current machine + * If @pevent is NULL, 0 is returned. + */ +int tep_get_page_size(struct tep_handle *pevent) +{ + if(pevent) + return pevent->page_size; + return 0; +} + +/** + * tep_set_page_size - set the size of a memory page on the current machine + * @pevent: a handle to the tep_handle + * @_page_size: size of a memory page, in bytes + * + * This sets the size of a memory page on the current machine + */ +void tep_set_page_size(struct tep_handle *pevent, int _page_size) +{ + if(pevent) + pevent->page_size = _page_size; +} + +/** + * tep_is_file_bigendian - get if the file is in big endian order + * @pevent: a handle to the tep_handle + * + * This returns if the file is in big endian order + * If @pevent is NULL, 0 is returned. + */ +int tep_is_file_bigendian(struct tep_handle *pevent) +{ + if(pevent) + return pevent->file_bigendian; + return 0; +} + +/** + * tep_set_file_bigendian - set if the file is in big endian order + * @pevent: a handle to the tep_handle + * @endian: non zero, if the file is in big endian order + * + * This sets if the file is in big endian order + */ +void tep_set_file_bigendian(struct tep_handle *pevent, enum tep_endian endian) +{ + if(pevent) + pevent->file_bigendian = endian; +} + +/** + * tep_is_host_bigendian - get if the order of the current host is big endian + * @pevent: a handle to the tep_handle + * + * This gets if the order of the current host is big endian + * If @pevent is NULL, 0 is returned. + */ +int tep_is_host_bigendian(struct tep_handle *pevent) +{ + if(pevent) + return pevent->host_bigendian; + return 0; +} + +/** + * tep_set_host_bigendian - set the order of the local host + * @pevent: a handle to the tep_handle + * @endian: non zero, if the local host has big endian order + * + * This sets the order of the local host + */ +void tep_set_host_bigendian(struct tep_handle *pevent, enum tep_endian endian) +{ + if(pevent) + pevent->host_bigendian = endian; +} + +/** + * tep_is_latency_format - get if the latency output format is configured + * @pevent: a handle to the tep_handle + * + * This gets if the latency output format is configured + * If @pevent is NULL, 0 is returned. + */ +int tep_is_latency_format(struct tep_handle *pevent) +{ + if(pevent) + return pevent->latency_format; + return 0; +} + +/** + * tep_set_latency_format - set the latency output format + * @pevent: a handle to the tep_handle + * @lat: non zero for latency output format + * + * This sets the latency output format + */ +void tep_set_latency_format(struct tep_handle *pevent, int lat) +{ + if(pevent) + pevent->latency_format = lat; +} diff --git a/tools/lib/traceevent/event-parse-local.h b/tools/lib/traceevent/event-parse-local.h new file mode 100644 index 000000000000..b9bddde577f8 --- /dev/null +++ b/tools/lib/traceevent/event-parse-local.h @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: LGPL-2.1 +/* + * Copyright (C) 2009, 2010 Red Hat Inc, Steven Rostedt + * + */ + +#ifndef _PARSE_EVENTS_INT_H +#define _PARSE_EVENTS_INT_H + +struct cmdline; +struct cmdline_list; +struct func_map; +struct func_list; +struct event_handler; +struct func_resolver; + +struct tep_handle { + int ref_count; + + int header_page_ts_offset; + int header_page_ts_size; + int header_page_size_offset; + int header_page_size_size; + int header_page_data_offset; + int header_page_data_size; + int header_page_overwrite; + + enum tep_endian file_bigendian; + enum tep_endian host_bigendian; + + int latency_format; + + int old_format; + + int cpus; + int long_size; + int page_size; + + struct cmdline *cmdlines; + struct cmdline_list *cmdlist; + int cmdline_count; + + struct func_map *func_map; + struct func_resolver *func_resolver; + struct func_list *funclist; + unsigned int func_count; + + struct printk_map *printk_map; + struct printk_list *printklist; + unsigned int printk_count; + + + struct tep_event_format **events; + int nr_events; + struct tep_event_format **sort_events; + enum tep_event_sort_type last_type; + + int type_offset; + int type_size; + + int pid_offset; + int pid_size; + + int pc_offset; + int pc_size; + + int flags_offset; + int flags_size; + + int ld_offset; + int ld_size; + + int print_raw; + + int test_filters; + + int flags; + + struct tep_format_field *bprint_ip_field; + struct tep_format_field *bprint_fmt_field; + struct tep_format_field *bprint_buf_field; + + struct event_handler *handlers; + struct tep_function_handler *func_handlers; + + /* cache */ + struct tep_event_format *last_event; + + char *trace_clock; +}; + +#endif /* _PARSE_EVENTS_INT_H */ diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index ce1e20227c64..3692f29fee46 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -18,12 +18,14 @@ #include #include #include -#include #include #include #include "event-parse.h" + +#include "event-parse-local.h" #include "event-utils.h" +#include "trace-seq.h" static const char *input_buf; static unsigned long long input_buf_ptr; @@ -94,7 +96,7 @@ struct tep_function_handler { static unsigned long long process_defined_func(struct trace_seq *s, void *data, int size, - struct event_format *event, struct print_arg *arg); + struct tep_event_format *event, struct tep_print_arg *arg); static void free_func_handle(struct tep_function_handler *func); @@ -117,9 +119,9 @@ void breakpoint(void) x++; } -struct print_arg *alloc_arg(void) +struct tep_print_arg *alloc_arg(void) { - return calloc(1, sizeof(struct print_arg)); + return calloc(1, sizeof(struct tep_print_arg)); } struct cmdline { @@ -737,16 +739,16 @@ void tep_print_printk(struct tep_handle *pevent) } } -static struct event_format *alloc_event(void) +static struct tep_event_format *alloc_event(void) { - return calloc(1, sizeof(struct event_format)); + return calloc(1, sizeof(struct tep_event_format)); } -static int add_event(struct tep_handle *pevent, struct event_format *event) +static int add_event(struct tep_handle *pevent, struct tep_event_format *event) { int i; - struct event_format **events = realloc(pevent->events, sizeof(event) * - (pevent->nr_events + 1)); + struct tep_event_format **events = realloc(pevent->events, sizeof(event) * + (pevent->nr_events + 1)); if (!events) return -1; @@ -769,20 +771,20 @@ static int add_event(struct tep_handle *pevent, struct event_format *event) return 0; } -static int event_item_type(enum event_type type) +static int event_item_type(enum tep_event_type type) { switch (type) { - case EVENT_ITEM ... EVENT_SQUOTE: + case TEP_EVENT_ITEM ... TEP_EVENT_SQUOTE: return 1; - case EVENT_ERROR ... EVENT_DELIM: + case TEP_EVENT_ERROR ... TEP_EVENT_DELIM: default: return 0; } } -static void free_flag_sym(struct print_flag_sym *fsym) +static void free_flag_sym(struct tep_print_flag_sym *fsym) { - struct print_flag_sym *next; + struct tep_print_flag_sym *next; while (fsym) { next = fsym->next; @@ -793,60 +795,60 @@ static void free_flag_sym(struct print_flag_sym *fsym) } } -static void free_arg(struct print_arg *arg) +static void free_arg(struct tep_print_arg *arg) { - struct print_arg *farg; + struct tep_print_arg *farg; if (!arg) return; switch (arg->type) { - case PRINT_ATOM: + case TEP_PRINT_ATOM: free(arg->atom.atom); break; - case PRINT_FIELD: + case TEP_PRINT_FIELD: free(arg->field.name); break; - case PRINT_FLAGS: + case TEP_PRINT_FLAGS: free_arg(arg->flags.field); free(arg->flags.delim); free_flag_sym(arg->flags.flags); break; - case PRINT_SYMBOL: + case TEP_PRINT_SYMBOL: free_arg(arg->symbol.field); free_flag_sym(arg->symbol.symbols); break; - case PRINT_HEX: - case PRINT_HEX_STR: + case TEP_PRINT_HEX: + case TEP_PRINT_HEX_STR: free_arg(arg->hex.field); free_arg(arg->hex.size); break; - case PRINT_INT_ARRAY: + case TEP_PRINT_INT_ARRAY: free_arg(arg->int_array.field); free_arg(arg->int_array.count); free_arg(arg->int_array.el_size); break; - case PRINT_TYPE: + case TEP_PRINT_TYPE: free(arg->typecast.type); free_arg(arg->typecast.item); break; - case PRINT_STRING: - case PRINT_BSTRING: + case TEP_PRINT_STRING: + case TEP_PRINT_BSTRING: free(arg->string.string); break; - case PRINT_BITMASK: + case TEP_PRINT_BITMASK: free(arg->bitmask.bitmask); break; - case PRINT_DYNAMIC_ARRAY: - case PRINT_DYNAMIC_ARRAY_LEN: + case TEP_PRINT_DYNAMIC_ARRAY: + case TEP_PRINT_DYNAMIC_ARRAY_LEN: free(arg->dynarray.index); break; - case PRINT_OP: + case TEP_PRINT_OP: free(arg->op.op); free_arg(arg->op.left); free_arg(arg->op.right); break; - case PRINT_FUNC: + case TEP_PRINT_FUNC: while (arg->func.args) { farg = arg->func.args; arg->func.args = farg->next; @@ -854,7 +856,7 @@ static void free_arg(struct print_arg *arg) } break; - case PRINT_NULL: + case TEP_PRINT_NULL: default: break; } @@ -862,24 +864,24 @@ static void free_arg(struct print_arg *arg) free(arg); } -static enum event_type get_type(int ch) +static enum tep_event_type get_type(int ch) { if (ch == '\n') - return EVENT_NEWLINE; + return TEP_EVENT_NEWLINE; if (isspace(ch)) - return EVENT_SPACE; + return TEP_EVENT_SPACE; if (isalnum(ch) || ch == '_') - return EVENT_ITEM; + return TEP_EVENT_ITEM; if (ch == '\'') - return EVENT_SQUOTE; + return TEP_EVENT_SQUOTE; if (ch == '"') - return EVENT_DQUOTE; + return TEP_EVENT_DQUOTE; if (!isprint(ch)) - return EVENT_NONE; + return TEP_EVENT_NONE; if (ch == '(' || ch == ')' || ch == ',') - return EVENT_DELIM; + return TEP_EVENT_DELIM; - return EVENT_OP; + return TEP_EVENT_OP; } static int __read_char(void) @@ -927,38 +929,38 @@ static int extend_token(char **tok, char *buf, int size) return 0; } -static enum event_type force_token(const char *str, char **tok); +static enum tep_event_type force_token(const char *str, char **tok); -static enum event_type __read_token(char **tok) +static enum tep_event_type __read_token(char **tok) { char buf[BUFSIZ]; int ch, last_ch, quote_ch, next_ch; int i = 0; int tok_size = 0; - enum event_type type; + enum tep_event_type type; *tok = NULL; ch = __read_char(); if (ch < 0) - return EVENT_NONE; + return TEP_EVENT_NONE; type = get_type(ch); - if (type == EVENT_NONE) + if (type == TEP_EVENT_NONE) return type; buf[i++] = ch; switch (type) { - case EVENT_NEWLINE: - case EVENT_DELIM: + case TEP_EVENT_NEWLINE: + case TEP_EVENT_DELIM: if (asprintf(tok, "%c", ch) < 0) - return EVENT_ERROR; + return TEP_EVENT_ERROR; return type; - case EVENT_OP: + case TEP_EVENT_OP: switch (ch) { case '-': next_ch = __peek_char(); @@ -1001,8 +1003,8 @@ static enum event_type __read_token(char **tok) buf[i++] = __read_char(); goto out; - case EVENT_DQUOTE: - case EVENT_SQUOTE: + case TEP_EVENT_DQUOTE: + case TEP_EVENT_SQUOTE: /* don't keep quotes */ i--; quote_ch = ch; @@ -1014,7 +1016,7 @@ static enum event_type __read_token(char **tok) tok_size += BUFSIZ; if (extend_token(tok, buf, tok_size) < 0) - return EVENT_NONE; + return TEP_EVENT_NONE; i = 0; } last_ch = ch; @@ -1031,7 +1033,7 @@ static enum event_type __read_token(char **tok) * For strings (double quotes) check the next token. * If it is another string, concatinate the two. */ - if (type == EVENT_DQUOTE) { + if (type == TEP_EVENT_DQUOTE) { unsigned long long save_input_buf_ptr = input_buf_ptr; do { @@ -1044,8 +1046,8 @@ static enum event_type __read_token(char **tok) goto out; - case EVENT_ERROR ... EVENT_SPACE: - case EVENT_ITEM: + case TEP_EVENT_ERROR ... TEP_EVENT_SPACE: + case TEP_EVENT_ITEM: default: break; } @@ -1056,7 +1058,7 @@ static enum event_type __read_token(char **tok) tok_size += BUFSIZ; if (extend_token(tok, buf, tok_size) < 0) - return EVENT_NONE; + return TEP_EVENT_NONE; i = 0; } ch = __read_char(); @@ -1066,9 +1068,9 @@ static enum event_type __read_token(char **tok) out: buf[i] = 0; if (extend_token(tok, buf, tok_size + i + 1) < 0) - return EVENT_NONE; + return TEP_EVENT_NONE; - if (type == EVENT_ITEM) { + if (type == TEP_EVENT_ITEM) { /* * Older versions of the kernel has a bug that * creates invalid symbols and will break the mac80211 @@ -1095,12 +1097,12 @@ static enum event_type __read_token(char **tok) return type; } -static enum event_type force_token(const char *str, char **tok) +static enum tep_event_type force_token(const char *str, char **tok) { const char *save_input_buf; unsigned long long save_input_buf_ptr; unsigned long long save_input_buf_siz; - enum event_type type; + enum tep_event_type type; /* save off the current input pointers */ save_input_buf = input_buf; @@ -1125,13 +1127,13 @@ static void free_token(char *tok) free(tok); } -static enum event_type read_token(char **tok) +static enum tep_event_type read_token(char **tok) { - enum event_type type; + enum tep_event_type type; for (;;) { type = __read_token(tok); - if (type != EVENT_SPACE) + if (type != TEP_EVENT_SPACE) return type; free_token(*tok); @@ -1139,7 +1141,7 @@ static enum event_type read_token(char **tok) /* not reached */ *tok = NULL; - return EVENT_NONE; + return TEP_EVENT_NONE; } /** @@ -1151,7 +1153,7 @@ static enum event_type read_token(char **tok) * * Returns the token type. */ -enum event_type tep_read_token(char **tok) +enum tep_event_type tep_read_token(char **tok) { return read_token(tok); } @@ -1166,13 +1168,13 @@ void tep_free_token(char *token) } /* no newline */ -static enum event_type read_token_item(char **tok) +static enum tep_event_type read_token_item(char **tok) { - enum event_type type; + enum tep_event_type type; for (;;) { type = __read_token(tok); - if (type != EVENT_SPACE && type != EVENT_NEWLINE) + if (type != TEP_EVENT_SPACE && type != TEP_EVENT_NEWLINE) return type; free_token(*tok); *tok = NULL; @@ -1180,10 +1182,10 @@ static enum event_type read_token_item(char **tok) /* not reached */ *tok = NULL; - return EVENT_NONE; + return TEP_EVENT_NONE; } -static int test_type(enum event_type type, enum event_type expect) +static int test_type(enum tep_event_type type, enum tep_event_type expect) { if (type != expect) { do_warning("Error: expected type %d but read %d", @@ -1193,8 +1195,8 @@ static int test_type(enum event_type type, enum event_type expect) return 0; } -static int test_type_token(enum event_type type, const char *token, - enum event_type expect, const char *expect_tok) +static int test_type_token(enum tep_event_type type, const char *token, + enum tep_event_type expect, const char *expect_tok) { if (type != expect) { do_warning("Error: expected type %d but read %d", @@ -1210,9 +1212,9 @@ static int test_type_token(enum event_type type, const char *token, return 0; } -static int __read_expect_type(enum event_type expect, char **tok, int newline_ok) +static int __read_expect_type(enum tep_event_type expect, char **tok, int newline_ok) { - enum event_type type; + enum tep_event_type type; if (newline_ok) type = read_token(tok); @@ -1221,15 +1223,15 @@ static int __read_expect_type(enum event_type expect, char **tok, int newline_ok return test_type(type, expect); } -static int read_expect_type(enum event_type expect, char **tok) +static int read_expect_type(enum tep_event_type expect, char **tok) { return __read_expect_type(expect, tok, 1); } -static int __read_expected(enum event_type expect, const char *str, +static int __read_expected(enum tep_event_type expect, const char *str, int newline_ok) { - enum event_type type; + enum tep_event_type type; char *token; int ret; @@ -1245,12 +1247,12 @@ static int __read_expected(enum event_type expect, const char *str, return ret; } -static int read_expected(enum event_type expect, const char *str) +static int read_expected(enum tep_event_type expect, const char *str) { return __read_expected(expect, str, 1); } -static int read_expected_item(enum event_type expect, const char *str) +static int read_expected_item(enum tep_event_type expect, const char *str) { return __read_expected(expect, str, 0); } @@ -1259,13 +1261,13 @@ static char *event_read_name(void) { char *token; - if (read_expected(EVENT_ITEM, "name") < 0) + if (read_expected(TEP_EVENT_ITEM, "name") < 0) return NULL; - if (read_expected(EVENT_OP, ":") < 0) + if (read_expected(TEP_EVENT_OP, ":") < 0) return NULL; - if (read_expect_type(EVENT_ITEM, &token) < 0) + if (read_expect_type(TEP_EVENT_ITEM, &token) < 0) goto fail; return token; @@ -1280,13 +1282,13 @@ static int event_read_id(void) char *token; int id; - if (read_expected_item(EVENT_ITEM, "ID") < 0) + if (read_expected_item(TEP_EVENT_ITEM, "ID") < 0) return -1; - if (read_expected(EVENT_OP, ":") < 0) + if (read_expected(TEP_EVENT_OP, ":") < 0) return -1; - if (read_expect_type(EVENT_ITEM, &token) < 0) + if (read_expect_type(TEP_EVENT_ITEM, &token) < 0) goto fail; id = strtoul(token, NULL, 0); @@ -1298,9 +1300,9 @@ static int event_read_id(void) return -1; } -static int field_is_string(struct format_field *field) +static int field_is_string(struct tep_format_field *field) { - if ((field->flags & FIELD_IS_ARRAY) && + if ((field->flags & TEP_FIELD_IS_ARRAY) && (strstr(field->type, "char") || strstr(field->type, "u8") || strstr(field->type, "s8"))) return 1; @@ -1308,7 +1310,7 @@ static int field_is_string(struct format_field *field) return 0; } -static int field_is_dynamic(struct format_field *field) +static int field_is_dynamic(struct tep_format_field *field) { if (strncmp(field->type, "__data_loc", 10) == 0) return 1; @@ -1316,7 +1318,7 @@ static int field_is_dynamic(struct format_field *field) return 0; } -static int field_is_long(struct format_field *field) +static int field_is_long(struct tep_format_field *field) { /* includes long long */ if (strstr(field->type, "long")) @@ -1327,7 +1329,7 @@ static int field_is_long(struct format_field *field) static unsigned int type_size(const char *name) { - /* This covers all FIELD_IS_STRING types. */ + /* This covers all TEP_FIELD_IS_STRING types. */ static struct { const char *type; unsigned int size; @@ -1353,10 +1355,10 @@ static unsigned int type_size(const char *name) return 0; } -static int event_read_fields(struct event_format *event, struct format_field **fields) +static int event_read_fields(struct tep_event_format *event, struct tep_format_field **fields) { - struct format_field *field = NULL; - enum event_type type; + struct tep_format_field *field = NULL; + enum tep_event_type type; char *token; char *last_token; int count = 0; @@ -1365,14 +1367,14 @@ static int event_read_fields(struct event_format *event, struct format_field **f unsigned int size_dynamic = 0; type = read_token(&token); - if (type == EVENT_NEWLINE) { + if (type == TEP_EVENT_NEWLINE) { free_token(token); return count; } count++; - if (test_type_token(type, token, EVENT_ITEM, "field")) + if (test_type_token(type, token, TEP_EVENT_ITEM, "field")) goto fail; free_token(token); @@ -1381,17 +1383,17 @@ static int event_read_fields(struct event_format *event, struct format_field **f * The ftrace fields may still use the "special" name. * Just ignore it. */ - if (event->flags & EVENT_FL_ISFTRACE && - type == EVENT_ITEM && strcmp(token, "special") == 0) { + if (event->flags & TEP_EVENT_FL_ISFTRACE && + type == TEP_EVENT_ITEM && strcmp(token, "special") == 0) { free_token(token); type = read_token(&token); } - if (test_type_token(type, token, EVENT_OP, ":") < 0) + if (test_type_token(type, token, TEP_EVENT_OP, ":") < 0) goto fail; free_token(token); - if (read_expect_type(EVENT_ITEM, &token) < 0) + if (read_expect_type(TEP_EVENT_ITEM, &token) < 0) goto fail; last_token = token; @@ -1405,17 +1407,17 @@ static int event_read_fields(struct event_format *event, struct format_field **f /* read the rest of the type */ for (;;) { type = read_token(&token); - if (type == EVENT_ITEM || - (type == EVENT_OP && strcmp(token, "*") == 0) || + if (type == TEP_EVENT_ITEM || + (type == TEP_EVENT_OP && strcmp(token, "*") == 0) || /* * Some of the ftrace fields are broken and have * an illegal "." in them. */ - (event->flags & EVENT_FL_ISFTRACE && - type == EVENT_OP && strcmp(token, ".") == 0)) { + (event->flags & TEP_EVENT_FL_ISFTRACE && + type == TEP_EVENT_OP && strcmp(token, ".") == 0)) { if (strcmp(token, "*") == 0) - field->flags |= FIELD_IS_POINTER; + field->flags |= TEP_FIELD_IS_POINTER; if (field->type) { char *new_type; @@ -1445,27 +1447,27 @@ static int event_read_fields(struct event_format *event, struct format_field **f } field->name = field->alias = last_token; - if (test_type(type, EVENT_OP)) + if (test_type(type, TEP_EVENT_OP)) goto fail; if (strcmp(token, "[") == 0) { - enum event_type last_type = type; + enum tep_event_type last_type = type; char *brackets = token; char *new_brackets; int len; - field->flags |= FIELD_IS_ARRAY; + field->flags |= TEP_FIELD_IS_ARRAY; type = read_token(&token); - if (type == EVENT_ITEM) + if (type == TEP_EVENT_ITEM) field->arraylen = strtoul(token, NULL, 0); else field->arraylen = 0; while (strcmp(token, "]") != 0) { - if (last_type == EVENT_ITEM && - type == EVENT_ITEM) + if (last_type == TEP_EVENT_ITEM && + type == TEP_EVENT_ITEM) len = 2; else len = 1; @@ -1486,7 +1488,7 @@ static int event_read_fields(struct event_format *event, struct format_field **f field->arraylen = strtoul(token, NULL, 0); free_token(token); type = read_token(&token); - if (type == EVENT_NONE) { + if (type == TEP_EVENT_NONE) { do_warning_event(event, "failed to find token"); goto fail; } @@ -1509,7 +1511,7 @@ static int event_read_fields(struct event_format *event, struct format_field **f * If the next token is not an OP, then it is of * the format: type [] item; */ - if (type == EVENT_ITEM) { + if (type == TEP_EVENT_ITEM) { char *new_type; new_type = realloc(field->type, strlen(field->type) + @@ -1543,79 +1545,79 @@ static int event_read_fields(struct event_format *event, struct format_field **f } if (field_is_string(field)) - field->flags |= FIELD_IS_STRING; + field->flags |= TEP_FIELD_IS_STRING; if (field_is_dynamic(field)) - field->flags |= FIELD_IS_DYNAMIC; + field->flags |= TEP_FIELD_IS_DYNAMIC; if (field_is_long(field)) - field->flags |= FIELD_IS_LONG; + field->flags |= TEP_FIELD_IS_LONG; - if (test_type_token(type, token, EVENT_OP, ";")) + if (test_type_token(type, token, TEP_EVENT_OP, ";")) goto fail; free_token(token); - if (read_expected(EVENT_ITEM, "offset") < 0) + if (read_expected(TEP_EVENT_ITEM, "offset") < 0) goto fail_expect; - if (read_expected(EVENT_OP, ":") < 0) + if (read_expected(TEP_EVENT_OP, ":") < 0) goto fail_expect; - if (read_expect_type(EVENT_ITEM, &token)) + if (read_expect_type(TEP_EVENT_ITEM, &token)) goto fail; field->offset = strtoul(token, NULL, 0); free_token(token); - if (read_expected(EVENT_OP, ";") < 0) + if (read_expected(TEP_EVENT_OP, ";") < 0) goto fail_expect; - if (read_expected(EVENT_ITEM, "size") < 0) + if (read_expected(TEP_EVENT_ITEM, "size") < 0) goto fail_expect; - if (read_expected(EVENT_OP, ":") < 0) + if (read_expected(TEP_EVENT_OP, ":") < 0) goto fail_expect; - if (read_expect_type(EVENT_ITEM, &token)) + if (read_expect_type(TEP_EVENT_ITEM, &token)) goto fail; field->size = strtoul(token, NULL, 0); free_token(token); - if (read_expected(EVENT_OP, ";") < 0) + if (read_expected(TEP_EVENT_OP, ";") < 0) goto fail_expect; type = read_token(&token); - if (type != EVENT_NEWLINE) { + if (type != TEP_EVENT_NEWLINE) { /* newer versions of the kernel have a "signed" type */ - if (test_type_token(type, token, EVENT_ITEM, "signed")) + if (test_type_token(type, token, TEP_EVENT_ITEM, "signed")) goto fail; free_token(token); - if (read_expected(EVENT_OP, ":") < 0) + if (read_expected(TEP_EVENT_OP, ":") < 0) goto fail_expect; - if (read_expect_type(EVENT_ITEM, &token)) + if (read_expect_type(TEP_EVENT_ITEM, &token)) goto fail; if (strtoul(token, NULL, 0)) - field->flags |= FIELD_IS_SIGNED; + field->flags |= TEP_FIELD_IS_SIGNED; free_token(token); - if (read_expected(EVENT_OP, ";") < 0) + if (read_expected(TEP_EVENT_OP, ";") < 0) goto fail_expect; - if (read_expect_type(EVENT_NEWLINE, &token)) + if (read_expect_type(TEP_EVENT_NEWLINE, &token)) goto fail; } free_token(token); - if (field->flags & FIELD_IS_ARRAY) { + if (field->flags & TEP_FIELD_IS_ARRAY) { if (field->arraylen) field->elementsize = field->size / field->arraylen; - else if (field->flags & FIELD_IS_DYNAMIC) + else if (field->flags & TEP_FIELD_IS_DYNAMIC) field->elementsize = size_dynamic; - else if (field->flags & FIELD_IS_STRING) + else if (field->flags & TEP_FIELD_IS_STRING) field->elementsize = 1; - else if (field->flags & FIELD_IS_LONG) + else if (field->flags & TEP_FIELD_IS_LONG) field->elementsize = event->pevent ? event->pevent->long_size : sizeof(long); @@ -1640,18 +1642,18 @@ fail_expect: return -1; } -static int event_read_format(struct event_format *event) +static int event_read_format(struct tep_event_format *event) { char *token; int ret; - if (read_expected_item(EVENT_ITEM, "format") < 0) + if (read_expected_item(TEP_EVENT_ITEM, "format") < 0) return -1; - if (read_expected(EVENT_OP, ":") < 0) + if (read_expected(TEP_EVENT_OP, ":") < 0) return -1; - if (read_expect_type(EVENT_NEWLINE, &token)) + if (read_expect_type(TEP_EVENT_NEWLINE, &token)) goto fail; free_token(token); @@ -1672,14 +1674,14 @@ static int event_read_format(struct event_format *event) return -1; } -static enum event_type -process_arg_token(struct event_format *event, struct print_arg *arg, - char **tok, enum event_type type); +static enum tep_event_type +process_arg_token(struct tep_event_format *event, struct tep_print_arg *arg, + char **tok, enum tep_event_type type); -static enum event_type -process_arg(struct event_format *event, struct print_arg *arg, char **tok) +static enum tep_event_type +process_arg(struct tep_event_format *event, struct tep_print_arg *arg, char **tok) { - enum event_type type; + enum tep_event_type type; char *token; type = read_token(&token); @@ -1688,32 +1690,32 @@ process_arg(struct event_format *event, struct print_arg *arg, char **tok) return process_arg_token(event, arg, tok, type); } -static enum event_type -process_op(struct event_format *event, struct print_arg *arg, char **tok); +static enum tep_event_type +process_op(struct tep_event_format *event, struct tep_print_arg *arg, char **tok); /* * For __print_symbolic() and __print_flags, we need to completely * evaluate the first argument, which defines what to print next. */ -static enum event_type -process_field_arg(struct event_format *event, struct print_arg *arg, char **tok) +static enum tep_event_type +process_field_arg(struct tep_event_format *event, struct tep_print_arg *arg, char **tok) { - enum event_type type; + enum tep_event_type type; type = process_arg(event, arg, tok); - while (type == EVENT_OP) { + while (type == TEP_EVENT_OP) { type = process_op(event, arg, tok); } return type; } -static enum event_type -process_cond(struct event_format *event, struct print_arg *top, char **tok) +static enum tep_event_type +process_cond(struct tep_event_format *event, struct tep_print_arg *top, char **tok) { - struct print_arg *arg, *left, *right; - enum event_type type; + struct tep_print_arg *arg, *left, *right; + enum tep_event_type type; char *token = NULL; arg = alloc_arg(); @@ -1728,7 +1730,7 @@ process_cond(struct event_format *event, struct print_arg *top, char **tok) goto out_free; } - arg->type = PRINT_OP; + arg->type = TEP_PRINT_OP; arg->op.left = left; arg->op.right = right; @@ -1736,16 +1738,16 @@ process_cond(struct event_format *event, struct print_arg *top, char **tok) type = process_arg(event, left, &token); again: - if (type == EVENT_ERROR) + if (type == TEP_EVENT_ERROR) goto out_free; /* Handle other operations in the arguments */ - if (type == EVENT_OP && strcmp(token, ":") != 0) { + if (type == TEP_EVENT_OP && strcmp(token, ":") != 0) { type = process_op(event, left, &token); goto again; } - if (test_type_token(type, token, EVENT_OP, ":")) + if (test_type_token(type, token, TEP_EVENT_OP, ":")) goto out_free; arg->op.op = token; @@ -1762,14 +1764,14 @@ out_free: top->op.right = NULL; free_token(token); free_arg(arg); - return EVENT_ERROR; + return TEP_EVENT_ERROR; } -static enum event_type -process_array(struct event_format *event, struct print_arg *top, char **tok) +static enum tep_event_type +process_array(struct tep_event_format *event, struct tep_print_arg *top, char **tok) { - struct print_arg *arg; - enum event_type type; + struct tep_print_arg *arg; + enum tep_event_type type; char *token = NULL; arg = alloc_arg(); @@ -1777,12 +1779,12 @@ process_array(struct event_format *event, struct print_arg *top, char **tok) do_warning_event(event, "%s: not enough memory!", __func__); /* '*tok' is set to top->op.op. No need to free. */ *tok = NULL; - return EVENT_ERROR; + return TEP_EVENT_ERROR; } *tok = NULL; type = process_arg(event, arg, &token); - if (test_type_token(type, token, EVENT_OP, "]")) + if (test_type_token(type, token, TEP_EVENT_OP, "]")) goto out_free; top->op.right = arg; @@ -1796,7 +1798,7 @@ process_array(struct event_format *event, struct print_arg *top, char **tok) out_free: free_token(token); free_arg(arg); - return EVENT_ERROR; + return TEP_EVENT_ERROR; } static int get_op_prio(char *op) @@ -1854,11 +1856,11 @@ static int get_op_prio(char *op) } } -static int set_op_prio(struct print_arg *arg) +static int set_op_prio(struct tep_print_arg *arg) { /* single ops are the greatest */ - if (!arg->op.left || arg->op.left->type == PRINT_NULL) + if (!arg->op.left || arg->op.left->type == TEP_PRINT_NULL) arg->op.prio = 0; else arg->op.prio = get_op_prio(arg->op.op); @@ -1867,17 +1869,17 @@ static int set_op_prio(struct print_arg *arg) } /* Note, *tok does not get freed, but will most likely be saved */ -static enum event_type -process_op(struct event_format *event, struct print_arg *arg, char **tok) +static enum tep_event_type +process_op(struct tep_event_format *event, struct tep_print_arg *arg, char **tok) { - struct print_arg *left, *right = NULL; - enum event_type type; + struct tep_print_arg *left, *right = NULL; + enum tep_event_type type; char *token; /* the op is passed in via tok */ token = *tok; - if (arg->type == PRINT_OP && !arg->op.left) { + if (arg->type == TEP_PRINT_OP && !arg->op.left) { /* handle single op */ if (token[1]) { do_warning_event(event, "bad op token %s", token); @@ -1900,7 +1902,7 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok) if (!left) goto out_warn_free; - left->type = PRINT_NULL; + left->type = TEP_PRINT_NULL; arg->op.left = left; right = alloc_arg(); @@ -1922,7 +1924,7 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok) /* copy the top arg to the left */ *left = *arg; - arg->type = PRINT_OP; + arg->type = TEP_PRINT_OP; arg->op.op = token; arg->op.left = left; arg->op.prio = 0; @@ -1956,13 +1958,13 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok) /* copy the top arg to the left */ *left = *arg; - arg->type = PRINT_OP; + arg->type = TEP_PRINT_OP; arg->op.op = token; arg->op.left = left; arg->op.right = NULL; if (set_op_prio(arg) == -1) { - event->flags |= EVENT_FL_FAILED; + event->flags |= TEP_EVENT_FL_FAILED; /* arg->op.op (= token) will be freed at out_free */ arg->op.op = NULL; goto out_free; @@ -1973,10 +1975,10 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok) /* could just be a type pointer */ if ((strcmp(arg->op.op, "*") == 0) && - type == EVENT_DELIM && (strcmp(token, ")") == 0)) { + type == TEP_EVENT_DELIM && (strcmp(token, ")") == 0)) { char *new_atom; - if (left->type != PRINT_ATOM) { + if (left->type != TEP_PRINT_ATOM) { do_warning_event(event, "bad pointer type"); goto out_free; } @@ -1999,16 +2001,16 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok) goto out_warn_free; type = process_arg_token(event, right, tok, type); - if (type == EVENT_ERROR) { + if (type == TEP_EVENT_ERROR) { free_arg(right); /* token was freed in process_arg_token() via *tok */ token = NULL; goto out_free; } - if (right->type == PRINT_OP && + if (right->type == TEP_PRINT_OP && get_op_prio(arg->op.op) < get_op_prio(right->op.op)) { - struct print_arg tmp; + struct tep_print_arg tmp; /* rotate ops according to the priority */ arg->op.right = right->op.left; @@ -2030,7 +2032,7 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok) *left = *arg; - arg->type = PRINT_OP; + arg->type = TEP_PRINT_OP; arg->op.op = token; arg->op.left = left; @@ -2041,12 +2043,12 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok) } else { do_warning_event(event, "unknown op '%s'", token); - event->flags |= EVENT_FL_FAILED; + event->flags |= TEP_EVENT_FL_FAILED; /* the arg is now the left side */ goto out_free; } - if (type == EVENT_OP && strcmp(*tok, ":") != 0) { + if (type == TEP_EVENT_OP && strcmp(*tok, ":") != 0) { int prio; /* higher prios need to be closer to the root */ @@ -2065,34 +2067,34 @@ out_warn_free: out_free: free_token(token); *tok = NULL; - return EVENT_ERROR; + return TEP_EVENT_ERROR; } -static enum event_type -process_entry(struct event_format *event __maybe_unused, struct print_arg *arg, +static enum tep_event_type +process_entry(struct tep_event_format *event __maybe_unused, struct tep_print_arg *arg, char **tok) { - enum event_type type; + enum tep_event_type type; char *field; char *token; - if (read_expected(EVENT_OP, "->") < 0) + if (read_expected(TEP_EVENT_OP, "->") < 0) goto out_err; - if (read_expect_type(EVENT_ITEM, &token) < 0) + if (read_expect_type(TEP_EVENT_ITEM, &token) < 0) goto out_free; field = token; - arg->type = PRINT_FIELD; + arg->type = TEP_PRINT_FIELD; arg->field.name = field; if (is_flag_field) { arg->field.field = tep_find_any_field(event, arg->field.name); - arg->field.field->flags |= FIELD_IS_FLAG; + arg->field.field->flags |= TEP_FIELD_IS_FLAG; is_flag_field = 0; } else if (is_symbolic_field) { arg->field.field = tep_find_any_field(event, arg->field.name); - arg->field.field->flags |= FIELD_IS_SYMBOLIC; + arg->field.field->flags |= TEP_FIELD_IS_SYMBOLIC; is_symbolic_field = 0; } @@ -2105,14 +2107,14 @@ process_entry(struct event_format *event __maybe_unused, struct print_arg *arg, free_token(token); out_err: *tok = NULL; - return EVENT_ERROR; + return TEP_EVENT_ERROR; } -static int alloc_and_process_delim(struct event_format *event, char *next_token, - struct print_arg **print_arg) +static int alloc_and_process_delim(struct tep_event_format *event, char *next_token, + struct tep_print_arg **print_arg) { - struct print_arg *field; - enum event_type type; + struct tep_print_arg *field; + enum tep_event_type type; char *token; int ret = 0; @@ -2125,7 +2127,7 @@ static int alloc_and_process_delim(struct event_format *event, char *next_token, type = process_arg(event, field, &token); - if (test_type_token(type, token, EVENT_DELIM, next_token)) { + if (test_type_token(type, token, TEP_EVENT_DELIM, next_token)) { errno = EINVAL; ret = -1; free_arg(field); @@ -2140,7 +2142,7 @@ out_free_token: return ret; } -static char *arg_eval (struct print_arg *arg); +static char *arg_eval (struct tep_print_arg *arg); static unsigned long long eval_type_str(unsigned long long val, const char *type, int pointer) @@ -2237,9 +2239,9 @@ eval_type_str(unsigned long long val, const char *type, int pointer) * Try to figure out the type. */ static unsigned long long -eval_type(unsigned long long val, struct print_arg *arg, int pointer) +eval_type(unsigned long long val, struct tep_print_arg *arg, int pointer) { - if (arg->type != PRINT_TYPE) { + if (arg->type != TEP_PRINT_TYPE) { do_warning("expected type argument"); return 0; } @@ -2247,22 +2249,22 @@ eval_type(unsigned long long val, struct print_arg *arg, int pointer) return eval_type_str(val, arg->typecast.type, pointer); } -static int arg_num_eval(struct print_arg *arg, long long *val) +static int arg_num_eval(struct tep_print_arg *arg, long long *val) { long long left, right; int ret = 1; switch (arg->type) { - case PRINT_ATOM: + case TEP_PRINT_ATOM: *val = strtoll(arg->atom.atom, NULL, 0); break; - case PRINT_TYPE: + case TEP_PRINT_TYPE: ret = arg_num_eval(arg->typecast.item, val); if (!ret) break; *val = eval_type(*val, arg, 0); break; - case PRINT_OP: + case TEP_PRINT_OP: switch (arg->op.op[0]) { case '|': ret = arg_num_eval(arg->op.left, &left); @@ -2365,7 +2367,7 @@ static int arg_num_eval(struct print_arg *arg, long long *val) break; case '-': /* check for negative */ - if (arg->op.left->type == PRINT_NULL) + if (arg->op.left->type == TEP_PRINT_NULL) left = 0; else ret = arg_num_eval(arg->op.left, &left); @@ -2377,7 +2379,7 @@ static int arg_num_eval(struct print_arg *arg, long long *val) *val = left - right; break; case '+': - if (arg->op.left->type == PRINT_NULL) + if (arg->op.left->type == TEP_PRINT_NULL) left = 0; else ret = arg_num_eval(arg->op.left, &left); @@ -2400,11 +2402,11 @@ static int arg_num_eval(struct print_arg *arg, long long *val) } break; - case PRINT_NULL: - case PRINT_FIELD ... PRINT_SYMBOL: - case PRINT_STRING: - case PRINT_BSTRING: - case PRINT_BITMASK: + case TEP_PRINT_NULL: + case TEP_PRINT_FIELD ... TEP_PRINT_SYMBOL: + case TEP_PRINT_STRING: + case TEP_PRINT_BSTRING: + case TEP_PRINT_BITMASK: default: do_warning("invalid eval type %d", arg->type); ret = 0; @@ -2413,27 +2415,27 @@ static int arg_num_eval(struct print_arg *arg, long long *val) return ret; } -static char *arg_eval (struct print_arg *arg) +static char *arg_eval (struct tep_print_arg *arg) { long long val; static char buf[20]; switch (arg->type) { - case PRINT_ATOM: + case TEP_PRINT_ATOM: return arg->atom.atom; - case PRINT_TYPE: + case TEP_PRINT_TYPE: return arg_eval(arg->typecast.item); - case PRINT_OP: + case TEP_PRINT_OP: if (!arg_num_eval(arg, &val)) break; sprintf(buf, "%lld", val); return buf; - case PRINT_NULL: - case PRINT_FIELD ... PRINT_SYMBOL: - case PRINT_STRING: - case PRINT_BSTRING: - case PRINT_BITMASK: + case TEP_PRINT_NULL: + case TEP_PRINT_FIELD ... TEP_PRINT_SYMBOL: + case TEP_PRINT_STRING: + case TEP_PRINT_BSTRING: + case TEP_PRINT_BITMASK: default: do_warning("invalid eval type %d", arg->type); break; @@ -2442,19 +2444,19 @@ static char *arg_eval (struct print_arg *arg) return NULL; } -static enum event_type -process_fields(struct event_format *event, struct print_flag_sym **list, char **tok) +static enum tep_event_type +process_fields(struct tep_event_format *event, struct tep_print_flag_sym **list, char **tok) { - enum event_type type; - struct print_arg *arg = NULL; - struct print_flag_sym *field; + enum tep_event_type type; + struct tep_print_arg *arg = NULL; + struct tep_print_flag_sym *field; char *token = *tok; char *value; do { free_token(token); type = read_token_item(&token); - if (test_type_token(type, token, EVENT_OP, "{")) + if (test_type_token(type, token, TEP_EVENT_OP, "{")) break; arg = alloc_arg(); @@ -2464,13 +2466,13 @@ process_fields(struct event_format *event, struct print_flag_sym **list, char ** free_token(token); type = process_arg(event, arg, &token); - if (type == EVENT_OP) + if (type == TEP_EVENT_OP) type = process_op(event, arg, &token); - if (type == EVENT_ERROR) + if (type == TEP_EVENT_ERROR) goto out_free; - if (test_type_token(type, token, EVENT_DELIM, ",")) + if (test_type_token(type, token, TEP_EVENT_DELIM, ",")) goto out_free; field = calloc(1, sizeof(*field)); @@ -2491,7 +2493,7 @@ process_fields(struct event_format *event, struct print_flag_sym **list, char ** free_token(token); type = process_arg(event, arg, &token); - if (test_type_token(type, token, EVENT_OP, "}")) + if (test_type_token(type, token, TEP_EVENT_OP, "}")) goto out_free_field; value = arg_eval(arg); @@ -2508,7 +2510,7 @@ process_fields(struct event_format *event, struct print_flag_sym **list, char ** free_token(token); type = read_token_item(&token); - } while (type == EVENT_DELIM && strcmp(token, ",") == 0); + } while (type == TEP_EVENT_DELIM && strcmp(token, ",") == 0); *tok = token; return type; @@ -2520,18 +2522,18 @@ out_free: free_token(token); *tok = NULL; - return EVENT_ERROR; + return TEP_EVENT_ERROR; } -static enum event_type -process_flags(struct event_format *event, struct print_arg *arg, char **tok) +static enum tep_event_type +process_flags(struct tep_event_format *event, struct tep_print_arg *arg, char **tok) { - struct print_arg *field; - enum event_type type; + struct tep_print_arg *field; + enum tep_event_type type; char *token = NULL; memset(arg, 0, sizeof(*arg)); - arg->type = PRINT_FLAGS; + arg->type = TEP_PRINT_FLAGS; field = alloc_arg(); if (!field) { @@ -2542,10 +2544,10 @@ process_flags(struct event_format *event, struct print_arg *arg, char **tok) type = process_field_arg(event, field, &token); /* Handle operations in the first argument */ - while (type == EVENT_OP) + while (type == TEP_EVENT_OP) type = process_op(event, field, &token); - if (test_type_token(type, token, EVENT_DELIM, ",")) + if (test_type_token(type, token, TEP_EVENT_DELIM, ",")) goto out_free_field; free_token(token); @@ -2557,11 +2559,11 @@ process_flags(struct event_format *event, struct print_arg *arg, char **tok) type = read_token_item(&token); } - if (test_type_token(type, token, EVENT_DELIM, ",")) + if (test_type_token(type, token, TEP_EVENT_DELIM, ",")) goto out_free; type = process_fields(event, &arg->flags.flags, &token); - if (test_type_token(type, token, EVENT_DELIM, ")")) + if (test_type_token(type, token, TEP_EVENT_DELIM, ")")) goto out_free; free_token(token); @@ -2573,18 +2575,18 @@ out_free_field: out_free: free_token(token); *tok = NULL; - return EVENT_ERROR; + return TEP_EVENT_ERROR; } -static enum event_type -process_symbols(struct event_format *event, struct print_arg *arg, char **tok) +static enum tep_event_type +process_symbols(struct tep_event_format *event, struct tep_print_arg *arg, char **tok) { - struct print_arg *field; - enum event_type type; + struct tep_print_arg *field; + enum tep_event_type type; char *token = NULL; memset(arg, 0, sizeof(*arg)); - arg->type = PRINT_SYMBOL; + arg->type = TEP_PRINT_SYMBOL; field = alloc_arg(); if (!field) { @@ -2594,13 +2596,13 @@ process_symbols(struct event_format *event, struct print_arg *arg, char **tok) type = process_field_arg(event, field, &token); - if (test_type_token(type, token, EVENT_DELIM, ",")) + if (test_type_token(type, token, TEP_EVENT_DELIM, ",")) goto out_free_field; arg->symbol.field = field; type = process_fields(event, &arg->symbol.symbols, &token); - if (test_type_token(type, token, EVENT_DELIM, ")")) + if (test_type_token(type, token, TEP_EVENT_DELIM, ")")) goto out_free; free_token(token); @@ -2612,12 +2614,12 @@ out_free_field: out_free: free_token(token); *tok = NULL; - return EVENT_ERROR; + return TEP_EVENT_ERROR; } -static enum event_type -process_hex_common(struct event_format *event, struct print_arg *arg, - char **tok, enum print_arg_type type) +static enum tep_event_type +process_hex_common(struct tep_event_format *event, struct tep_print_arg *arg, + char **tok, enum tep_print_arg_type type) { memset(arg, 0, sizeof(*arg)); arg->type = type; @@ -2635,27 +2637,27 @@ free_field: arg->hex.field = NULL; out: *tok = NULL; - return EVENT_ERROR; + return TEP_EVENT_ERROR; } -static enum event_type -process_hex(struct event_format *event, struct print_arg *arg, char **tok) +static enum tep_event_type +process_hex(struct tep_event_format *event, struct tep_print_arg *arg, char **tok) { - return process_hex_common(event, arg, tok, PRINT_HEX); + return process_hex_common(event, arg, tok, TEP_PRINT_HEX); } -static enum event_type -process_hex_str(struct event_format *event, struct print_arg *arg, +static enum tep_event_type +process_hex_str(struct tep_event_format *event, struct tep_print_arg *arg, char **tok) { - return process_hex_common(event, arg, tok, PRINT_HEX_STR); + return process_hex_common(event, arg, tok, TEP_PRINT_HEX_STR); } -static enum event_type -process_int_array(struct event_format *event, struct print_arg *arg, char **tok) +static enum tep_event_type +process_int_array(struct tep_event_format *event, struct tep_print_arg *arg, char **tok) { memset(arg, 0, sizeof(*arg)); - arg->type = PRINT_INT_ARRAY; + arg->type = TEP_PRINT_INT_ARRAY; if (alloc_and_process_delim(event, ",", &arg->int_array.field)) goto out; @@ -2676,18 +2678,18 @@ free_field: arg->int_array.field = NULL; out: *tok = NULL; - return EVENT_ERROR; + return TEP_EVENT_ERROR; } -static enum event_type -process_dynamic_array(struct event_format *event, struct print_arg *arg, char **tok) +static enum tep_event_type +process_dynamic_array(struct tep_event_format *event, struct tep_print_arg *arg, char **tok) { - struct format_field *field; - enum event_type type; + struct tep_format_field *field; + enum tep_event_type type; char *token; memset(arg, 0, sizeof(*arg)); - arg->type = PRINT_DYNAMIC_ARRAY; + arg->type = TEP_PRINT_DYNAMIC_ARRAY; /* * The item within the parenthesis is another field that holds @@ -2695,7 +2697,7 @@ process_dynamic_array(struct event_format *event, struct print_arg *arg, char ** */ type = read_token(&token); *tok = token; - if (type != EVENT_ITEM) + if (type != TEP_EVENT_ITEM) goto out_free; /* Find the field */ @@ -2707,13 +2709,13 @@ process_dynamic_array(struct event_format *event, struct print_arg *arg, char ** arg->dynarray.field = field; arg->dynarray.index = 0; - if (read_expected(EVENT_DELIM, ")") < 0) + if (read_expected(TEP_EVENT_DELIM, ")") < 0) goto out_free; free_token(token); type = read_token_item(&token); *tok = token; - if (type != EVENT_OP || strcmp(token, "[") != 0) + if (type != TEP_EVENT_OP || strcmp(token, "[") != 0) return type; free_token(token); @@ -2721,14 +2723,14 @@ process_dynamic_array(struct event_format *event, struct print_arg *arg, char ** if (!arg) { do_warning_event(event, "%s: not enough memory!", __func__); *tok = NULL; - return EVENT_ERROR; + return TEP_EVENT_ERROR; } type = process_arg(event, arg, &token); - if (type == EVENT_ERROR) + if (type == TEP_EVENT_ERROR) goto out_free_arg; - if (!test_type_token(type, token, EVENT_OP, "]")) + if (!test_type_token(type, token, TEP_EVENT_OP, "]")) goto out_free_arg; free_token(token); @@ -2740,21 +2742,21 @@ process_dynamic_array(struct event_format *event, struct print_arg *arg, char ** out_free: free_token(token); *tok = NULL; - return EVENT_ERROR; + return TEP_EVENT_ERROR; } -static enum event_type -process_dynamic_array_len(struct event_format *event, struct print_arg *arg, +static enum tep_event_type +process_dynamic_array_len(struct tep_event_format *event, struct tep_print_arg *arg, char **tok) { - struct format_field *field; - enum event_type type; + struct tep_format_field *field; + enum tep_event_type type; char *token; - if (read_expect_type(EVENT_ITEM, &token) < 0) + if (read_expect_type(TEP_EVENT_ITEM, &token) < 0) goto out_free; - arg->type = PRINT_DYNAMIC_ARRAY_LEN; + arg->type = TEP_PRINT_DYNAMIC_ARRAY_LEN; /* Find the field */ field = tep_find_field(event, token); @@ -2764,7 +2766,7 @@ process_dynamic_array_len(struct event_format *event, struct print_arg *arg, arg->dynarray.field = field; arg->dynarray.index = 0; - if (read_expected(EVENT_DELIM, ")") < 0) + if (read_expected(TEP_EVENT_DELIM, ")") < 0) goto out_err; type = read_token(&token); @@ -2776,28 +2778,28 @@ process_dynamic_array_len(struct event_format *event, struct print_arg *arg, free_token(token); out_err: *tok = NULL; - return EVENT_ERROR; + return TEP_EVENT_ERROR; } -static enum event_type -process_paren(struct event_format *event, struct print_arg *arg, char **tok) +static enum tep_event_type +process_paren(struct tep_event_format *event, struct tep_print_arg *arg, char **tok) { - struct print_arg *item_arg; - enum event_type type; + struct tep_print_arg *item_arg; + enum tep_event_type type; char *token; type = process_arg(event, arg, &token); - if (type == EVENT_ERROR) + if (type == TEP_EVENT_ERROR) goto out_free; - if (type == EVENT_OP) + if (type == TEP_EVENT_OP) type = process_op(event, arg, &token); - if (type == EVENT_ERROR) + if (type == TEP_EVENT_ERROR) goto out_free; - if (test_type_token(type, token, EVENT_DELIM, ")")) + if (test_type_token(type, token, TEP_EVENT_DELIM, ")")) goto out_free; free_token(token); @@ -2808,13 +2810,13 @@ process_paren(struct event_format *event, struct print_arg *arg, char **tok) * this was a typecast. */ if (event_item_type(type) || - (type == EVENT_DELIM && strcmp(token, "(") == 0)) { + (type == TEP_EVENT_DELIM && strcmp(token, "(") == 0)) { /* make this a typecast and contine */ /* prevous must be an atom */ - if (arg->type != PRINT_ATOM) { - do_warning_event(event, "previous needed to be PRINT_ATOM"); + if (arg->type != TEP_PRINT_ATOM) { + do_warning_event(event, "previous needed to be TEP_PRINT_ATOM"); goto out_free; } @@ -2825,7 +2827,7 @@ process_paren(struct event_format *event, struct print_arg *arg, char **tok) goto out_free; } - arg->type = PRINT_TYPE; + arg->type = TEP_PRINT_TYPE; arg->typecast.type = arg->atom.atom; arg->typecast.item = item_arg; type = process_arg_token(event, item_arg, &token, type); @@ -2838,25 +2840,25 @@ process_paren(struct event_format *event, struct print_arg *arg, char **tok) out_free: free_token(token); *tok = NULL; - return EVENT_ERROR; + return TEP_EVENT_ERROR; } -static enum event_type -process_str(struct event_format *event __maybe_unused, struct print_arg *arg, +static enum tep_event_type +process_str(struct tep_event_format *event __maybe_unused, struct tep_print_arg *arg, char **tok) { - enum event_type type; + enum tep_event_type type; char *token; - if (read_expect_type(EVENT_ITEM, &token) < 0) + if (read_expect_type(TEP_EVENT_ITEM, &token) < 0) goto out_free; - arg->type = PRINT_STRING; + arg->type = TEP_PRINT_STRING; arg->string.string = token; arg->string.offset = -1; - if (read_expected(EVENT_DELIM, ")") < 0) + if (read_expected(TEP_EVENT_DELIM, ")") < 0) goto out_err; type = read_token(&token); @@ -2868,24 +2870,24 @@ process_str(struct event_format *event __maybe_unused, struct print_arg *arg, free_token(token); out_err: *tok = NULL; - return EVENT_ERROR; + return TEP_EVENT_ERROR; } -static enum event_type -process_bitmask(struct event_format *event __maybe_unused, struct print_arg *arg, - char **tok) +static enum tep_event_type +process_bitmask(struct tep_event_format *event __maybe_unused, struct tep_print_arg *arg, + char **tok) { - enum event_type type; + enum tep_event_type type; char *token; - if (read_expect_type(EVENT_ITEM, &token) < 0) + if (read_expect_type(TEP_EVENT_ITEM, &token) < 0) goto out_free; - arg->type = PRINT_BITMASK; + arg->type = TEP_PRINT_BITMASK; arg->bitmask.bitmask = token; arg->bitmask.offset = -1; - if (read_expected(EVENT_DELIM, ")") < 0) + if (read_expected(TEP_EVENT_DELIM, ")") < 0) goto out_err; type = read_token(&token); @@ -2897,7 +2899,7 @@ process_bitmask(struct event_format *event __maybe_unused, struct print_arg *arg free_token(token); out_err: *tok = NULL; - return EVENT_ERROR; + return TEP_EVENT_ERROR; } static struct tep_function_handler * @@ -2932,17 +2934,17 @@ static void remove_func_handler(struct tep_handle *pevent, char *func_name) } } -static enum event_type -process_func_handler(struct event_format *event, struct tep_function_handler *func, - struct print_arg *arg, char **tok) +static enum tep_event_type +process_func_handler(struct tep_event_format *event, struct tep_function_handler *func, + struct tep_print_arg *arg, char **tok) { - struct print_arg **next_arg; - struct print_arg *farg; - enum event_type type; + struct tep_print_arg **next_arg; + struct tep_print_arg *farg; + enum tep_event_type type; char *token; int i; - arg->type = PRINT_FUNC; + arg->type = TEP_PRINT_FUNC; arg->func.func = func; *tok = NULL; @@ -2953,12 +2955,12 @@ process_func_handler(struct event_format *event, struct tep_function_handler *fu if (!farg) { do_warning_event(event, "%s: not enough memory!", __func__); - return EVENT_ERROR; + return TEP_EVENT_ERROR; } type = process_arg(event, farg, &token); if (i < (func->nr_args - 1)) { - if (type != EVENT_DELIM || strcmp(token, ",") != 0) { + if (type != TEP_EVENT_DELIM || strcmp(token, ",") != 0) { do_warning_event(event, "Error: function '%s()' expects %d arguments but event %s only uses %d", func->name, func->nr_args, @@ -2966,7 +2968,7 @@ process_func_handler(struct event_format *event, struct tep_function_handler *fu goto err; } } else { - if (type != EVENT_DELIM || strcmp(token, ")") != 0) { + if (type != TEP_EVENT_DELIM || strcmp(token, ")") != 0) { do_warning_event(event, "Error: function '%s()' only expects %d arguments but event %s has more", func->name, func->nr_args, event->name); @@ -2987,11 +2989,11 @@ process_func_handler(struct event_format *event, struct tep_function_handler *fu err: free_arg(farg); free_token(token); - return EVENT_ERROR; + return TEP_EVENT_ERROR; } -static enum event_type -process_function(struct event_format *event, struct print_arg *arg, +static enum tep_event_type +process_function(struct tep_event_format *event, struct tep_print_arg *arg, char *token, char **tok) { struct tep_function_handler *func; @@ -3043,12 +3045,12 @@ process_function(struct event_format *event, struct print_arg *arg, do_warning_event(event, "function %s not defined", token); free_token(token); - return EVENT_ERROR; + return TEP_EVENT_ERROR; } -static enum event_type -process_arg_token(struct event_format *event, struct print_arg *arg, - char **tok, enum event_type type) +static enum tep_event_type +process_arg_token(struct tep_event_format *event, struct tep_print_arg *arg, + char **tok, enum tep_event_type type) { char *token; char *atom; @@ -3056,7 +3058,7 @@ process_arg_token(struct event_format *event, struct print_arg *arg, token = *tok; switch (type) { - case EVENT_ITEM: + case TEP_EVENT_ITEM: if (strcmp(token, "REC") == 0) { free_token(token); type = process_entry(event, arg, &token); @@ -3070,7 +3072,7 @@ process_arg_token(struct event_format *event, struct print_arg *arg, * If the next token is a parenthesis, then this * is a function. */ - if (type == EVENT_DELIM && strcmp(token, "(") == 0) { + if (type == TEP_EVENT_DELIM && strcmp(token, "(") == 0) { free_token(token); token = NULL; /* this will free atom. */ @@ -3078,7 +3080,7 @@ process_arg_token(struct event_format *event, struct print_arg *arg, break; } /* atoms can be more than one token long */ - while (type == EVENT_ITEM) { + while (type == TEP_EVENT_ITEM) { char *new_atom; new_atom = realloc(atom, strlen(atom) + strlen(token) + 2); @@ -3086,7 +3088,7 @@ process_arg_token(struct event_format *event, struct print_arg *arg, free(atom); *tok = NULL; free_token(token); - return EVENT_ERROR; + return TEP_EVENT_ERROR; } atom = new_atom; strcat(atom, " "); @@ -3095,55 +3097,55 @@ process_arg_token(struct event_format *event, struct print_arg *arg, type = read_token_item(&token); } - arg->type = PRINT_ATOM; + arg->type = TEP_PRINT_ATOM; arg->atom.atom = atom; break; - case EVENT_DQUOTE: - case EVENT_SQUOTE: - arg->type = PRINT_ATOM; + case TEP_EVENT_DQUOTE: + case TEP_EVENT_SQUOTE: + arg->type = TEP_PRINT_ATOM; arg->atom.atom = token; type = read_token_item(&token); break; - case EVENT_DELIM: + case TEP_EVENT_DELIM: if (strcmp(token, "(") == 0) { free_token(token); type = process_paren(event, arg, &token); break; } - case EVENT_OP: + case TEP_EVENT_OP: /* handle single ops */ - arg->type = PRINT_OP; + arg->type = TEP_PRINT_OP; arg->op.op = token; arg->op.left = NULL; type = process_op(event, arg, &token); /* On error, the op is freed */ - if (type == EVENT_ERROR) + if (type == TEP_EVENT_ERROR) arg->op.op = NULL; /* return error type if errored */ break; - case EVENT_ERROR ... EVENT_NEWLINE: + case TEP_EVENT_ERROR ... TEP_EVENT_NEWLINE: default: do_warning_event(event, "unexpected type %d", type); - return EVENT_ERROR; + return TEP_EVENT_ERROR; } *tok = token; return type; } -static int event_read_print_args(struct event_format *event, struct print_arg **list) +static int event_read_print_args(struct tep_event_format *event, struct tep_print_arg **list) { - enum event_type type = EVENT_ERROR; - struct print_arg *arg; + enum tep_event_type type = TEP_EVENT_ERROR; + struct tep_print_arg *arg; char *token; int args = 0; do { - if (type == EVENT_NEWLINE) { + if (type == TEP_EVENT_NEWLINE) { type = read_token_item(&token); continue; } @@ -3157,7 +3159,7 @@ static int event_read_print_args(struct event_format *event, struct print_arg ** type = process_arg(event, arg, &token); - if (type == EVENT_ERROR) { + if (type == TEP_EVENT_ERROR) { free_token(token); free_arg(arg); return -1; @@ -3166,10 +3168,10 @@ static int event_read_print_args(struct event_format *event, struct print_arg ** *list = arg; args++; - if (type == EVENT_OP) { + if (type == TEP_EVENT_OP) { type = process_op(event, arg, &token); free_token(token); - if (type == EVENT_ERROR) { + if (type == TEP_EVENT_ERROR) { *list = NULL; free_arg(arg); return -1; @@ -3178,37 +3180,37 @@ static int event_read_print_args(struct event_format *event, struct print_arg ** continue; } - if (type == EVENT_DELIM && strcmp(token, ",") == 0) { + if (type == TEP_EVENT_DELIM && strcmp(token, ",") == 0) { free_token(token); *list = arg; list = &arg->next; continue; } break; - } while (type != EVENT_NONE); + } while (type != TEP_EVENT_NONE); - if (type != EVENT_NONE && type != EVENT_ERROR) + if (type != TEP_EVENT_NONE && type != TEP_EVENT_ERROR) free_token(token); return args; } -static int event_read_print(struct event_format *event) +static int event_read_print(struct tep_event_format *event) { - enum event_type type; + enum tep_event_type type; char *token; int ret; - if (read_expected_item(EVENT_ITEM, "print") < 0) + if (read_expected_item(TEP_EVENT_ITEM, "print") < 0) return -1; - if (read_expected(EVENT_ITEM, "fmt") < 0) + if (read_expected(TEP_EVENT_ITEM, "fmt") < 0) return -1; - if (read_expected(EVENT_OP, ":") < 0) + if (read_expected(TEP_EVENT_OP, ":") < 0) return -1; - if (read_expect_type(EVENT_DQUOTE, &token) < 0) + if (read_expect_type(TEP_EVENT_DQUOTE, &token) < 0) goto fail; concat: @@ -3218,11 +3220,11 @@ static int event_read_print(struct event_format *event) /* ok to have no arg */ type = read_token_item(&token); - if (type == EVENT_NONE) + if (type == TEP_EVENT_NONE) return 0; /* Handle concatenation of print lines */ - if (type == EVENT_DQUOTE) { + if (type == TEP_EVENT_DQUOTE) { char *cat; if (asprintf(&cat, "%s%s", event->print_fmt.format, token) < 0) @@ -3234,7 +3236,7 @@ static int event_read_print(struct event_format *event) goto concat; } - if (test_type_token(type, token, EVENT_DELIM, ",")) + if (test_type_token(type, token, TEP_EVENT_DELIM, ",")) goto fail; free_token(token); @@ -3258,10 +3260,10 @@ static int event_read_print(struct event_format *event) * Returns a common field from the event by the given @name. * This only searchs the common fields and not all field. */ -struct format_field * -tep_find_common_field(struct event_format *event, const char *name) +struct tep_format_field * +tep_find_common_field(struct tep_event_format *event, const char *name) { - struct format_field *format; + struct tep_format_field *format; for (format = event->format.common_fields; format; format = format->next) { @@ -3280,10 +3282,10 @@ tep_find_common_field(struct event_format *event, const char *name) * Returns a non-common field by the given @name. * This does not search common fields. */ -struct format_field * -tep_find_field(struct event_format *event, const char *name) +struct tep_format_field * +tep_find_field(struct tep_event_format *event, const char *name) { - struct format_field *format; + struct tep_format_field *format; for (format = event->format.fields; format; format = format->next) { @@ -3303,10 +3305,10 @@ tep_find_field(struct event_format *event, const char *name) * This searchs the common field names first, then * the non-common ones if a common one was not found. */ -struct format_field * -tep_find_any_field(struct event_format *event, const char *name) +struct tep_format_field * +tep_find_any_field(struct tep_event_format *event, const char *name) { - struct format_field *format; + struct tep_format_field *format; format = tep_find_common_field(event, name); if (format) @@ -3330,11 +3332,11 @@ unsigned long long tep_read_number(struct tep_handle *pevent, case 1: return *(unsigned char *)ptr; case 2: - return data2host2(pevent, ptr); + return tep_data2host2(pevent, ptr); case 4: - return data2host4(pevent, ptr); + return tep_data2host4(pevent, ptr); case 8: - return data2host8(pevent, ptr); + return tep_data2host8(pevent, ptr); default: /* BUG! */ return 0; @@ -3352,7 +3354,7 @@ unsigned long long tep_read_number(struct tep_handle *pevent, * * Returns 0 on success, -1 otherwise. */ -int tep_read_number_field(struct format_field *field, const void *data, +int tep_read_number_field(struct tep_format_field *field, const void *data, unsigned long long *value) { if (!field) @@ -3373,8 +3375,8 @@ int tep_read_number_field(struct format_field *field, const void *data, static int get_common_info(struct tep_handle *pevent, const char *type, int *offset, int *size) { - struct event_format *event; - struct format_field *field; + struct tep_event_format *event; + struct tep_format_field *field; /* * All events should have the same common elements. @@ -3460,11 +3462,11 @@ static int events_id_cmp(const void *a, const void *b); * * Returns an event that has a given @id. */ -struct event_format *tep_find_event(struct tep_handle *pevent, int id) +struct tep_event_format *tep_find_event(struct tep_handle *pevent, int id) { - struct event_format **eventptr; - struct event_format key; - struct event_format *pkey = &key; + struct tep_event_format **eventptr; + struct tep_event_format key; + struct tep_event_format *pkey = &key; /* Check cache first */ if (pevent->last_event && pevent->last_event->id == id) @@ -3492,11 +3494,11 @@ struct event_format *tep_find_event(struct tep_handle *pevent, int id) * This returns an event with a given @name and under the system * @sys. If @sys is NULL the first event with @name is returned. */ -struct event_format * +struct tep_event_format * tep_find_event_by_name(struct tep_handle *pevent, const char *sys, const char *name) { - struct event_format *event; + struct tep_event_format *event; int i; if (pevent->last_event && @@ -3521,23 +3523,23 @@ tep_find_event_by_name(struct tep_handle *pevent, } static unsigned long long -eval_num_arg(void *data, int size, struct event_format *event, struct print_arg *arg) +eval_num_arg(void *data, int size, struct tep_event_format *event, struct tep_print_arg *arg) { struct tep_handle *pevent = event->pevent; unsigned long long val = 0; unsigned long long left, right; - struct print_arg *typearg = NULL; - struct print_arg *larg; + struct tep_print_arg *typearg = NULL; + struct tep_print_arg *larg; unsigned long offset; unsigned int field_size; switch (arg->type) { - case PRINT_NULL: + case TEP_PRINT_NULL: /* ?? */ return 0; - case PRINT_ATOM: + case TEP_PRINT_ATOM: return strtoull(arg->atom.atom, NULL, 0); - case PRINT_FIELD: + case TEP_PRINT_FIELD: if (!arg->field.field) { arg->field.field = tep_find_any_field(event, arg->field.name); if (!arg->field.field) @@ -3548,27 +3550,27 @@ eval_num_arg(void *data, int size, struct event_format *event, struct print_arg val = tep_read_number(pevent, data + arg->field.field->offset, arg->field.field->size); break; - case PRINT_FLAGS: - case PRINT_SYMBOL: - case PRINT_INT_ARRAY: - case PRINT_HEX: - case PRINT_HEX_STR: + case TEP_PRINT_FLAGS: + case TEP_PRINT_SYMBOL: + case TEP_PRINT_INT_ARRAY: + case TEP_PRINT_HEX: + case TEP_PRINT_HEX_STR: break; - case PRINT_TYPE: + case TEP_PRINT_TYPE: val = eval_num_arg(data, size, event, arg->typecast.item); return eval_type(val, arg, 0); - case PRINT_STRING: - case PRINT_BSTRING: - case PRINT_BITMASK: + case TEP_PRINT_STRING: + case TEP_PRINT_BSTRING: + case TEP_PRINT_BITMASK: return 0; - case PRINT_FUNC: { + case TEP_PRINT_FUNC: { struct trace_seq s; trace_seq_init(&s); val = process_defined_func(&s, data, size, event, arg); trace_seq_destroy(&s); return val; } - case PRINT_OP: + case TEP_PRINT_OP: if (strcmp(arg->op.op, "[") == 0) { /* * Arrays are special, since we don't want @@ -3578,7 +3580,7 @@ eval_num_arg(void *data, int size, struct event_format *event, struct print_arg /* handle typecasts */ larg = arg->op.left; - while (larg->type == PRINT_TYPE) { + while (larg->type == TEP_PRINT_TYPE) { if (!typearg) typearg = larg; larg = larg->typecast.item; @@ -3588,7 +3590,7 @@ eval_num_arg(void *data, int size, struct event_format *event, struct print_arg field_size = pevent->long_size; switch (larg->type) { - case PRINT_DYNAMIC_ARRAY: + case TEP_PRINT_DYNAMIC_ARRAY: offset = tep_read_number(pevent, data + larg->dynarray.field->offset, larg->dynarray.field->size); @@ -3602,7 +3604,7 @@ eval_num_arg(void *data, int size, struct event_format *event, struct print_arg offset &= 0xffff; offset += right; break; - case PRINT_FIELD: + case TEP_PRINT_FIELD: if (!larg->field.field) { larg->field.field = tep_find_any_field(event, larg->field.name); @@ -3718,7 +3720,7 @@ eval_num_arg(void *data, int size, struct event_format *event, struct print_arg goto out_warning_op; } break; - case PRINT_DYNAMIC_ARRAY_LEN: + case TEP_PRINT_DYNAMIC_ARRAY_LEN: offset = tep_read_number(pevent, data + arg->dynarray.field->offset, arg->dynarray.field->size); @@ -3729,7 +3731,7 @@ eval_num_arg(void *data, int size, struct event_format *event, struct print_arg */ val = (unsigned long long)(offset >> 16); break; - case PRINT_DYNAMIC_ARRAY: + case TEP_PRINT_DYNAMIC_ARRAY: /* Without [], we pass the address to the dynamic data */ offset = tep_read_number(pevent, data + arg->dynarray.field->offset, @@ -3861,12 +3863,12 @@ static void print_bitmask_to_seq(struct tep_handle *pevent, } static void print_str_arg(struct trace_seq *s, void *data, int size, - struct event_format *event, const char *format, - int len_arg, struct print_arg *arg) + struct tep_event_format *event, const char *format, + int len_arg, struct tep_print_arg *arg) { struct tep_handle *pevent = event->pevent; - struct print_flag_sym *flag; - struct format_field *field; + struct tep_print_flag_sym *flag; + struct tep_format_field *field; struct printk_map *printk; long long val, fval; unsigned long long addr; @@ -3876,13 +3878,13 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, int i, len; switch (arg->type) { - case PRINT_NULL: + case TEP_PRINT_NULL: /* ?? */ return; - case PRINT_ATOM: + case TEP_PRINT_ATOM: print_str_to_seq(s, format, len_arg, arg->atom.atom); return; - case PRINT_FIELD: + case TEP_PRINT_FIELD: field = arg->field.field; if (!field) { field = tep_find_any_field(event, arg->field.name); @@ -3900,7 +3902,7 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, * and the size is the same as long_size, assume that it * is a pointer. */ - if (!(field->flags & FIELD_IS_ARRAY) && + if (!(field->flags & TEP_FIELD_IS_ARRAY) && field->size == pevent->long_size) { /* Handle heterogeneous recording and processing @@ -3939,7 +3941,7 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, print_str_to_seq(s, format, len_arg, str); free(str); break; - case PRINT_FLAGS: + case TEP_PRINT_FLAGS: val = eval_num_arg(data, size, event, arg->flags.field); print = 0; for (flag = arg->flags.flags; flag; flag = flag->next) { @@ -3962,7 +3964,7 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, trace_seq_printf(s, "0x%llx", val); } break; - case PRINT_SYMBOL: + case TEP_PRINT_SYMBOL: val = eval_num_arg(data, size, event, arg->symbol.field); for (flag = arg->symbol.symbols; flag; flag = flag->next) { fval = eval_flag(flag->value); @@ -3974,9 +3976,9 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, if (!flag) trace_seq_printf(s, "0x%llx", val); break; - case PRINT_HEX: - case PRINT_HEX_STR: - if (arg->hex.field->type == PRINT_DYNAMIC_ARRAY) { + case TEP_PRINT_HEX: + case TEP_PRINT_HEX_STR: + if (arg->hex.field->type == TEP_PRINT_DYNAMIC_ARRAY) { unsigned long offset; offset = tep_read_number(pevent, data + arg->hex.field->dynarray.field->offset, @@ -3995,19 +3997,19 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, } len = eval_num_arg(data, size, event, arg->hex.size); for (i = 0; i < len; i++) { - if (i && arg->type == PRINT_HEX) + if (i && arg->type == TEP_PRINT_HEX) trace_seq_putc(s, ' '); trace_seq_printf(s, "%02x", hex[i]); } break; - case PRINT_INT_ARRAY: { + case TEP_PRINT_INT_ARRAY: { void *num; int el_size; - if (arg->int_array.field->type == PRINT_DYNAMIC_ARRAY) { + if (arg->int_array.field->type == TEP_PRINT_DYNAMIC_ARRAY) { unsigned long offset; - struct format_field *field = + struct tep_format_field *field = arg->int_array.field->dynarray.field; offset = tep_read_number(pevent, data + field->offset, @@ -4049,43 +4051,43 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, } break; } - case PRINT_TYPE: + case TEP_PRINT_TYPE: break; - case PRINT_STRING: { + case TEP_PRINT_STRING: { int str_offset; if (arg->string.offset == -1) { - struct format_field *f; + struct tep_format_field *f; f = tep_find_any_field(event, arg->string.string); arg->string.offset = f->offset; } - str_offset = data2host4(pevent, data + arg->string.offset); + str_offset = tep_data2host4(pevent, data + arg->string.offset); str_offset &= 0xffff; print_str_to_seq(s, format, len_arg, ((char *)data) + str_offset); break; } - case PRINT_BSTRING: + case TEP_PRINT_BSTRING: print_str_to_seq(s, format, len_arg, arg->string.string); break; - case PRINT_BITMASK: { + case TEP_PRINT_BITMASK: { int bitmask_offset; int bitmask_size; if (arg->bitmask.offset == -1) { - struct format_field *f; + struct tep_format_field *f; f = tep_find_any_field(event, arg->bitmask.bitmask); arg->bitmask.offset = f->offset; } - bitmask_offset = data2host4(pevent, data + arg->bitmask.offset); + bitmask_offset = tep_data2host4(pevent, data + arg->bitmask.offset); bitmask_size = bitmask_offset >> 16; bitmask_offset &= 0xffff; print_bitmask_to_seq(pevent, s, format, len_arg, data + bitmask_offset, bitmask_size); break; } - case PRINT_OP: + case TEP_PRINT_OP: /* * The only op for string should be ? : */ @@ -4099,7 +4101,7 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, print_str_arg(s, data, size, event, format, len_arg, arg->op.right->op.right); break; - case PRINT_FUNC: + case TEP_PRINT_FUNC: process_defined_func(s, data, size, event, arg); break; default: @@ -4116,13 +4118,13 @@ out_warning_field: static unsigned long long process_defined_func(struct trace_seq *s, void *data, int size, - struct event_format *event, struct print_arg *arg) + struct tep_event_format *event, struct tep_print_arg *arg) { struct tep_function_handler *func_handle = arg->func.func; struct func_params *param; unsigned long long *args; unsigned long long ret; - struct print_arg *farg; + struct tep_print_arg *farg; struct trace_seq str; struct save_str { struct save_str *next; @@ -4199,9 +4201,9 @@ out_free: return ret; } -static void free_args(struct print_arg *args) +static void free_args(struct tep_print_arg *args) { - struct print_arg *next; + struct tep_print_arg *next; while (args) { next = args->next; @@ -4211,11 +4213,11 @@ static void free_args(struct print_arg *args) } } -static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struct event_format *event) +static struct tep_print_arg *make_bprint_args(char *fmt, void *data, int size, struct tep_event_format *event) { struct tep_handle *pevent = event->pevent; - struct format_field *field, *ip_field; - struct print_arg *args, *arg, **next; + struct tep_format_field *field, *ip_field; + struct tep_print_arg *args, *arg, **next; unsigned long long ip, val; char *ptr; void *bptr; @@ -4254,7 +4256,7 @@ static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struc arg->next = NULL; next = &arg->next; - arg->type = PRINT_ATOM; + arg->type = TEP_PRINT_ATOM; if (asprintf(&arg->atom.atom, "%lld", ip) < 0) goto out_free; @@ -4342,7 +4344,7 @@ static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struc goto out_free; } arg->next = NULL; - arg->type = PRINT_ATOM; + arg->type = TEP_PRINT_ATOM; if (asprintf(&arg->atom.atom, "%lld", val) < 0) { free(arg); goto out_free; @@ -4366,7 +4368,7 @@ static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struc goto out_free; } arg->next = NULL; - arg->type = PRINT_BSTRING; + arg->type = TEP_PRINT_BSTRING; arg->string.string = strdup(bptr); if (!arg->string.string) goto out_free; @@ -4388,11 +4390,11 @@ out_free: static char * get_bprint_format(void *data, int size __maybe_unused, - struct event_format *event) + struct tep_event_format *event) { struct tep_handle *pevent = event->pevent; unsigned long long addr; - struct format_field *field; + struct tep_format_field *field; struct printk_map *printk; char *format; @@ -4423,17 +4425,17 @@ get_bprint_format(void *data, int size __maybe_unused, } static void print_mac_arg(struct trace_seq *s, int mac, void *data, int size, - struct event_format *event, struct print_arg *arg) + struct tep_event_format *event, struct tep_print_arg *arg) { unsigned char *buf; const char *fmt = "%.2x:%.2x:%.2x:%.2x:%.2x:%.2x"; - if (arg->type == PRINT_FUNC) { + if (arg->type == TEP_PRINT_FUNC) { process_defined_func(s, data, size, event, arg); return; } - if (arg->type != PRINT_FIELD) { + if (arg->type != TEP_PRINT_FIELD) { trace_seq_printf(s, "ARG TYPE NOT FIELD BUT %d", arg->type); return; @@ -4576,17 +4578,17 @@ static void print_ip6_addr(struct trace_seq *s, char i, unsigned char *buf) * %pISpc print an IP address based on sockaddr; p adds port. */ static int print_ipv4_arg(struct trace_seq *s, const char *ptr, char i, - void *data, int size, struct event_format *event, - struct print_arg *arg) + void *data, int size, struct tep_event_format *event, + struct tep_print_arg *arg) { unsigned char *buf; - if (arg->type == PRINT_FUNC) { + if (arg->type == TEP_PRINT_FUNC) { process_defined_func(s, data, size, event, arg); return 0; } - if (arg->type != PRINT_FIELD) { + if (arg->type != TEP_PRINT_FIELD) { trace_seq_printf(s, "ARG TYPE NOT FIELD BUT %d", arg->type); return 0; } @@ -4613,8 +4615,8 @@ static int print_ipv4_arg(struct trace_seq *s, const char *ptr, char i, } static int print_ipv6_arg(struct trace_seq *s, const char *ptr, char i, - void *data, int size, struct event_format *event, - struct print_arg *arg) + void *data, int size, struct tep_event_format *event, + struct tep_print_arg *arg) { char have_c = 0; unsigned char *buf; @@ -4627,12 +4629,12 @@ static int print_ipv6_arg(struct trace_seq *s, const char *ptr, char i, rc++; } - if (arg->type == PRINT_FUNC) { + if (arg->type == TEP_PRINT_FUNC) { process_defined_func(s, data, size, event, arg); return rc; } - if (arg->type != PRINT_FIELD) { + if (arg->type != TEP_PRINT_FIELD) { trace_seq_printf(s, "ARG TYPE NOT FIELD BUT %d", arg->type); return rc; } @@ -4663,8 +4665,8 @@ static int print_ipv6_arg(struct trace_seq *s, const char *ptr, char i, } static int print_ipsa_arg(struct trace_seq *s, const char *ptr, char i, - void *data, int size, struct event_format *event, - struct print_arg *arg) + void *data, int size, struct tep_event_format *event, + struct tep_print_arg *arg) { char have_c = 0, have_p = 0; unsigned char *buf; @@ -4685,12 +4687,12 @@ static int print_ipsa_arg(struct trace_seq *s, const char *ptr, char i, } } - if (arg->type == PRINT_FUNC) { + if (arg->type == TEP_PRINT_FUNC) { process_defined_func(s, data, size, event, arg); return rc; } - if (arg->type != PRINT_FIELD) { + if (arg->type != TEP_PRINT_FIELD) { trace_seq_printf(s, "ARG TYPE NOT FIELD BUT %d", arg->type); return rc; } @@ -4745,8 +4747,8 @@ static int print_ipsa_arg(struct trace_seq *s, const char *ptr, char i, } static int print_ip_arg(struct trace_seq *s, const char *ptr, - void *data, int size, struct event_format *event, - struct print_arg *arg) + void *data, int size, struct tep_event_format *event, + struct tep_print_arg *arg) { char i = *ptr; /* 'i' or 'I' */ char ver; @@ -4787,22 +4789,22 @@ static int is_printable_array(char *p, unsigned int len) } void tep_print_field(struct trace_seq *s, void *data, - struct format_field *field) + struct tep_format_field *field) { unsigned long long val; unsigned int offset, len, i; struct tep_handle *pevent = field->event->pevent; - if (field->flags & FIELD_IS_ARRAY) { + if (field->flags & TEP_FIELD_IS_ARRAY) { offset = field->offset; len = field->size; - if (field->flags & FIELD_IS_DYNAMIC) { + if (field->flags & TEP_FIELD_IS_DYNAMIC) { val = tep_read_number(pevent, data + offset, len); offset = val; len = offset >> 16; offset &= 0xffff; } - if (field->flags & FIELD_IS_STRING && + if (field->flags & TEP_FIELD_IS_STRING && is_printable_array(data + offset, len)) { trace_seq_printf(s, "%s", (char *)data + offset); } else { @@ -4814,21 +4816,21 @@ void tep_print_field(struct trace_seq *s, void *data, *((unsigned char *)data + offset + i)); } trace_seq_putc(s, ']'); - field->flags &= ~FIELD_IS_STRING; + field->flags &= ~TEP_FIELD_IS_STRING; } } else { val = tep_read_number(pevent, data + field->offset, field->size); - if (field->flags & FIELD_IS_POINTER) { + if (field->flags & TEP_FIELD_IS_POINTER) { trace_seq_printf(s, "0x%llx", val); - } else if (field->flags & FIELD_IS_SIGNED) { + } else if (field->flags & TEP_FIELD_IS_SIGNED) { switch (field->size) { case 4: /* * If field is long then print it in hex. * A long usually stores pointers. */ - if (field->flags & FIELD_IS_LONG) + if (field->flags & TEP_FIELD_IS_LONG) trace_seq_printf(s, "0x%x", (int)val); else trace_seq_printf(s, "%d", (int)val); @@ -4843,7 +4845,7 @@ void tep_print_field(struct trace_seq *s, void *data, trace_seq_printf(s, "%lld", val); } } else { - if (field->flags & FIELD_IS_LONG) + if (field->flags & TEP_FIELD_IS_LONG) trace_seq_printf(s, "0x%llx", val); else trace_seq_printf(s, "%llu", val); @@ -4852,9 +4854,9 @@ void tep_print_field(struct trace_seq *s, void *data, } void tep_print_fields(struct trace_seq *s, void *data, - int size __maybe_unused, struct event_format *event) + int size __maybe_unused, struct tep_event_format *event) { - struct format_field *field; + struct tep_format_field *field; field = event->format.fields; while (field) { @@ -4864,12 +4866,12 @@ void tep_print_fields(struct trace_seq *s, void *data, } } -static void pretty_print(struct trace_seq *s, void *data, int size, struct event_format *event) +static void pretty_print(struct trace_seq *s, void *data, int size, struct tep_event_format *event) { struct tep_handle *pevent = event->pevent; - struct print_fmt *print_fmt = &event->print_fmt; - struct print_arg *arg = print_fmt->args; - struct print_arg *args = NULL; + struct tep_print_fmt *print_fmt = &event->print_fmt; + struct tep_print_arg *arg = print_fmt->args; + struct tep_print_arg *args = NULL; const char *ptr = print_fmt->format; unsigned long long val; struct func_map *func; @@ -4883,13 +4885,13 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event int len; int ls; - if (event->flags & EVENT_FL_FAILED) { + if (event->flags & TEP_EVENT_FL_FAILED) { trace_seq_printf(s, "[FAILED TO PARSE]"); tep_print_fields(s, data, size, event); return; } - if (event->flags & EVENT_FL_ISBPRINT) { + if (event->flags & TEP_EVENT_FL_ISBPRINT) { bprint_fmt = get_bprint_format(data, size, event); args = make_bprint_args(bprint_fmt, data, size, event); arg = args; @@ -4944,7 +4946,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event /* The argument is the length. */ if (!arg) { do_warning_event(event, "no argument match"); - event->flags |= EVENT_FL_FAILED; + event->flags |= TEP_EVENT_FL_FAILED; goto out_failed; } len_arg = eval_num_arg(data, size, event, arg); @@ -4966,7 +4968,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event if (isalnum(ptr[1])) ptr++; - if (arg->type == PRINT_BSTRING) { + if (arg->type == TEP_PRINT_BSTRING) { trace_seq_puts(s, arg->string.string); break; } @@ -4997,7 +4999,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event case 'u': if (!arg) { do_warning_event(event, "no argument match"); - event->flags |= EVENT_FL_FAILED; + event->flags |= TEP_EVENT_FL_FAILED; goto out_failed; } @@ -5007,7 +5009,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event /* should never happen */ if (len > 31) { do_warning_event(event, "bad format!"); - event->flags |= EVENT_FL_FAILED; + event->flags |= TEP_EVENT_FL_FAILED; len = 31; } @@ -5073,13 +5075,13 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event break; default: do_warning_event(event, "bad count (%d)", ls); - event->flags |= EVENT_FL_FAILED; + event->flags |= TEP_EVENT_FL_FAILED; } break; case 's': if (!arg) { do_warning_event(event, "no matching argument"); - event->flags |= EVENT_FL_FAILED; + event->flags |= TEP_EVENT_FL_FAILED; goto out_failed; } @@ -5089,7 +5091,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event /* should never happen */ if (len > 31) { do_warning_event(event, "bad format!"); - event->flags |= EVENT_FL_FAILED; + event->flags |= TEP_EVENT_FL_FAILED; len = 31; } @@ -5114,7 +5116,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event trace_seq_putc(s, *ptr); } - if (event->flags & EVENT_FL_FAILED) { + if (event->flags & TEP_EVENT_FL_FAILED) { out_failed: trace_seq_printf(s, "[FAILED TO PARSE]"); } @@ -5227,7 +5229,7 @@ int tep_data_type(struct tep_handle *pevent, struct tep_record *rec) * * This returns the event form a given @type; */ -struct event_format *tep_data_event_from_type(struct tep_handle *pevent, int type) +struct tep_event_format *tep_data_event_from_type(struct tep_handle *pevent, int type) { return tep_find_event(pevent, type); } @@ -5385,16 +5387,16 @@ int tep_cmdline_pid(struct tep_handle *pevent, struct cmdline *cmdline) * This parses the raw @data using the given @event information and * writes the print format into the trace_seq. */ -void tep_event_info(struct trace_seq *s, struct event_format *event, +void tep_event_info(struct trace_seq *s, struct tep_event_format *event, struct tep_record *record) { int print_pretty = 1; - if (event->pevent->print_raw || (event->flags & EVENT_FL_PRINTRAW)) + if (event->pevent->print_raw || (event->flags & TEP_EVENT_FL_PRINTRAW)) tep_print_fields(s, record->data, record->size, event); else { - if (event->handler && !(event->flags & EVENT_FL_NOHANDLE)) + if (event->handler && !(event->flags & TEP_EVENT_FL_NOHANDLE)) print_pretty = event->handler(s, record, event, event->context); @@ -5426,7 +5428,7 @@ static bool is_timestamp_in_us(char *trace_clock, bool use_trace_clock) * Returns the associated event for a given record, or NULL if non is * is found. */ -struct event_format * +struct tep_event_format * tep_find_event_by_record(struct tep_handle *pevent, struct tep_record *record) { int type; @@ -5451,7 +5453,7 @@ tep_find_event_by_record(struct tep_handle *pevent, struct tep_record *record) * Writes the tasks comm, pid and CPU to @s. */ void tep_print_event_task(struct tep_handle *pevent, struct trace_seq *s, - struct event_format *event, + struct tep_event_format *event, struct tep_record *record) { void *data = record->data; @@ -5479,7 +5481,7 @@ void tep_print_event_task(struct tep_handle *pevent, struct trace_seq *s, * Writes the timestamp of the record into @s. */ void tep_print_event_time(struct tep_handle *pevent, struct trace_seq *s, - struct event_format *event, + struct tep_event_format *event, struct tep_record *record, bool use_trace_clock) { @@ -5529,7 +5531,7 @@ void tep_print_event_time(struct tep_handle *pevent, struct trace_seq *s, * Writes the parsing of the record's data to @s. */ void tep_print_event_data(struct tep_handle *pevent, struct trace_seq *s, - struct event_format *event, + struct tep_event_format *event, struct tep_record *record) { static const char *spaces = " "; /* 20 spaces */ @@ -5548,7 +5550,7 @@ void tep_print_event_data(struct tep_handle *pevent, struct trace_seq *s, void tep_print_event(struct tep_handle *pevent, struct trace_seq *s, struct tep_record *record, bool use_trace_clock) { - struct event_format *event; + struct tep_event_format *event; event = tep_find_event_by_record(pevent, record); if (!event) { @@ -5570,8 +5572,8 @@ void tep_print_event(struct tep_handle *pevent, struct trace_seq *s, static int events_id_cmp(const void *a, const void *b) { - struct event_format * const * ea = a; - struct event_format * const * eb = b; + struct tep_event_format * const * ea = a; + struct tep_event_format * const * eb = b; if ((*ea)->id < (*eb)->id) return -1; @@ -5584,8 +5586,8 @@ static int events_id_cmp(const void *a, const void *b) static int events_name_cmp(const void *a, const void *b) { - struct event_format * const * ea = a; - struct event_format * const * eb = b; + struct tep_event_format * const * ea = a; + struct tep_event_format * const * eb = b; int res; res = strcmp((*ea)->name, (*eb)->name); @@ -5601,8 +5603,8 @@ static int events_name_cmp(const void *a, const void *b) static int events_system_cmp(const void *a, const void *b) { - struct event_format * const * ea = a; - struct event_format * const * eb = b; + struct tep_event_format * const * ea = a; + struct tep_event_format * const * eb = b; int res; res = strcmp((*ea)->system, (*eb)->system); @@ -5616,9 +5618,9 @@ static int events_system_cmp(const void *a, const void *b) return events_id_cmp(a, b); } -struct event_format **tep_list_events(struct tep_handle *pevent, enum event_sort_type sort_type) +struct tep_event_format **tep_list_events(struct tep_handle *pevent, enum tep_event_sort_type sort_type) { - struct event_format **events; + struct tep_event_format **events; int (*sort)(const void *a, const void *b); events = pevent->sort_events; @@ -5637,20 +5639,20 @@ struct event_format **tep_list_events(struct tep_handle *pevent, enum event_sort pevent->sort_events = events; /* the internal events are sorted by id */ - if (sort_type == EVENT_SORT_ID) { + if (sort_type == TEP_EVENT_SORT_ID) { pevent->last_type = sort_type; return events; } } switch (sort_type) { - case EVENT_SORT_ID: + case TEP_EVENT_SORT_ID: sort = events_id_cmp; break; - case EVENT_SORT_NAME: + case TEP_EVENT_SORT_NAME: sort = events_name_cmp; break; - case EVENT_SORT_SYSTEM: + case TEP_EVENT_SORT_SYSTEM: sort = events_system_cmp; break; default: @@ -5663,12 +5665,12 @@ struct event_format **tep_list_events(struct tep_handle *pevent, enum event_sort return events; } -static struct format_field ** +static struct tep_format_field ** get_event_fields(const char *type, const char *name, - int count, struct format_field *list) + int count, struct tep_format_field *list) { - struct format_field **fields; - struct format_field *field; + struct tep_format_field **fields; + struct tep_format_field *field; int i = 0; fields = malloc(sizeof(*fields) * (count + 1)); @@ -5701,7 +5703,7 @@ get_event_fields(const char *type, const char *name, * Returns an allocated array of fields. The last item in the array is NULL. * The array must be freed with free(). */ -struct format_field **tep_event_common_fields(struct event_format *event) +struct tep_format_field **tep_event_common_fields(struct tep_event_format *event) { return get_event_fields("common", event->name, event->format.nr_common, @@ -5715,14 +5717,14 @@ struct format_field **tep_event_common_fields(struct event_format *event) * Returns an allocated array of fields. The last item in the array is NULL. * The array must be freed with free(). */ -struct format_field **tep_event_fields(struct event_format *event) +struct tep_format_field **tep_event_fields(struct tep_event_format *event) { return get_event_fields("event", event->name, event->format.nr_fields, event->format.fields); } -static void print_fields(struct trace_seq *s, struct print_flag_sym *field) +static void print_fields(struct trace_seq *s, struct tep_print_flag_sym *field) { trace_seq_printf(s, "{ %s, %s }", field->value, field->str); if (field->next) { @@ -5732,22 +5734,22 @@ static void print_fields(struct trace_seq *s, struct print_flag_sym *field) } /* for debugging */ -static void print_args(struct print_arg *args) +static void print_args(struct tep_print_arg *args) { int print_paren = 1; struct trace_seq s; switch (args->type) { - case PRINT_NULL: + case TEP_PRINT_NULL: printf("null"); break; - case PRINT_ATOM: + case TEP_PRINT_ATOM: printf("%s", args->atom.atom); break; - case PRINT_FIELD: + case TEP_PRINT_FIELD: printf("REC->%s", args->field.name); break; - case PRINT_FLAGS: + case TEP_PRINT_FLAGS: printf("__print_flags("); print_args(args->flags.field); printf(", %s, ", args->flags.delim); @@ -5757,7 +5759,7 @@ static void print_args(struct print_arg *args) trace_seq_destroy(&s); printf(")"); break; - case PRINT_SYMBOL: + case TEP_PRINT_SYMBOL: printf("__print_symbolic("); print_args(args->symbol.field); printf(", "); @@ -5767,21 +5769,21 @@ static void print_args(struct print_arg *args) trace_seq_destroy(&s); printf(")"); break; - case PRINT_HEX: + case TEP_PRINT_HEX: printf("__print_hex("); print_args(args->hex.field); printf(", "); print_args(args->hex.size); printf(")"); break; - case PRINT_HEX_STR: + case TEP_PRINT_HEX_STR: printf("__print_hex_str("); print_args(args->hex.field); printf(", "); print_args(args->hex.size); printf(")"); break; - case PRINT_INT_ARRAY: + case TEP_PRINT_INT_ARRAY: printf("__print_array("); print_args(args->int_array.field); printf(", "); @@ -5790,18 +5792,18 @@ static void print_args(struct print_arg *args) print_args(args->int_array.el_size); printf(")"); break; - case PRINT_STRING: - case PRINT_BSTRING: + case TEP_PRINT_STRING: + case TEP_PRINT_BSTRING: printf("__get_str(%s)", args->string.string); break; - case PRINT_BITMASK: + case TEP_PRINT_BITMASK: printf("__get_bitmask(%s)", args->bitmask.bitmask); break; - case PRINT_TYPE: + case TEP_PRINT_TYPE: printf("(%s)", args->typecast.type); print_args(args->typecast.item); break; - case PRINT_OP: + case TEP_PRINT_OP: if (strcmp(args->op.op, ":") == 0) print_paren = 0; if (print_paren) @@ -5833,13 +5835,13 @@ static void parse_header_field(const char *field, save_input_buf_ptr = input_buf_ptr; save_input_buf_siz = input_buf_siz; - if (read_expected(EVENT_ITEM, "field") < 0) + if (read_expected(TEP_EVENT_ITEM, "field") < 0) return; - if (read_expected(EVENT_OP, ":") < 0) + if (read_expected(TEP_EVENT_OP, ":") < 0) return; /* type */ - if (read_expect_type(EVENT_ITEM, &token) < 0) + if (read_expect_type(TEP_EVENT_ITEM, &token) < 0) goto fail; free_token(token); @@ -5847,42 +5849,42 @@ static void parse_header_field(const char *field, * If this is not a mandatory field, then test it first. */ if (mandatory) { - if (read_expected(EVENT_ITEM, field) < 0) + if (read_expected(TEP_EVENT_ITEM, field) < 0) return; } else { - if (read_expect_type(EVENT_ITEM, &token) < 0) + if (read_expect_type(TEP_EVENT_ITEM, &token) < 0) goto fail; if (strcmp(token, field) != 0) goto discard; free_token(token); } - if (read_expected(EVENT_OP, ";") < 0) + if (read_expected(TEP_EVENT_OP, ";") < 0) return; - if (read_expected(EVENT_ITEM, "offset") < 0) + if (read_expected(TEP_EVENT_ITEM, "offset") < 0) return; - if (read_expected(EVENT_OP, ":") < 0) + if (read_expected(TEP_EVENT_OP, ":") < 0) return; - if (read_expect_type(EVENT_ITEM, &token) < 0) + if (read_expect_type(TEP_EVENT_ITEM, &token) < 0) goto fail; *offset = atoi(token); free_token(token); - if (read_expected(EVENT_OP, ";") < 0) + if (read_expected(TEP_EVENT_OP, ";") < 0) return; - if (read_expected(EVENT_ITEM, "size") < 0) + if (read_expected(TEP_EVENT_ITEM, "size") < 0) return; - if (read_expected(EVENT_OP, ":") < 0) + if (read_expected(TEP_EVENT_OP, ":") < 0) return; - if (read_expect_type(EVENT_ITEM, &token) < 0) + if (read_expect_type(TEP_EVENT_ITEM, &token) < 0) goto fail; *size = atoi(token); free_token(token); - if (read_expected(EVENT_OP, ";") < 0) + if (read_expected(TEP_EVENT_OP, ";") < 0) return; type = read_token(&token); - if (type != EVENT_NEWLINE) { + if (type != TEP_EVENT_NEWLINE) { /* newer versions of the kernel have a "signed" type */ - if (type != EVENT_ITEM) + if (type != TEP_EVENT_ITEM) goto fail; if (strcmp(token, "signed") != 0) @@ -5890,17 +5892,17 @@ static void parse_header_field(const char *field, free_token(token); - if (read_expected(EVENT_OP, ":") < 0) + if (read_expected(TEP_EVENT_OP, ":") < 0) return; - if (read_expect_type(EVENT_ITEM, &token)) + if (read_expect_type(TEP_EVENT_ITEM, &token)) goto fail; free_token(token); - if (read_expected(EVENT_OP, ";") < 0) + if (read_expected(TEP_EVENT_OP, ";") < 0) return; - if (read_expect_type(EVENT_NEWLINE, &token)) + if (read_expect_type(TEP_EVENT_NEWLINE, &token)) goto fail; } fail: @@ -5957,7 +5959,7 @@ int tep_parse_header_page(struct tep_handle *pevent, char *buf, unsigned long si return 0; } -static int event_matches(struct event_format *event, +static int event_matches(struct tep_event_format *event, int id, const char *sys_name, const char *event_name) { @@ -5980,7 +5982,7 @@ static void free_handler(struct event_handler *handle) free(handle); } -static int find_event_handle(struct tep_handle *pevent, struct event_format *event) +static int find_event_handle(struct tep_handle *pevent, struct tep_event_format *event) { struct event_handler *handle, **next; @@ -6021,11 +6023,11 @@ static int find_event_handle(struct tep_handle *pevent, struct event_format *eve * * /sys/kernel/debug/tracing/events/.../.../format */ -enum tep_errno __tep_parse_format(struct event_format **eventp, +enum tep_errno __tep_parse_format(struct tep_event_format **eventp, struct tep_handle *pevent, const char *buf, unsigned long size, const char *sys) { - struct event_format *event; + struct tep_event_format *event; int ret; init_input_buf(buf, size); @@ -6042,10 +6044,10 @@ enum tep_errno __tep_parse_format(struct event_format **eventp, } if (strcmp(sys, "ftrace") == 0) { - event->flags |= EVENT_FL_ISFTRACE; + event->flags |= TEP_EVENT_FL_ISFTRACE; if (strcmp(event->name, "bprint") == 0) - event->flags |= EVENT_FL_ISBPRINT; + event->flags |= TEP_EVENT_FL_ISBPRINT; } event->id = event_read_id(); @@ -6088,22 +6090,22 @@ enum tep_errno __tep_parse_format(struct event_format **eventp, goto event_parse_failed; } - if (!ret && (event->flags & EVENT_FL_ISFTRACE)) { - struct format_field *field; - struct print_arg *arg, **list; + if (!ret && (event->flags & TEP_EVENT_FL_ISFTRACE)) { + struct tep_format_field *field; + struct tep_print_arg *arg, **list; /* old ftrace had no args */ list = &event->print_fmt.args; for (field = event->format.fields; field; field = field->next) { arg = alloc_arg(); if (!arg) { - event->flags |= EVENT_FL_FAILED; + event->flags |= TEP_EVENT_FL_FAILED; return TEP_ERRNO__OLD_FTRACE_ARG_FAILED; } - arg->type = PRINT_FIELD; + arg->type = TEP_PRINT_FIELD; arg->field.name = strdup(field->name); if (!arg->field.name) { - event->flags |= EVENT_FL_FAILED; + event->flags |= TEP_EVENT_FL_FAILED; free_arg(arg); return TEP_ERRNO__OLD_FTRACE_ARG_FAILED; } @@ -6117,7 +6119,7 @@ enum tep_errno __tep_parse_format(struct event_format **eventp, return 0; event_parse_failed: - event->flags |= EVENT_FL_FAILED; + event->flags |= TEP_EVENT_FL_FAILED; return ret; event_alloc_failed: @@ -6130,12 +6132,12 @@ enum tep_errno __tep_parse_format(struct event_format **eventp, static enum tep_errno __parse_event(struct tep_handle *pevent, - struct event_format **eventp, + struct tep_event_format **eventp, const char *buf, unsigned long size, const char *sys) { int ret = __tep_parse_format(eventp, pevent, buf, size, sys); - struct event_format *event = *eventp; + struct tep_event_format *event = *eventp; if (event == NULL) return ret; @@ -6172,7 +6174,7 @@ event_add_failed: * /sys/kernel/debug/tracing/events/.../.../format */ enum tep_errno tep_parse_format(struct tep_handle *pevent, - struct event_format **eventp, + struct tep_event_format **eventp, const char *buf, unsigned long size, const char *sys) { @@ -6196,40 +6198,11 @@ enum tep_errno tep_parse_format(struct tep_handle *pevent, enum tep_errno tep_parse_event(struct tep_handle *pevent, const char *buf, unsigned long size, const char *sys) { - struct event_format *event = NULL; + struct tep_event_format *event = NULL; return __parse_event(pevent, &event, buf, size, sys); } -#undef _PE -#define _PE(code, str) str -static const char * const tep_error_str[] = { - TEP_ERRORS -}; -#undef _PE - -int tep_strerror(struct tep_handle *pevent __maybe_unused, - enum tep_errno errnum, char *buf, size_t buflen) -{ - int idx; - const char *msg; - - if (errnum >= 0) { - str_error_r(errnum, buf, buflen); - return 0; - } - - if (errnum <= __TEP_ERRNO__START || - errnum >= __TEP_ERRNO__END) - return -1; - - idx = errnum - __TEP_ERRNO__START - 1; - msg = tep_error_str[idx]; - snprintf(buf, buflen, "%s", msg); - - return 0; -} - -int get_field_val(struct trace_seq *s, struct format_field *field, +int get_field_val(struct trace_seq *s, struct tep_format_field *field, const char *name, struct tep_record *record, unsigned long long *val, int err) { @@ -6262,11 +6235,11 @@ int get_field_val(struct trace_seq *s, struct format_field *field, * * On failure, it returns NULL. */ -void *tep_get_field_raw(struct trace_seq *s, struct event_format *event, +void *tep_get_field_raw(struct trace_seq *s, struct tep_event_format *event, const char *name, struct tep_record *record, int *len, int err) { - struct format_field *field; + struct tep_format_field *field; void *data = record->data; unsigned offset; int dummy; @@ -6287,7 +6260,7 @@ void *tep_get_field_raw(struct trace_seq *s, struct event_format *event, len = &dummy; offset = field->offset; - if (field->flags & FIELD_IS_DYNAMIC) { + if (field->flags & TEP_FIELD_IS_DYNAMIC) { offset = tep_read_number(event->pevent, data + offset, field->size); *len = offset >> 16; @@ -6309,11 +6282,11 @@ void *tep_get_field_raw(struct trace_seq *s, struct event_format *event, * * Returns 0 on success -1 on field not found. */ -int tep_get_field_val(struct trace_seq *s, struct event_format *event, +int tep_get_field_val(struct trace_seq *s, struct tep_event_format *event, const char *name, struct tep_record *record, unsigned long long *val, int err) { - struct format_field *field; + struct tep_format_field *field; if (!event) return -1; @@ -6334,11 +6307,11 @@ int tep_get_field_val(struct trace_seq *s, struct event_format *event, * * Returns 0 on success -1 on field not found. */ -int tep_get_common_field_val(struct trace_seq *s, struct event_format *event, +int tep_get_common_field_val(struct trace_seq *s, struct tep_event_format *event, const char *name, struct tep_record *record, unsigned long long *val, int err) { - struct format_field *field; + struct tep_format_field *field; if (!event) return -1; @@ -6359,11 +6332,11 @@ int tep_get_common_field_val(struct trace_seq *s, struct event_format *event, * * Returns 0 on success -1 on field not found. */ -int tep_get_any_field_val(struct trace_seq *s, struct event_format *event, +int tep_get_any_field_val(struct trace_seq *s, struct tep_event_format *event, const char *name, struct tep_record *record, unsigned long long *val, int err) { - struct format_field *field; + struct tep_format_field *field; if (!event) return -1; @@ -6385,10 +6358,10 @@ int tep_get_any_field_val(struct trace_seq *s, struct event_format *event, * Returns: 0 on success, -1 field not found, or 1 if buffer is full. */ int tep_print_num_field(struct trace_seq *s, const char *fmt, - struct event_format *event, const char *name, + struct tep_event_format *event, const char *name, struct tep_record *record, int err) { - struct format_field *field = tep_find_field(event, name); + struct tep_format_field *field = tep_find_field(event, name); unsigned long long val; if (!field) @@ -6417,10 +6390,10 @@ int tep_print_num_field(struct trace_seq *s, const char *fmt, * Returns: 0 on success, -1 field not found, or 1 if buffer is full. */ int tep_print_func_field(struct trace_seq *s, const char *fmt, - struct event_format *event, const char *name, + struct tep_event_format *event, const char *name, struct tep_record *record, int err) { - struct format_field *field = tep_find_field(event, name); + struct tep_format_field *field = tep_find_field(event, name); struct tep_handle *pevent = event->pevent; unsigned long long val; struct func_map *func; @@ -6577,11 +6550,11 @@ int tep_unregister_print_function(struct tep_handle *pevent, return -1; } -static struct event_format *search_event(struct tep_handle *pevent, int id, +static struct tep_event_format *search_event(struct tep_handle *pevent, int id, const char *sys_name, const char *event_name) { - struct event_format *event; + struct tep_event_format *event; if (id >= 0) { /* search by id */ @@ -6621,7 +6594,7 @@ int tep_register_event_handler(struct tep_handle *pevent, int id, const char *sys_name, const char *event_name, tep_event_handler_func func, void *context) { - struct event_format *event; + struct tep_event_format *event; struct event_handler *handle; event = search_event(pevent, id, sys_name, event_name); @@ -6705,7 +6678,7 @@ int tep_unregister_event_handler(struct tep_handle *pevent, int id, const char *sys_name, const char *event_name, tep_event_handler_func func, void *context) { - struct event_format *event; + struct tep_event_format *event; struct event_handler *handle; struct event_handler **next; @@ -6757,7 +6730,7 @@ void tep_ref(struct tep_handle *pevent) pevent->ref_count++; } -void tep_free_format_field(struct format_field *field) +void tep_free_format_field(struct tep_format_field *field) { free(field->type); if (field->alias != field->name) @@ -6766,9 +6739,9 @@ void tep_free_format_field(struct format_field *field) free(field); } -static void free_format_fields(struct format_field *field) +static void free_format_fields(struct tep_format_field *field) { - struct format_field *next; + struct tep_format_field *next; while (field) { next = field->next; @@ -6777,13 +6750,13 @@ static void free_format_fields(struct format_field *field) } } -static void free_formats(struct format *format) +static void free_formats(struct tep_format *format) { free_format_fields(format->common_fields); free_format_fields(format->fields); } -void tep_free_format(struct event_format *event) +void tep_free_format(struct tep_event_format *event) { free(event->name); free(event->system); diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h index 44b7c2d41f9f..16bf4c890b6f 100644 --- a/tools/lib/traceevent/event-parse.h +++ b/tools/lib/traceevent/event-parse.h @@ -26,17 +26,12 @@ #include #include +#include "trace-seq.h" + #ifndef __maybe_unused #define __maybe_unused __attribute__((unused)) #endif -/* ----------------------- trace_seq ----------------------- */ - - -#ifndef TRACE_SEQ_BUF_SIZE -#define TRACE_SEQ_BUF_SIZE 4096 -#endif - #ifndef DEBUG_RECORD #define DEBUG_RECORD 0 #endif @@ -59,51 +54,14 @@ struct tep_record { #endif }; -enum trace_seq_fail { - TRACE_SEQ__GOOD, - TRACE_SEQ__BUFFER_POISONED, - TRACE_SEQ__MEM_ALLOC_FAILED, -}; - -/* - * Trace sequences are used to allow a function to call several other functions - * to create a string of data to use (up to a max of PAGE_SIZE). - */ - -struct trace_seq { - char *buffer; - unsigned int buffer_size; - unsigned int len; - unsigned int readpos; - enum trace_seq_fail state; -}; - -void trace_seq_init(struct trace_seq *s); -void trace_seq_reset(struct trace_seq *s); -void trace_seq_destroy(struct trace_seq *s); - -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) - __attribute__ ((format (printf, 2, 0))); - -extern int trace_seq_puts(struct trace_seq *s, const char *str); -extern int trace_seq_putc(struct trace_seq *s, unsigned char c); - -extern void trace_seq_terminate(struct trace_seq *s); - -extern int trace_seq_do_fprintf(struct trace_seq *s, FILE *fp); -extern int trace_seq_do_printf(struct trace_seq *s); - - -/* ----------------------- pevent ----------------------- */ +/* ----------------------- tep ----------------------- */ struct tep_handle; -struct event_format; +struct tep_event_format; typedef int (*tep_event_handler_func)(struct trace_seq *s, struct tep_record *record, - struct event_format *event, + struct tep_event_format *event, void *context); typedef int (*tep_plugin_load_func)(struct tep_handle *pevent); @@ -172,20 +130,20 @@ struct tep_plugin_option { #define TEP_PLUGIN_OPTIONS_NAME MAKE_STR(TEP_PLUGIN_OPTIONS) #define TEP_PLUGIN_ALIAS_NAME MAKE_STR(TEP_PLUGIN_ALIAS) -enum format_flags { - FIELD_IS_ARRAY = 1, - FIELD_IS_POINTER = 2, - FIELD_IS_SIGNED = 4, - FIELD_IS_STRING = 8, - FIELD_IS_DYNAMIC = 16, - FIELD_IS_LONG = 32, - FIELD_IS_FLAG = 64, - FIELD_IS_SYMBOLIC = 128, +enum tep_format_flags { + TEP_FIELD_IS_ARRAY = 1, + TEP_FIELD_IS_POINTER = 2, + TEP_FIELD_IS_SIGNED = 4, + TEP_FIELD_IS_STRING = 8, + TEP_FIELD_IS_DYNAMIC = 16, + TEP_FIELD_IS_LONG = 32, + TEP_FIELD_IS_FLAG = 64, + TEP_FIELD_IS_SYMBOLIC = 128, }; -struct format_field { - struct format_field *next; - struct event_format *event; +struct tep_format_field { + struct tep_format_field *next; + struct tep_event_format *event; char *type; char *name; char *alias; @@ -196,169 +154,169 @@ struct format_field { unsigned long flags; }; -struct format { +struct tep_format { int nr_common; int nr_fields; - struct format_field *common_fields; - struct format_field *fields; + struct tep_format_field *common_fields; + struct tep_format_field *fields; }; -struct print_arg_atom { +struct tep_print_arg_atom { char *atom; }; -struct print_arg_string { +struct tep_print_arg_string { char *string; int offset; }; -struct print_arg_bitmask { +struct tep_print_arg_bitmask { char *bitmask; int offset; }; -struct print_arg_field { +struct tep_print_arg_field { char *name; - struct format_field *field; + struct tep_format_field *field; }; -struct print_flag_sym { - struct print_flag_sym *next; - char *value; - char *str; +struct tep_print_flag_sym { + struct tep_print_flag_sym *next; + char *value; + char *str; }; -struct print_arg_typecast { +struct tep_print_arg_typecast { char *type; - struct print_arg *item; + struct tep_print_arg *item; }; -struct print_arg_flags { - struct print_arg *field; - char *delim; - struct print_flag_sym *flags; +struct tep_print_arg_flags { + struct tep_print_arg *field; + char *delim; + struct tep_print_flag_sym *flags; }; -struct print_arg_symbol { - struct print_arg *field; - struct print_flag_sym *symbols; +struct tep_print_arg_symbol { + struct tep_print_arg *field; + struct tep_print_flag_sym *symbols; }; -struct print_arg_hex { - struct print_arg *field; - struct print_arg *size; +struct tep_print_arg_hex { + struct tep_print_arg *field; + struct tep_print_arg *size; }; -struct print_arg_int_array { - struct print_arg *field; - struct print_arg *count; - struct print_arg *el_size; +struct tep_print_arg_int_array { + struct tep_print_arg *field; + struct tep_print_arg *count; + struct tep_print_arg *el_size; }; -struct print_arg_dynarray { - struct format_field *field; - struct print_arg *index; +struct tep_print_arg_dynarray { + struct tep_format_field *field; + struct tep_print_arg *index; }; -struct print_arg; +struct tep_print_arg; -struct print_arg_op { +struct tep_print_arg_op { char *op; int prio; - struct print_arg *left; - struct print_arg *right; + struct tep_print_arg *left; + struct tep_print_arg *right; }; struct tep_function_handler; -struct print_arg_func { +struct tep_print_arg_func { struct tep_function_handler *func; - struct print_arg *args; -}; - -enum print_arg_type { - PRINT_NULL, - PRINT_ATOM, - PRINT_FIELD, - PRINT_FLAGS, - PRINT_SYMBOL, - PRINT_HEX, - PRINT_INT_ARRAY, - PRINT_TYPE, - PRINT_STRING, - PRINT_BSTRING, - PRINT_DYNAMIC_ARRAY, - PRINT_OP, - PRINT_FUNC, - PRINT_BITMASK, - PRINT_DYNAMIC_ARRAY_LEN, - PRINT_HEX_STR, -}; - -struct print_arg { - struct print_arg *next; - enum print_arg_type type; + struct tep_print_arg *args; +}; + +enum tep_print_arg_type { + TEP_PRINT_NULL, + TEP_PRINT_ATOM, + TEP_PRINT_FIELD, + TEP_PRINT_FLAGS, + TEP_PRINT_SYMBOL, + TEP_PRINT_HEX, + TEP_PRINT_INT_ARRAY, + TEP_PRINT_TYPE, + TEP_PRINT_STRING, + TEP_PRINT_BSTRING, + TEP_PRINT_DYNAMIC_ARRAY, + TEP_PRINT_OP, + TEP_PRINT_FUNC, + TEP_PRINT_BITMASK, + TEP_PRINT_DYNAMIC_ARRAY_LEN, + TEP_PRINT_HEX_STR, +}; + +struct tep_print_arg { + struct tep_print_arg *next; + enum tep_print_arg_type type; union { - struct print_arg_atom atom; - struct print_arg_field field; - struct print_arg_typecast typecast; - struct print_arg_flags flags; - struct print_arg_symbol symbol; - struct print_arg_hex hex; - struct print_arg_int_array int_array; - struct print_arg_func func; - struct print_arg_string string; - struct print_arg_bitmask bitmask; - struct print_arg_op op; - struct print_arg_dynarray dynarray; + struct tep_print_arg_atom atom; + struct tep_print_arg_field field; + struct tep_print_arg_typecast typecast; + struct tep_print_arg_flags flags; + struct tep_print_arg_symbol symbol; + struct tep_print_arg_hex hex; + struct tep_print_arg_int_array int_array; + struct tep_print_arg_func func; + struct tep_print_arg_string string; + struct tep_print_arg_bitmask bitmask; + struct tep_print_arg_op op; + struct tep_print_arg_dynarray dynarray; }; }; -struct print_fmt { +struct tep_print_fmt { char *format; - struct print_arg *args; + struct tep_print_arg *args; }; -struct event_format { +struct tep_event_format { struct tep_handle *pevent; char *name; int id; int flags; - struct format format; - struct print_fmt print_fmt; + struct tep_format format; + struct tep_print_fmt print_fmt; char *system; tep_event_handler_func handler; void *context; }; enum { - EVENT_FL_ISFTRACE = 0x01, - EVENT_FL_ISPRINT = 0x02, - EVENT_FL_ISBPRINT = 0x04, - EVENT_FL_ISFUNCENT = 0x10, - EVENT_FL_ISFUNCRET = 0x20, - EVENT_FL_NOHANDLE = 0x40, - EVENT_FL_PRINTRAW = 0x80, + TEP_EVENT_FL_ISFTRACE = 0x01, + TEP_EVENT_FL_ISPRINT = 0x02, + TEP_EVENT_FL_ISBPRINT = 0x04, + TEP_EVENT_FL_ISFUNCENT = 0x10, + TEP_EVENT_FL_ISFUNCRET = 0x20, + TEP_EVENT_FL_NOHANDLE = 0x40, + TEP_EVENT_FL_PRINTRAW = 0x80, - EVENT_FL_FAILED = 0x80000000 + TEP_EVENT_FL_FAILED = 0x80000000 }; -enum event_sort_type { - EVENT_SORT_ID, - EVENT_SORT_NAME, - EVENT_SORT_SYSTEM, +enum tep_event_sort_type { + TEP_EVENT_SORT_ID, + TEP_EVENT_SORT_NAME, + TEP_EVENT_SORT_SYSTEM, }; -enum event_type { - EVENT_ERROR, - EVENT_NONE, - EVENT_SPACE, - EVENT_NEWLINE, - EVENT_OP, - EVENT_DELIM, - EVENT_ITEM, - EVENT_DQUOTE, - EVENT_SQUOTE, +enum tep_event_type { + TEP_EVENT_ERROR, + TEP_EVENT_NONE, + TEP_EVENT_SPACE, + TEP_EVENT_NEWLINE, + TEP_EVENT_OP, + TEP_EVENT_DELIM, + TEP_EVENT_ITEM, + TEP_EVENT_DQUOTE, + TEP_EVENT_SQUOTE, }; typedef unsigned long long (*tep_func_handler)(struct trace_seq *s, @@ -431,12 +389,12 @@ enum tep_errno { }; #undef _PE -struct plugin_list; +struct tep_plugin_list; #define INVALID_PLUGIN_LIST_OPTION ((char **)((unsigned long)-1)) -struct plugin_list *tep_load_plugins(struct tep_handle *pevent); -void tep_unload_plugins(struct plugin_list *plugin_list, +struct tep_plugin_list *tep_load_plugins(struct tep_handle *pevent); +void tep_unload_plugins(struct tep_plugin_list *plugin_list, struct tep_handle *pevent); char **tep_plugin_list_options(void); void tep_plugin_free_options_list(char **list); @@ -445,156 +403,25 @@ int tep_plugin_add_options(const char *name, void tep_plugin_remove_options(struct tep_plugin_option *options); void tep_print_plugins(struct trace_seq *s, const char *prefix, const char *suffix, - const struct plugin_list *list); - -struct cmdline; -struct cmdline_list; -struct func_map; -struct func_list; -struct event_handler; -struct func_resolver; + const struct tep_plugin_list *list); +/* tep_handle */ typedef char *(tep_func_resolver_t)(void *priv, unsigned long long *addrp, char **modp); +void tep_set_flag(struct tep_handle *tep, int flag); +unsigned short __tep_data2host2(struct tep_handle *pevent, unsigned short data); +unsigned int __tep_data2host4(struct tep_handle *pevent, unsigned int data); +unsigned long long +__tep_data2host8(struct tep_handle *pevent, unsigned long long data); -struct tep_handle { - int ref_count; - - int header_page_ts_offset; - int header_page_ts_size; - int header_page_size_offset; - int header_page_size_size; - int header_page_data_offset; - int header_page_data_size; - int header_page_overwrite; - - int file_bigendian; - int host_bigendian; - - int latency_format; - - int old_format; - - int cpus; - int long_size; - int page_size; - - struct cmdline *cmdlines; - struct cmdline_list *cmdlist; - int cmdline_count; - - struct func_map *func_map; - struct func_resolver *func_resolver; - struct func_list *funclist; - unsigned int func_count; - - struct printk_map *printk_map; - struct printk_list *printklist; - unsigned int printk_count; - - - struct event_format **events; - int nr_events; - struct event_format **sort_events; - enum event_sort_type last_type; - - int type_offset; - int type_size; - - int pid_offset; - int pid_size; - - int pc_offset; - int pc_size; - - int flags_offset; - int flags_size; - - int ld_offset; - int ld_size; - - int print_raw; - - int test_filters; - - int flags; - - struct format_field *bprint_ip_field; - struct format_field *bprint_fmt_field; - struct format_field *bprint_buf_field; - - struct event_handler *handlers; - struct tep_function_handler *func_handlers; - - /* cache */ - struct event_format *last_event; - - char *trace_clock; -}; - -static inline void tep_set_flag(struct tep_handle *pevent, int flag) -{ - pevent->flags |= flag; -} - -static inline unsigned short -__data2host2(struct tep_handle *pevent, unsigned short data) -{ - unsigned short swap; - - if (pevent->host_bigendian == pevent->file_bigendian) - return data; - - swap = ((data & 0xffULL) << 8) | - ((data & (0xffULL << 8)) >> 8); - - return swap; -} - -static inline unsigned int -__data2host4(struct tep_handle *pevent, unsigned int data) -{ - unsigned int swap; - - if (pevent->host_bigendian == pevent->file_bigendian) - return data; - - swap = ((data & 0xffULL) << 24) | - ((data & (0xffULL << 8)) << 8) | - ((data & (0xffULL << 16)) >> 8) | - ((data & (0xffULL << 24)) >> 24); - - return swap; -} - -static inline unsigned long long -__data2host8(struct tep_handle *pevent, unsigned long long data) -{ - unsigned long long swap; - - if (pevent->host_bigendian == pevent->file_bigendian) - return data; - - swap = ((data & 0xffULL) << 56) | - ((data & (0xffULL << 8)) << 40) | - ((data & (0xffULL << 16)) << 24) | - ((data & (0xffULL << 24)) << 8) | - ((data & (0xffULL << 32)) >> 8) | - ((data & (0xffULL << 40)) >> 24) | - ((data & (0xffULL << 48)) >> 40) | - ((data & (0xffULL << 56)) >> 56); - - return swap; -} - -#define data2host2(pevent, ptr) __data2host2(pevent, *(unsigned short *)(ptr)) -#define data2host4(pevent, ptr) __data2host4(pevent, *(unsigned int *)(ptr)) -#define data2host8(pevent, ptr) \ +#define tep_data2host2(pevent, ptr) __tep_data2host2(pevent, *(unsigned short *)(ptr)) +#define tep_data2host4(pevent, ptr) __tep_data2host4(pevent, *(unsigned int *)(ptr)) +#define tep_data2host8(pevent, ptr) \ ({ \ unsigned long long __val; \ \ memcpy(&__val, (ptr), sizeof(unsigned long long)); \ - __data2host8(pevent, __val); \ + __tep_data2host8(pevent, __val); \ }) static inline int tep_host_bigendian(void) @@ -627,14 +454,14 @@ int tep_register_print_string(struct tep_handle *pevent, const char *fmt, int tep_pid_is_registered(struct tep_handle *pevent, int pid); void tep_print_event_task(struct tep_handle *pevent, struct trace_seq *s, - struct event_format *event, + struct tep_event_format *event, struct tep_record *record); void tep_print_event_time(struct tep_handle *pevent, struct trace_seq *s, - struct event_format *event, + struct tep_event_format *event, struct tep_record *record, bool use_trace_clock); void tep_print_event_data(struct tep_handle *pevent, struct trace_seq *s, - struct event_format *event, + struct tep_event_format *event, struct tep_record *record); void tep_print_event(struct tep_handle *pevent, struct trace_seq *s, struct tep_record *record, bool use_trace_clock); @@ -645,32 +472,32 @@ int tep_parse_header_page(struct tep_handle *pevent, char *buf, unsigned long si enum tep_errno tep_parse_event(struct tep_handle *pevent, const char *buf, unsigned long size, const char *sys); enum tep_errno tep_parse_format(struct tep_handle *pevent, - struct event_format **eventp, + struct tep_event_format **eventp, const char *buf, unsigned long size, const char *sys); -void tep_free_format(struct event_format *event); -void tep_free_format_field(struct format_field *field); +void tep_free_format(struct tep_event_format *event); +void tep_free_format_field(struct tep_format_field *field); -void *tep_get_field_raw(struct trace_seq *s, struct event_format *event, +void *tep_get_field_raw(struct trace_seq *s, struct tep_event_format *event, const char *name, struct tep_record *record, int *len, int err); -int tep_get_field_val(struct trace_seq *s, struct event_format *event, +int tep_get_field_val(struct trace_seq *s, struct tep_event_format *event, const char *name, struct tep_record *record, unsigned long long *val, int err); -int tep_get_common_field_val(struct trace_seq *s, struct event_format *event, +int tep_get_common_field_val(struct trace_seq *s, struct tep_event_format *event, const char *name, struct tep_record *record, unsigned long long *val, int err); -int tep_get_any_field_val(struct trace_seq *s, struct event_format *event, +int tep_get_any_field_val(struct trace_seq *s, struct tep_event_format *event, const char *name, struct tep_record *record, unsigned long long *val, int err); int tep_print_num_field(struct trace_seq *s, const char *fmt, - struct event_format *event, const char *name, - struct tep_record *record, int err); + struct tep_event_format *event, const char *name, + struct tep_record *record, int err); int tep_print_func_field(struct trace_seq *s, const char *fmt, - struct event_format *event, const char *name, + struct tep_event_format *event, const char *name, struct tep_record *record, int err); int tep_register_event_handler(struct tep_handle *pevent, int id, @@ -686,29 +513,30 @@ int tep_register_print_function(struct tep_handle *pevent, int tep_unregister_print_function(struct tep_handle *pevent, tep_func_handler func, char *name); -struct format_field *tep_find_common_field(struct event_format *event, const char *name); -struct format_field *tep_find_field(struct event_format *event, const char *name); -struct format_field *tep_find_any_field(struct event_format *event, const char *name); +struct tep_format_field *tep_find_common_field(struct tep_event_format *event, const char *name); +struct tep_format_field *tep_find_field(struct tep_event_format *event, const char *name); +struct tep_format_field *tep_find_any_field(struct tep_event_format *event, const char *name); const char *tep_find_function(struct tep_handle *pevent, unsigned long long addr); unsigned long long tep_find_function_address(struct tep_handle *pevent, unsigned long long addr); unsigned long long tep_read_number(struct tep_handle *pevent, const void *ptr, int size); -int tep_read_number_field(struct format_field *field, const void *data, +int tep_read_number_field(struct tep_format_field *field, const void *data, unsigned long long *value); -struct event_format *tep_find_event(struct tep_handle *pevent, int id); +struct tep_event_format *tep_get_first_event(struct tep_handle *tep); +int tep_get_events_count(struct tep_handle *tep); +struct tep_event_format *tep_find_event(struct tep_handle *pevent, int id); -struct event_format * +struct tep_event_format * tep_find_event_by_name(struct tep_handle *pevent, const char *sys, const char *name); - -struct event_format * +struct tep_event_format * tep_find_event_by_record(struct tep_handle *pevent, struct tep_record *record); void tep_data_lat_fmt(struct tep_handle *pevent, struct trace_seq *s, struct tep_record *record); int tep_data_type(struct tep_handle *pevent, struct tep_record *rec); -struct event_format *tep_data_event_from_type(struct tep_handle *pevent, int type); +struct tep_event_format *tep_data_event_from_type(struct tep_handle *pevent, int type); int tep_data_pid(struct tep_handle *pevent, struct tep_record *rec); int tep_data_preempt_count(struct tep_handle *pevent, struct tep_record *rec); int tep_data_flags(struct tep_handle *pevent, struct tep_record *rec); @@ -719,77 +547,35 @@ struct cmdline *tep_data_pid_from_comm(struct tep_handle *pevent, const char *co int tep_cmdline_pid(struct tep_handle *pevent, struct cmdline *cmdline); void tep_print_field(struct trace_seq *s, void *data, - struct format_field *field); + struct tep_format_field *field); void tep_print_fields(struct trace_seq *s, void *data, - int size __maybe_unused, struct event_format *event); -void tep_event_info(struct trace_seq *s, struct event_format *event, + int size __maybe_unused, struct tep_event_format *event); +void tep_event_info(struct trace_seq *s, struct tep_event_format *event, struct tep_record *record); int tep_strerror(struct tep_handle *pevent, enum tep_errno errnum, char *buf, size_t buflen); -struct event_format **tep_list_events(struct tep_handle *pevent, enum event_sort_type); -struct format_field **tep_event_common_fields(struct event_format *event); -struct format_field **tep_event_fields(struct event_format *event); - -static inline int tep_get_cpus(struct tep_handle *pevent) -{ - return pevent->cpus; -} - -static inline void tep_set_cpus(struct tep_handle *pevent, int cpus) -{ - pevent->cpus = cpus; -} - -static inline int tep_get_long_size(struct tep_handle *pevent) -{ - return pevent->long_size; -} - -static inline void tep_set_long_size(struct tep_handle *pevent, int long_size) -{ - pevent->long_size = long_size; -} - -static inline int tep_get_page_size(struct tep_handle *pevent) -{ - return pevent->page_size; -} - -static inline void tep_set_page_size(struct tep_handle *pevent, int _page_size) -{ - pevent->page_size = _page_size; -} - -static inline int tep_is_file_bigendian(struct tep_handle *pevent) -{ - return pevent->file_bigendian; -} - -static inline void tep_set_file_bigendian(struct tep_handle *pevent, int endian) -{ - pevent->file_bigendian = endian; -} - -static inline int tep_is_host_bigendian(struct tep_handle *pevent) -{ - return pevent->host_bigendian; -} - -static inline void tep_set_host_bigendian(struct tep_handle *pevent, int endian) -{ - pevent->host_bigendian = endian; -} - -static inline int tep_is_latency_format(struct tep_handle *pevent) -{ - return pevent->latency_format; -} - -static inline void tep_set_latency_format(struct tep_handle *pevent, int lat) -{ - pevent->latency_format = lat; -} +struct tep_event_format **tep_list_events(struct tep_handle *pevent, enum tep_event_sort_type); +struct tep_format_field **tep_event_common_fields(struct tep_event_format *event); +struct tep_format_field **tep_event_fields(struct tep_event_format *event); + +enum tep_endian { + TEP_LITTLE_ENDIAN = 0, + TEP_BIG_ENDIAN +}; +int tep_get_cpus(struct tep_handle *pevent); +void tep_set_cpus(struct tep_handle *pevent, int cpus); +int tep_get_long_size(struct tep_handle *pevent); +void tep_set_long_size(struct tep_handle *pevent, int long_size); +int tep_get_page_size(struct tep_handle *pevent); +void tep_set_page_size(struct tep_handle *pevent, int _page_size); +int tep_is_file_bigendian(struct tep_handle *pevent); +void tep_set_file_bigendian(struct tep_handle *pevent, enum tep_endian endian); +int tep_is_host_bigendian(struct tep_handle *pevent); +void tep_set_host_bigendian(struct tep_handle *pevent, enum tep_endian endian); +int tep_is_latency_format(struct tep_handle *pevent); +void tep_set_latency_format(struct tep_handle *pevent, int lat); +int tep_get_header_page_size(struct tep_handle *pevent); struct tep_handle *tep_alloc(void); void tep_free(struct tep_handle *pevent); @@ -798,7 +584,7 @@ void tep_unref(struct tep_handle *pevent); /* access to the internal parser */ void tep_buffer_init(const char *buf, unsigned long long size); -enum event_type tep_read_token(char **tok); +enum tep_event_type tep_read_token(char **tok); void tep_free_token(char *token); int tep_peek_char(void); const char *tep_get_input_buf(void); @@ -810,136 +596,136 @@ void tep_print_printk(struct tep_handle *pevent); /* ----------------------- filtering ----------------------- */ -enum filter_boolean_type { - FILTER_FALSE, - FILTER_TRUE, +enum tep_filter_boolean_type { + TEP_FILTER_FALSE, + TEP_FILTER_TRUE, }; -enum filter_op_type { - FILTER_OP_AND = 1, - FILTER_OP_OR, - FILTER_OP_NOT, +enum tep_filter_op_type { + TEP_FILTER_OP_AND = 1, + TEP_FILTER_OP_OR, + TEP_FILTER_OP_NOT, }; -enum filter_cmp_type { - FILTER_CMP_NONE, - FILTER_CMP_EQ, - FILTER_CMP_NE, - FILTER_CMP_GT, - FILTER_CMP_LT, - FILTER_CMP_GE, - FILTER_CMP_LE, - FILTER_CMP_MATCH, - FILTER_CMP_NOT_MATCH, - FILTER_CMP_REGEX, - FILTER_CMP_NOT_REGEX, +enum tep_filter_cmp_type { + TEP_FILTER_CMP_NONE, + TEP_FILTER_CMP_EQ, + TEP_FILTER_CMP_NE, + TEP_FILTER_CMP_GT, + TEP_FILTER_CMP_LT, + TEP_FILTER_CMP_GE, + TEP_FILTER_CMP_LE, + TEP_FILTER_CMP_MATCH, + TEP_FILTER_CMP_NOT_MATCH, + TEP_FILTER_CMP_REGEX, + TEP_FILTER_CMP_NOT_REGEX, }; -enum filter_exp_type { - FILTER_EXP_NONE, - FILTER_EXP_ADD, - FILTER_EXP_SUB, - FILTER_EXP_MUL, - FILTER_EXP_DIV, - FILTER_EXP_MOD, - FILTER_EXP_RSHIFT, - FILTER_EXP_LSHIFT, - FILTER_EXP_AND, - FILTER_EXP_OR, - FILTER_EXP_XOR, - FILTER_EXP_NOT, +enum tep_filter_exp_type { + TEP_FILTER_EXP_NONE, + TEP_FILTER_EXP_ADD, + TEP_FILTER_EXP_SUB, + TEP_FILTER_EXP_MUL, + TEP_FILTER_EXP_DIV, + TEP_FILTER_EXP_MOD, + TEP_FILTER_EXP_RSHIFT, + TEP_FILTER_EXP_LSHIFT, + TEP_FILTER_EXP_AND, + TEP_FILTER_EXP_OR, + TEP_FILTER_EXP_XOR, + TEP_FILTER_EXP_NOT, }; -enum filter_arg_type { - FILTER_ARG_NONE, - FILTER_ARG_BOOLEAN, - FILTER_ARG_VALUE, - FILTER_ARG_FIELD, - FILTER_ARG_EXP, - FILTER_ARG_OP, - FILTER_ARG_NUM, - FILTER_ARG_STR, +enum tep_filter_arg_type { + TEP_FILTER_ARG_NONE, + TEP_FILTER_ARG_BOOLEAN, + TEP_FILTER_ARG_VALUE, + TEP_FILTER_ARG_FIELD, + TEP_FILTER_ARG_EXP, + TEP_FILTER_ARG_OP, + TEP_FILTER_ARG_NUM, + TEP_FILTER_ARG_STR, }; -enum filter_value_type { - FILTER_NUMBER, - FILTER_STRING, - FILTER_CHAR +enum tep_filter_value_type { + TEP_FILTER_NUMBER, + TEP_FILTER_STRING, + TEP_FILTER_CHAR }; -struct fliter_arg; +struct tep_filter_arg; -struct filter_arg_boolean { - enum filter_boolean_type value; +struct tep_filter_arg_boolean { + enum tep_filter_boolean_type value; }; -struct filter_arg_field { - struct format_field *field; +struct tep_filter_arg_field { + struct tep_format_field *field; }; -struct filter_arg_value { - enum filter_value_type type; +struct tep_filter_arg_value { + enum tep_filter_value_type type; union { char *str; unsigned long long val; }; }; -struct filter_arg_op { - enum filter_op_type type; - struct filter_arg *left; - struct filter_arg *right; +struct tep_filter_arg_op { + enum tep_filter_op_type type; + struct tep_filter_arg *left; + struct tep_filter_arg *right; }; -struct filter_arg_exp { - enum filter_exp_type type; - struct filter_arg *left; - struct filter_arg *right; +struct tep_filter_arg_exp { + enum tep_filter_exp_type type; + struct tep_filter_arg *left; + struct tep_filter_arg *right; }; -struct filter_arg_num { - enum filter_cmp_type type; - struct filter_arg *left; - struct filter_arg *right; +struct tep_filter_arg_num { + enum tep_filter_cmp_type type; + struct tep_filter_arg *left; + struct tep_filter_arg *right; }; -struct filter_arg_str { - enum filter_cmp_type type; - struct format_field *field; - char *val; - char *buffer; - regex_t reg; +struct tep_filter_arg_str { + enum tep_filter_cmp_type type; + struct tep_format_field *field; + char *val; + char *buffer; + regex_t reg; }; -struct filter_arg { - enum filter_arg_type type; +struct tep_filter_arg { + enum tep_filter_arg_type type; union { - struct filter_arg_boolean boolean; - struct filter_arg_field field; - struct filter_arg_value value; - struct filter_arg_op op; - struct filter_arg_exp exp; - struct filter_arg_num num; - struct filter_arg_str str; + struct tep_filter_arg_boolean boolean; + struct tep_filter_arg_field field; + struct tep_filter_arg_value value; + struct tep_filter_arg_op op; + struct tep_filter_arg_exp exp; + struct tep_filter_arg_num num; + struct tep_filter_arg_str str; }; }; -struct filter_type { +struct tep_filter_type { int event_id; - struct event_format *event; - struct filter_arg *filter; + struct tep_event_format *event; + struct tep_filter_arg *filter; }; #define TEP_FILTER_ERROR_BUFSZ 1024 -struct event_filter { +struct tep_event_filter { struct tep_handle *pevent; int filters; - struct filter_type *event_filters; + struct tep_filter_type *event_filters; char error_buffer[TEP_FILTER_ERROR_BUFSZ]; }; -struct event_filter *tep_filter_alloc(struct tep_handle *pevent); +struct tep_event_filter *tep_filter_alloc(struct tep_handle *pevent); /* for backward compatibility */ #define FILTER_NONE TEP_ERRNO__NO_FILTER @@ -947,45 +733,45 @@ struct event_filter *tep_filter_alloc(struct tep_handle *pevent); #define FILTER_MISS TEP_ERRNO__FILTER_MISS #define FILTER_MATCH TEP_ERRNO__FILTER_MATCH -enum filter_trivial_type { - FILTER_TRIVIAL_FALSE, - FILTER_TRIVIAL_TRUE, - FILTER_TRIVIAL_BOTH, +enum tep_filter_trivial_type { + TEP_FILTER_TRIVIAL_FALSE, + TEP_FILTER_TRIVIAL_TRUE, + TEP_FILTER_TRIVIAL_BOTH, }; -enum tep_errno tep_filter_add_filter_str(struct event_filter *filter, +enum tep_errno tep_filter_add_filter_str(struct tep_event_filter *filter, const char *filter_str); -enum tep_errno tep_filter_match(struct event_filter *filter, +enum tep_errno tep_filter_match(struct tep_event_filter *filter, struct tep_record *record); -int tep_filter_strerror(struct event_filter *filter, enum tep_errno err, +int tep_filter_strerror(struct tep_event_filter *filter, enum tep_errno err, char *buf, size_t buflen); -int tep_event_filtered(struct event_filter *filter, +int tep_event_filtered(struct tep_event_filter *filter, int event_id); -void tep_filter_reset(struct event_filter *filter); +void tep_filter_reset(struct tep_event_filter *filter); -int tep_filter_clear_trivial(struct event_filter *filter, - enum filter_trivial_type type); +int tep_filter_clear_trivial(struct tep_event_filter *filter, + enum tep_filter_trivial_type type); -void tep_filter_free(struct event_filter *filter); +void tep_filter_free(struct tep_event_filter *filter); -char *tep_filter_make_string(struct event_filter *filter, int event_id); +char *tep_filter_make_string(struct tep_event_filter *filter, int event_id); -int tep_filter_remove_event(struct event_filter *filter, +int tep_filter_remove_event(struct tep_event_filter *filter, int event_id); -int tep_filter_event_has_trivial(struct event_filter *filter, +int tep_filter_event_has_trivial(struct tep_event_filter *filter, int event_id, - enum filter_trivial_type type); + enum tep_filter_trivial_type type); -int tep_filter_copy(struct event_filter *dest, struct event_filter *source); +int tep_filter_copy(struct tep_event_filter *dest, struct tep_event_filter *source); -int tep_update_trivial(struct event_filter *dest, struct event_filter *source, - enum filter_trivial_type type); +int tep_update_trivial(struct tep_event_filter *dest, struct tep_event_filter *source, + enum tep_filter_trivial_type type); -int tep_filter_compare(struct event_filter *filter1, struct event_filter *filter2); +int tep_filter_compare(struct tep_event_filter *filter1, struct tep_event_filter *filter2); #endif /* _PARSE_EVENTS_H */ diff --git a/tools/lib/traceevent/event-plugin.c b/tools/lib/traceevent/event-plugin.c index f17e25097e1e..e74f16c88398 100644 --- a/tools/lib/traceevent/event-plugin.c +++ b/tools/lib/traceevent/event-plugin.c @@ -14,7 +14,9 @@ #include #include #include "event-parse.h" +#include "event-parse-local.h" #include "event-utils.h" +#include "trace-seq.h" #define LOCAL_PLUGIN_DIR ".traceevent/plugins" @@ -30,8 +32,8 @@ static struct trace_plugin_options { char *value; } *trace_plugin_options; -struct plugin_list { - struct plugin_list *next; +struct tep_plugin_list { + struct tep_plugin_list *next; char *name; void *handle; }; @@ -258,7 +260,7 @@ void tep_plugin_remove_options(struct tep_plugin_option *options) */ void tep_print_plugins(struct trace_seq *s, const char *prefix, const char *suffix, - const struct plugin_list *list) + const struct tep_plugin_list *list) { while (list) { trace_seq_printf(s, "%s%s%s", prefix, list->name, suffix); @@ -270,9 +272,9 @@ static void load_plugin(struct tep_handle *pevent, const char *path, const char *file, void *data) { - struct plugin_list **plugin_list = data; + struct tep_plugin_list **plugin_list = data; tep_plugin_load_func func; - struct plugin_list *list; + struct tep_plugin_list *list; const char *alias; char *plugin; void *handle; @@ -416,20 +418,20 @@ load_plugins(struct tep_handle *pevent, const char *suffix, free(path); } -struct plugin_list* +struct tep_plugin_list* tep_load_plugins(struct tep_handle *pevent) { - struct plugin_list *list = NULL; + struct tep_plugin_list *list = NULL; load_plugins(pevent, ".so", load_plugin, &list); return list; } void -tep_unload_plugins(struct plugin_list *plugin_list, struct tep_handle *pevent) +tep_unload_plugins(struct tep_plugin_list *plugin_list, struct tep_handle *pevent) { tep_plugin_unload_func func; - struct plugin_list *list; + struct tep_plugin_list *list; while (plugin_list) { list = plugin_list; diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index e76154c02ee7..ed87cb56713d 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -11,22 +11,23 @@ #include #include "event-parse.h" +#include "event-parse-local.h" #include "event-utils.h" #define COMM "COMM" #define CPU "CPU" -static struct format_field comm = { +static struct tep_format_field comm = { .name = "COMM", }; -static struct format_field cpu = { +static struct tep_format_field cpu = { .name = "CPU", }; struct event_list { struct event_list *next; - struct event_format *event; + struct tep_event_format *event; }; static void show_error(char *error_buf, const char *fmt, ...) @@ -61,15 +62,15 @@ static void free_token(char *token) tep_free_token(token); } -static enum event_type read_token(char **tok) +static enum tep_event_type read_token(char **tok) { - enum event_type type; + enum tep_event_type type; char *token = NULL; do { free_token(token); type = tep_read_token(&token); - } while (type == EVENT_NEWLINE || type == EVENT_SPACE); + } while (type == TEP_EVENT_NEWLINE || type == TEP_EVENT_SPACE); /* If token is = or ! check to see if the next char is ~ */ if (token && @@ -79,7 +80,7 @@ static enum event_type read_token(char **tok) *tok = malloc(3); if (*tok == NULL) { free_token(token); - return EVENT_ERROR; + return TEP_EVENT_ERROR; } sprintf(*tok, "%c%c", *token, '~'); free_token(token); @@ -94,8 +95,8 @@ static enum event_type read_token(char **tok) static int filter_cmp(const void *a, const void *b) { - const struct filter_type *ea = a; - const struct filter_type *eb = b; + const struct tep_filter_type *ea = a; + const struct tep_filter_type *eb = b; if (ea->event_id < eb->event_id) return -1; @@ -106,11 +107,11 @@ static int filter_cmp(const void *a, const void *b) return 0; } -static struct filter_type * -find_filter_type(struct event_filter *filter, int id) +static struct tep_filter_type * +find_filter_type(struct tep_event_filter *filter, int id) { - struct filter_type *filter_type; - struct filter_type key; + struct tep_filter_type *filter_type; + struct tep_filter_type key; key.event_id = id; @@ -122,10 +123,10 @@ find_filter_type(struct event_filter *filter, int id) return filter_type; } -static struct filter_type * -add_filter_type(struct event_filter *filter, int id) +static struct tep_filter_type * +add_filter_type(struct tep_event_filter *filter, int id) { - struct filter_type *filter_type; + struct tep_filter_type *filter_type; int i; filter_type = find_filter_type(filter, id); @@ -165,9 +166,9 @@ add_filter_type(struct event_filter *filter, int id) * tep_filter_alloc - create a new event filter * @pevent: The pevent that this filter is associated with */ -struct event_filter *tep_filter_alloc(struct tep_handle *pevent) +struct tep_event_filter *tep_filter_alloc(struct tep_handle *pevent) { - struct event_filter *filter; + struct tep_event_filter *filter; filter = malloc(sizeof(*filter)); if (filter == NULL) @@ -180,44 +181,44 @@ struct event_filter *tep_filter_alloc(struct tep_handle *pevent) return filter; } -static struct filter_arg *allocate_arg(void) +static struct tep_filter_arg *allocate_arg(void) { - return calloc(1, sizeof(struct filter_arg)); + return calloc(1, sizeof(struct tep_filter_arg)); } -static void free_arg(struct filter_arg *arg) +static void free_arg(struct tep_filter_arg *arg) { if (!arg) return; switch (arg->type) { - case FILTER_ARG_NONE: - case FILTER_ARG_BOOLEAN: + case TEP_FILTER_ARG_NONE: + case TEP_FILTER_ARG_BOOLEAN: break; - case FILTER_ARG_NUM: + case TEP_FILTER_ARG_NUM: free_arg(arg->num.left); free_arg(arg->num.right); break; - case FILTER_ARG_EXP: + case TEP_FILTER_ARG_EXP: free_arg(arg->exp.left); free_arg(arg->exp.right); break; - case FILTER_ARG_STR: + case TEP_FILTER_ARG_STR: free(arg->str.val); regfree(&arg->str.reg); free(arg->str.buffer); break; - case FILTER_ARG_VALUE: - if (arg->value.type == FILTER_STRING || - arg->value.type == FILTER_CHAR) + case TEP_FILTER_ARG_VALUE: + if (arg->value.type == TEP_FILTER_STRING || + arg->value.type == TEP_FILTER_CHAR) free(arg->value.str); break; - case FILTER_ARG_OP: + case TEP_FILTER_ARG_OP: free_arg(arg->op.left); free_arg(arg->op.right); default: @@ -228,7 +229,7 @@ static void free_arg(struct filter_arg *arg) } static int add_event(struct event_list **events, - struct event_format *event) + struct tep_event_format *event) { struct event_list *list; @@ -242,7 +243,7 @@ static int add_event(struct event_list **events, return 0; } -static int event_match(struct event_format *event, +static int event_match(struct tep_event_format *event, regex_t *sreg, regex_t *ereg) { if (sreg) { @@ -258,7 +259,7 @@ static enum tep_errno find_event(struct tep_handle *pevent, struct event_list **events, char *sys_name, char *event_name) { - struct event_format *event; + struct tep_event_format *event; regex_t ereg; regex_t sreg; int match = 0; @@ -333,11 +334,11 @@ static void free_events(struct event_list *events) } static enum tep_errno -create_arg_item(struct event_format *event, const char *token, - enum event_type type, struct filter_arg **parg, char *error_str) +create_arg_item(struct tep_event_format *event, const char *token, + enum tep_event_type type, struct tep_filter_arg **parg, char *error_str) { - struct format_field *field; - struct filter_arg *arg; + struct tep_format_field *field; + struct tep_filter_arg *arg; arg = allocate_arg(); if (arg == NULL) { @@ -347,11 +348,11 @@ create_arg_item(struct event_format *event, const char *token, switch (type) { - case EVENT_SQUOTE: - case EVENT_DQUOTE: - arg->type = FILTER_ARG_VALUE; + case TEP_EVENT_SQUOTE: + case TEP_EVENT_DQUOTE: + arg->type = TEP_FILTER_ARG_VALUE; arg->value.type = - type == EVENT_DQUOTE ? FILTER_STRING : FILTER_CHAR; + type == TEP_EVENT_DQUOTE ? TEP_FILTER_STRING : TEP_FILTER_CHAR; arg->value.str = strdup(token); if (!arg->value.str) { free_arg(arg); @@ -359,11 +360,11 @@ create_arg_item(struct event_format *event, const char *token, return TEP_ERRNO__MEM_ALLOC_FAILED; } break; - case EVENT_ITEM: + case TEP_EVENT_ITEM: /* if it is a number, then convert it */ if (isdigit(token[0])) { - arg->type = FILTER_ARG_VALUE; - arg->value.type = FILTER_NUMBER; + arg->type = TEP_FILTER_ARG_VALUE; + arg->value.type = TEP_FILTER_NUMBER; arg->value.val = strtoull(token, NULL, 0); break; } @@ -377,12 +378,12 @@ create_arg_item(struct event_format *event, const char *token, field = &cpu; } else { /* not a field, Make it false */ - arg->type = FILTER_ARG_BOOLEAN; - arg->boolean.value = FILTER_FALSE; + arg->type = TEP_FILTER_ARG_BOOLEAN; + arg->boolean.value = TEP_FILTER_FALSE; break; } } - arg->type = FILTER_ARG_FIELD; + arg->type = TEP_FILTER_ARG_FIELD; arg->field.field = field; break; default: @@ -394,82 +395,82 @@ create_arg_item(struct event_format *event, const char *token, return 0; } -static struct filter_arg * -create_arg_op(enum filter_op_type btype) +static struct tep_filter_arg * +create_arg_op(enum tep_filter_op_type btype) { - struct filter_arg *arg; + struct tep_filter_arg *arg; arg = allocate_arg(); if (!arg) return NULL; - arg->type = FILTER_ARG_OP; + arg->type = TEP_FILTER_ARG_OP; arg->op.type = btype; return arg; } -static struct filter_arg * -create_arg_exp(enum filter_exp_type etype) +static struct tep_filter_arg * +create_arg_exp(enum tep_filter_exp_type etype) { - struct filter_arg *arg; + struct tep_filter_arg *arg; arg = allocate_arg(); if (!arg) return NULL; - arg->type = FILTER_ARG_EXP; + arg->type = TEP_FILTER_ARG_EXP; arg->exp.type = etype; return arg; } -static struct filter_arg * -create_arg_cmp(enum filter_cmp_type ctype) +static struct tep_filter_arg * +create_arg_cmp(enum tep_filter_cmp_type ctype) { - struct filter_arg *arg; + struct tep_filter_arg *arg; arg = allocate_arg(); if (!arg) return NULL; /* Use NUM and change if necessary */ - arg->type = FILTER_ARG_NUM; + arg->type = TEP_FILTER_ARG_NUM; arg->num.type = ctype; return arg; } static enum tep_errno -add_right(struct filter_arg *op, struct filter_arg *arg, char *error_str) +add_right(struct tep_filter_arg *op, struct tep_filter_arg *arg, char *error_str) { - struct filter_arg *left; + struct tep_filter_arg *left; char *str; int op_type; int ret; switch (op->type) { - case FILTER_ARG_EXP: + case TEP_FILTER_ARG_EXP: if (op->exp.right) goto out_fail; op->exp.right = arg; break; - case FILTER_ARG_OP: + case TEP_FILTER_ARG_OP: if (op->op.right) goto out_fail; op->op.right = arg; break; - case FILTER_ARG_NUM: + case TEP_FILTER_ARG_NUM: if (op->op.right) goto out_fail; /* * The arg must be num, str, or field */ switch (arg->type) { - case FILTER_ARG_VALUE: - case FILTER_ARG_FIELD: + case TEP_FILTER_ARG_VALUE: + case TEP_FILTER_ARG_FIELD: break; default: show_error(error_str, "Illegal rvalue"); @@ -481,20 +482,20 @@ add_right(struct filter_arg *op, struct filter_arg *arg, char *error_str) * convert this to a string or regex. */ switch (arg->value.type) { - case FILTER_CHAR: + case TEP_FILTER_CHAR: /* * A char should be converted to number if * the string is 1 byte, and the compare * is not a REGEX. */ if (strlen(arg->value.str) == 1 && - op->num.type != FILTER_CMP_REGEX && - op->num.type != FILTER_CMP_NOT_REGEX) { - arg->value.type = FILTER_NUMBER; + op->num.type != TEP_FILTER_CMP_REGEX && + op->num.type != TEP_FILTER_CMP_NOT_REGEX) { + arg->value.type = TEP_FILTER_NUMBER; goto do_int; } /* fall through */ - case FILTER_STRING: + case TEP_FILTER_STRING: /* convert op to a string arg */ op_type = op->num.type; @@ -508,16 +509,16 @@ add_right(struct filter_arg *op, struct filter_arg *arg, char *error_str) * If left arg was a field not found then * NULL the entire op. */ - if (left->type == FILTER_ARG_BOOLEAN) { + if (left->type == TEP_FILTER_ARG_BOOLEAN) { free_arg(left); free_arg(arg); - op->type = FILTER_ARG_BOOLEAN; - op->boolean.value = FILTER_FALSE; + op->type = TEP_FILTER_ARG_BOOLEAN; + op->boolean.value = TEP_FILTER_FALSE; break; } /* Left arg must be a field */ - if (left->type != FILTER_ARG_FIELD) { + if (left->type != TEP_FILTER_ARG_FIELD) { show_error(error_str, "Illegal lvalue for string comparison"); return TEP_ERRNO__ILLEGAL_LVALUE; @@ -525,15 +526,15 @@ add_right(struct filter_arg *op, struct filter_arg *arg, char *error_str) /* Make sure this is a valid string compare */ switch (op_type) { - case FILTER_CMP_EQ: - op_type = FILTER_CMP_MATCH; + case TEP_FILTER_CMP_EQ: + op_type = TEP_FILTER_CMP_MATCH; break; - case FILTER_CMP_NE: - op_type = FILTER_CMP_NOT_MATCH; + case TEP_FILTER_CMP_NE: + op_type = TEP_FILTER_CMP_NOT_MATCH; break; - case FILTER_CMP_REGEX: - case FILTER_CMP_NOT_REGEX: + case TEP_FILTER_CMP_REGEX: + case TEP_FILTER_CMP_NOT_REGEX: ret = regcomp(&op->str.reg, str, REG_ICASE|REG_NOSUB); if (ret) { show_error(error_str, @@ -548,7 +549,7 @@ add_right(struct filter_arg *op, struct filter_arg *arg, char *error_str) return TEP_ERRNO__ILLEGAL_STRING_CMP; } - op->type = FILTER_ARG_STR; + op->type = TEP_FILTER_ARG_STR; op->str.type = op_type; op->str.field = left->field.field; op->str.val = strdup(str); @@ -573,12 +574,12 @@ add_right(struct filter_arg *op, struct filter_arg *arg, char *error_str) break; - case FILTER_NUMBER: + case TEP_FILTER_NUMBER: do_int: switch (op->num.type) { - case FILTER_CMP_REGEX: - case FILTER_CMP_NOT_REGEX: + case TEP_FILTER_CMP_REGEX: + case TEP_FILTER_CMP_NOT_REGEX: show_error(error_str, "Op not allowed with integers"); return TEP_ERRNO__ILLEGAL_INTEGER_CMP; @@ -605,35 +606,35 @@ add_right(struct filter_arg *op, struct filter_arg *arg, char *error_str) return TEP_ERRNO__SYNTAX_ERROR; } -static struct filter_arg * -rotate_op_right(struct filter_arg *a, struct filter_arg *b) +static struct tep_filter_arg * +rotate_op_right(struct tep_filter_arg *a, struct tep_filter_arg *b) { - struct filter_arg *arg; + struct tep_filter_arg *arg; arg = a->op.right; a->op.right = b; return arg; } -static enum tep_errno add_left(struct filter_arg *op, struct filter_arg *arg) +static enum tep_errno add_left(struct tep_filter_arg *op, struct tep_filter_arg *arg) { switch (op->type) { - case FILTER_ARG_EXP: - if (arg->type == FILTER_ARG_OP) + case TEP_FILTER_ARG_EXP: + if (arg->type == TEP_FILTER_ARG_OP) arg = rotate_op_right(arg, op); op->exp.left = arg; break; - case FILTER_ARG_OP: + case TEP_FILTER_ARG_OP: op->op.left = arg; break; - case FILTER_ARG_NUM: - if (arg->type == FILTER_ARG_OP) + case TEP_FILTER_ARG_NUM: + if (arg->type == TEP_FILTER_ARG_OP) arg = rotate_op_right(arg, op); /* left arg of compares must be a field */ - if (arg->type != FILTER_ARG_FIELD && - arg->type != FILTER_ARG_BOOLEAN) + if (arg->type != TEP_FILTER_ARG_FIELD && + arg->type != TEP_FILTER_ARG_BOOLEAN) return TEP_ERRNO__INVALID_ARG_TYPE; op->num.left = arg; break; @@ -652,91 +653,91 @@ enum op_type { }; static enum op_type process_op(const char *token, - enum filter_op_type *btype, - enum filter_cmp_type *ctype, - enum filter_exp_type *etype) + enum tep_filter_op_type *btype, + enum tep_filter_cmp_type *ctype, + enum tep_filter_exp_type *etype) { - *btype = FILTER_OP_NOT; - *etype = FILTER_EXP_NONE; - *ctype = FILTER_CMP_NONE; + *btype = TEP_FILTER_OP_NOT; + *etype = TEP_FILTER_EXP_NONE; + *ctype = TEP_FILTER_CMP_NONE; if (strcmp(token, "&&") == 0) - *btype = FILTER_OP_AND; + *btype = TEP_FILTER_OP_AND; else if (strcmp(token, "||") == 0) - *btype = FILTER_OP_OR; + *btype = TEP_FILTER_OP_OR; else if (strcmp(token, "!") == 0) return OP_NOT; - if (*btype != FILTER_OP_NOT) + if (*btype != TEP_FILTER_OP_NOT) return OP_BOOL; /* Check for value expressions */ if (strcmp(token, "+") == 0) { - *etype = FILTER_EXP_ADD; + *etype = TEP_FILTER_EXP_ADD; } else if (strcmp(token, "-") == 0) { - *etype = FILTER_EXP_SUB; + *etype = TEP_FILTER_EXP_SUB; } else if (strcmp(token, "*") == 0) { - *etype = FILTER_EXP_MUL; + *etype = TEP_FILTER_EXP_MUL; } else if (strcmp(token, "/") == 0) { - *etype = FILTER_EXP_DIV; + *etype = TEP_FILTER_EXP_DIV; } else if (strcmp(token, "%") == 0) { - *etype = FILTER_EXP_MOD; + *etype = TEP_FILTER_EXP_MOD; } else if (strcmp(token, ">>") == 0) { - *etype = FILTER_EXP_RSHIFT; + *etype = TEP_FILTER_EXP_RSHIFT; } else if (strcmp(token, "<<") == 0) { - *etype = FILTER_EXP_LSHIFT; + *etype = TEP_FILTER_EXP_LSHIFT; } else if (strcmp(token, "&") == 0) { - *etype = FILTER_EXP_AND; + *etype = TEP_FILTER_EXP_AND; } else if (strcmp(token, "|") == 0) { - *etype = FILTER_EXP_OR; + *etype = TEP_FILTER_EXP_OR; } else if (strcmp(token, "^") == 0) { - *etype = FILTER_EXP_XOR; + *etype = TEP_FILTER_EXP_XOR; } else if (strcmp(token, "~") == 0) - *etype = FILTER_EXP_NOT; + *etype = TEP_FILTER_EXP_NOT; - if (*etype != FILTER_EXP_NONE) + if (*etype != TEP_FILTER_EXP_NONE) return OP_EXP; /* Check for compares */ if (strcmp(token, "==") == 0) - *ctype = FILTER_CMP_EQ; + *ctype = TEP_FILTER_CMP_EQ; else if (strcmp(token, "!=") == 0) - *ctype = FILTER_CMP_NE; + *ctype = TEP_FILTER_CMP_NE; else if (strcmp(token, "<") == 0) - *ctype = FILTER_CMP_LT; + *ctype = TEP_FILTER_CMP_LT; else if (strcmp(token, ">") == 0) - *ctype = FILTER_CMP_GT; + *ctype = TEP_FILTER_CMP_GT; else if (strcmp(token, "<=") == 0) - *ctype = FILTER_CMP_LE; + *ctype = TEP_FILTER_CMP_LE; else if (strcmp(token, ">=") == 0) - *ctype = FILTER_CMP_GE; + *ctype = TEP_FILTER_CMP_GE; else if (strcmp(token, "=~") == 0) - *ctype = FILTER_CMP_REGEX; + *ctype = TEP_FILTER_CMP_REGEX; else if (strcmp(token, "!~") == 0) - *ctype = FILTER_CMP_NOT_REGEX; + *ctype = TEP_FILTER_CMP_NOT_REGEX; else return OP_NONE; return OP_CMP; } -static int check_op_done(struct filter_arg *arg) +static int check_op_done(struct tep_filter_arg *arg) { switch (arg->type) { - case FILTER_ARG_EXP: + case TEP_FILTER_ARG_EXP: return arg->exp.right != NULL; - case FILTER_ARG_OP: + case TEP_FILTER_ARG_OP: return arg->op.right != NULL; - case FILTER_ARG_NUM: + case TEP_FILTER_ARG_NUM: return arg->num.right != NULL; - case FILTER_ARG_STR: + case TEP_FILTER_ARG_STR: /* A string conversion is always done */ return 1; - case FILTER_ARG_BOOLEAN: + case TEP_FILTER_ARG_BOOLEAN: /* field not found, is ok */ return 1; @@ -752,14 +753,14 @@ enum filter_vals { }; static enum tep_errno -reparent_op_arg(struct filter_arg *parent, struct filter_arg *old_child, - struct filter_arg *arg, char *error_str) +reparent_op_arg(struct tep_filter_arg *parent, struct tep_filter_arg *old_child, + struct tep_filter_arg *arg, char *error_str) { - struct filter_arg *other_child; - struct filter_arg **ptr; + struct tep_filter_arg *other_child; + struct tep_filter_arg **ptr; - if (parent->type != FILTER_ARG_OP && - arg->type != FILTER_ARG_OP) { + if (parent->type != TEP_FILTER_ARG_OP && + arg->type != TEP_FILTER_ARG_OP) { show_error(error_str, "can not reparent other than OP"); return TEP_ERRNO__REPARENT_NOT_OP; } @@ -804,7 +805,7 @@ reparent_op_arg(struct filter_arg *parent, struct filter_arg *old_child, } /* Returns either filter_vals (success) or tep_errno (failfure) */ -static int test_arg(struct filter_arg *parent, struct filter_arg *arg, +static int test_arg(struct tep_filter_arg *parent, struct tep_filter_arg *arg, char *error_str) { int lval, rval; @@ -812,16 +813,16 @@ static int test_arg(struct filter_arg *parent, struct filter_arg *arg, switch (arg->type) { /* bad case */ - case FILTER_ARG_BOOLEAN: + case TEP_FILTER_ARG_BOOLEAN: return FILTER_VAL_FALSE + arg->boolean.value; /* good cases: */ - case FILTER_ARG_STR: - case FILTER_ARG_VALUE: - case FILTER_ARG_FIELD: + case TEP_FILTER_ARG_STR: + case TEP_FILTER_ARG_VALUE: + case TEP_FILTER_ARG_FIELD: return FILTER_VAL_NORM; - case FILTER_ARG_EXP: + case TEP_FILTER_ARG_EXP: lval = test_arg(arg, arg->exp.left, error_str); if (lval != FILTER_VAL_NORM) return lval; @@ -830,7 +831,7 @@ static int test_arg(struct filter_arg *parent, struct filter_arg *arg, return rval; return FILTER_VAL_NORM; - case FILTER_ARG_NUM: + case TEP_FILTER_ARG_NUM: lval = test_arg(arg, arg->num.left, error_str); if (lval != FILTER_VAL_NORM) return lval; @@ -839,14 +840,14 @@ static int test_arg(struct filter_arg *parent, struct filter_arg *arg, return rval; return FILTER_VAL_NORM; - case FILTER_ARG_OP: - if (arg->op.type != FILTER_OP_NOT) { + case TEP_FILTER_ARG_OP: + if (arg->op.type != TEP_FILTER_OP_NOT) { lval = test_arg(arg, arg->op.left, error_str); switch (lval) { case FILTER_VAL_NORM: break; case FILTER_VAL_TRUE: - if (arg->op.type == FILTER_OP_OR) + if (arg->op.type == TEP_FILTER_OP_OR) return FILTER_VAL_TRUE; rval = test_arg(arg, arg->op.right, error_str); if (rval != FILTER_VAL_NORM) @@ -856,7 +857,7 @@ static int test_arg(struct filter_arg *parent, struct filter_arg *arg, error_str); case FILTER_VAL_FALSE: - if (arg->op.type == FILTER_OP_AND) + if (arg->op.type == TEP_FILTER_OP_AND) return FILTER_VAL_FALSE; rval = test_arg(arg, arg->op.right, error_str); if (rval != FILTER_VAL_NORM) @@ -877,18 +878,18 @@ static int test_arg(struct filter_arg *parent, struct filter_arg *arg, break; case FILTER_VAL_TRUE: - if (arg->op.type == FILTER_OP_OR) + if (arg->op.type == TEP_FILTER_OP_OR) return FILTER_VAL_TRUE; - if (arg->op.type == FILTER_OP_NOT) + if (arg->op.type == TEP_FILTER_OP_NOT) return FILTER_VAL_FALSE; return reparent_op_arg(parent, arg, arg->op.left, error_str); case FILTER_VAL_FALSE: - if (arg->op.type == FILTER_OP_AND) + if (arg->op.type == TEP_FILTER_OP_AND) return FILTER_VAL_FALSE; - if (arg->op.type == FILTER_OP_NOT) + if (arg->op.type == TEP_FILTER_OP_NOT) return FILTER_VAL_TRUE; return reparent_op_arg(parent, arg, arg->op.left, @@ -904,8 +905,8 @@ static int test_arg(struct filter_arg *parent, struct filter_arg *arg, } /* Remove any unknown event fields */ -static int collapse_tree(struct filter_arg *arg, - struct filter_arg **arg_collapsed, char *error_str) +static int collapse_tree(struct tep_filter_arg *arg, + struct tep_filter_arg **arg_collapsed, char *error_str) { int ret; @@ -919,7 +920,7 @@ static int collapse_tree(struct filter_arg *arg, free_arg(arg); arg = allocate_arg(); if (arg) { - arg->type = FILTER_ARG_BOOLEAN; + arg->type = TEP_FILTER_ARG_BOOLEAN; arg->boolean.value = ret == FILTER_VAL_TRUE; } else { show_error(error_str, "Failed to allocate filter arg"); @@ -939,19 +940,19 @@ static int collapse_tree(struct filter_arg *arg, } static enum tep_errno -process_filter(struct event_format *event, struct filter_arg **parg, +process_filter(struct tep_event_format *event, struct tep_filter_arg **parg, char *error_str, int not) { - enum event_type type; + enum tep_event_type type; char *token = NULL; - struct filter_arg *current_op = NULL; - struct filter_arg *current_exp = NULL; - struct filter_arg *left_item = NULL; - struct filter_arg *arg = NULL; + struct tep_filter_arg *current_op = NULL; + struct tep_filter_arg *current_exp = NULL; + struct tep_filter_arg *left_item = NULL; + struct tep_filter_arg *arg = NULL; enum op_type op_type; - enum filter_op_type btype; - enum filter_exp_type etype; - enum filter_cmp_type ctype; + enum tep_filter_op_type btype; + enum tep_filter_exp_type etype; + enum tep_filter_cmp_type ctype; enum tep_errno ret; *parg = NULL; @@ -960,9 +961,9 @@ process_filter(struct event_format *event, struct filter_arg **parg, free(token); type = read_token(&token); switch (type) { - case EVENT_SQUOTE: - case EVENT_DQUOTE: - case EVENT_ITEM: + case TEP_EVENT_SQUOTE: + case TEP_EVENT_DQUOTE: + case TEP_EVENT_ITEM: ret = create_arg_item(event, token, type, &arg, error_str); if (ret < 0) goto fail; @@ -987,7 +988,7 @@ process_filter(struct event_format *event, struct filter_arg **parg, arg = NULL; break; - case EVENT_DELIM: + case TEP_EVENT_DELIM: if (*token == ',') { show_error(error_str, "Illegal token ','"); ret = TEP_ERRNO__ILLEGAL_TOKEN; @@ -1054,7 +1055,7 @@ process_filter(struct event_format *event, struct filter_arg **parg, } break; - case EVENT_OP: + case TEP_EVENT_OP: op_type = process_op(token, &btype, &ctype, &etype); /* All expect a left arg except for NOT */ @@ -1139,14 +1140,14 @@ process_filter(struct event_format *event, struct filter_arg **parg, if (ret < 0) goto fail_syntax; break; - case EVENT_NONE: + case TEP_EVENT_NONE: break; - case EVENT_ERROR: + case TEP_EVENT_ERROR: goto fail_alloc; default: goto fail_syntax; } - } while (type != EVENT_NONE); + } while (type != TEP_EVENT_NONE); if (!current_op && !current_exp) goto fail_syntax; @@ -1179,8 +1180,8 @@ process_filter(struct event_format *event, struct filter_arg **parg, } static enum tep_errno -process_event(struct event_format *event, const char *filter_str, - struct filter_arg **parg, char *error_str) +process_event(struct tep_event_format *event, const char *filter_str, + struct tep_filter_arg **parg, char *error_str) { int ret; @@ -1196,19 +1197,19 @@ process_event(struct event_format *event, const char *filter_str, if (*parg == NULL) return TEP_ERRNO__MEM_ALLOC_FAILED; - (*parg)->type = FILTER_ARG_BOOLEAN; - (*parg)->boolean.value = FILTER_FALSE; + (*parg)->type = TEP_FILTER_ARG_BOOLEAN; + (*parg)->boolean.value = TEP_FILTER_FALSE; } return 0; } static enum tep_errno -filter_event(struct event_filter *filter, struct event_format *event, +filter_event(struct tep_event_filter *filter, struct tep_event_format *event, const char *filter_str, char *error_str) { - struct filter_type *filter_type; - struct filter_arg *arg; + struct tep_filter_type *filter_type; + struct tep_filter_arg *arg; enum tep_errno ret; if (filter_str) { @@ -1222,8 +1223,8 @@ filter_event(struct event_filter *filter, struct event_format *event, if (arg == NULL) return TEP_ERRNO__MEM_ALLOC_FAILED; - arg->type = FILTER_ARG_BOOLEAN; - arg->boolean.value = FILTER_TRUE; + arg->type = TEP_FILTER_ARG_BOOLEAN; + arg->boolean.value = TEP_FILTER_TRUE; } filter_type = add_filter_type(filter, event->id); @@ -1237,7 +1238,7 @@ filter_event(struct event_filter *filter, struct event_format *event, return 0; } -static void filter_init_error_buf(struct event_filter *filter) +static void filter_init_error_buf(struct tep_event_filter *filter) { /* clear buffer to reset show error */ tep_buffer_init("", 0); @@ -1253,7 +1254,7 @@ static void filter_init_error_buf(struct event_filter *filter) * negative error code. Use tep_filter_strerror() to see * actual error message in case of error. */ -enum tep_errno tep_filter_add_filter_str(struct event_filter *filter, +enum tep_errno tep_filter_add_filter_str(struct tep_event_filter *filter, const char *filter_str) { struct tep_handle *pevent = filter->pevent; @@ -1351,7 +1352,7 @@ enum tep_errno tep_filter_add_filter_str(struct event_filter *filter, return rtn; } -static void free_filter_type(struct filter_type *filter_type) +static void free_filter_type(struct tep_filter_type *filter_type) { free_arg(filter_type->filter); } @@ -1365,7 +1366,7 @@ static void free_filter_type(struct filter_type *filter_type) * * Returns 0 if message was filled successfully, -1 if error */ -int tep_filter_strerror(struct event_filter *filter, enum tep_errno err, +int tep_filter_strerror(struct tep_event_filter *filter, enum tep_errno err, char *buf, size_t buflen) { if (err <= __TEP_ERRNO__START || err >= __TEP_ERRNO__END) @@ -1393,10 +1394,10 @@ int tep_filter_strerror(struct event_filter *filter, enum tep_errno err, * Returns 1: if an event was removed * 0: if the event was not found */ -int tep_filter_remove_event(struct event_filter *filter, +int tep_filter_remove_event(struct tep_event_filter *filter, int event_id) { - struct filter_type *filter_type; + struct tep_filter_type *filter_type; unsigned long len; if (!filter->filters) @@ -1428,7 +1429,7 @@ int tep_filter_remove_event(struct event_filter *filter, * * Removes all filters from a filter and resets it. */ -void tep_filter_reset(struct event_filter *filter) +void tep_filter_reset(struct tep_event_filter *filter) { int i; @@ -1440,7 +1441,7 @@ void tep_filter_reset(struct event_filter *filter) filter->event_filters = NULL; } -void tep_filter_free(struct event_filter *filter) +void tep_filter_free(struct tep_event_filter *filter) { tep_unref(filter->pevent); @@ -1449,14 +1450,14 @@ void tep_filter_free(struct event_filter *filter) free(filter); } -static char *arg_to_str(struct event_filter *filter, struct filter_arg *arg); +static char *arg_to_str(struct tep_event_filter *filter, struct tep_filter_arg *arg); -static int copy_filter_type(struct event_filter *filter, - struct event_filter *source, - struct filter_type *filter_type) +static int copy_filter_type(struct tep_event_filter *filter, + struct tep_event_filter *source, + struct tep_filter_type *filter_type) { - struct filter_arg *arg; - struct event_format *event; + struct tep_filter_arg *arg; + struct tep_event_format *event; const char *sys; const char *name; char *str; @@ -1478,7 +1479,7 @@ static int copy_filter_type(struct event_filter *filter, if (arg == NULL) return -1; - arg->type = FILTER_ARG_BOOLEAN; + arg->type = TEP_FILTER_ARG_BOOLEAN; if (strcmp(str, "TRUE") == 0) arg->boolean.value = 1; else @@ -1507,7 +1508,7 @@ static int copy_filter_type(struct event_filter *filter, * * Returns 0 on success and -1 if not all filters were copied */ -int tep_filter_copy(struct event_filter *dest, struct event_filter *source) +int tep_filter_copy(struct tep_event_filter *dest, struct tep_event_filter *source) { int ret = 0; int i; @@ -1533,14 +1534,14 @@ int tep_filter_copy(struct event_filter *dest, struct event_filter *source) * Returns 0 on success and -1 if there was a problem updating, but * events may have still been updated on error. */ -int tep_update_trivial(struct event_filter *dest, struct event_filter *source, - enum filter_trivial_type type) +int tep_update_trivial(struct tep_event_filter *dest, struct tep_event_filter *source, + enum tep_filter_trivial_type type) { struct tep_handle *src_pevent; struct tep_handle *dest_pevent; - struct event_format *event; - struct filter_type *filter_type; - struct filter_arg *arg; + struct tep_event_format *event; + struct tep_filter_type *filter_type; + struct tep_filter_arg *arg; char *str; int i; @@ -1554,10 +1555,10 @@ int tep_update_trivial(struct event_filter *dest, struct event_filter *source, for (i = 0; i < dest->filters; i++) { filter_type = &dest->event_filters[i]; arg = filter_type->filter; - if (arg->type != FILTER_ARG_BOOLEAN) + if (arg->type != TEP_FILTER_ARG_BOOLEAN) continue; - if ((arg->boolean.value && type == FILTER_TRIVIAL_FALSE) || - (!arg->boolean.value && type == FILTER_TRIVIAL_TRUE)) + if ((arg->boolean.value && type == TEP_FILTER_TRIVIAL_FALSE) || + (!arg->boolean.value && type == TEP_FILTER_TRIVIAL_TRUE)) continue; event = filter_type->event; @@ -1592,10 +1593,10 @@ int tep_update_trivial(struct event_filter *dest, struct event_filter *source, * * Returns 0 on success and -1 if there was a problem. */ -int tep_filter_clear_trivial(struct event_filter *filter, - enum filter_trivial_type type) +int tep_filter_clear_trivial(struct tep_event_filter *filter, + enum tep_filter_trivial_type type) { - struct filter_type *filter_type; + struct tep_filter_type *filter_type; int count = 0; int *ids = NULL; int i; @@ -1611,14 +1612,14 @@ int tep_filter_clear_trivial(struct event_filter *filter, int *new_ids; filter_type = &filter->event_filters[i]; - if (filter_type->filter->type != FILTER_ARG_BOOLEAN) + if (filter_type->filter->type != TEP_FILTER_ARG_BOOLEAN) continue; switch (type) { - case FILTER_TRIVIAL_FALSE: + case TEP_FILTER_TRIVIAL_FALSE: if (filter_type->filter->boolean.value) continue; break; - case FILTER_TRIVIAL_TRUE: + case TEP_FILTER_TRIVIAL_TRUE: if (!filter_type->filter->boolean.value) continue; default: @@ -1654,11 +1655,11 @@ int tep_filter_clear_trivial(struct event_filter *filter, * Returns 1 if the event contains a matching trivial type * otherwise 0. */ -int tep_filter_event_has_trivial(struct event_filter *filter, +int tep_filter_event_has_trivial(struct tep_event_filter *filter, int event_id, - enum filter_trivial_type type) + enum tep_filter_trivial_type type) { - struct filter_type *filter_type; + struct tep_filter_type *filter_type; if (!filter->filters) return 0; @@ -1668,25 +1669,25 @@ int tep_filter_event_has_trivial(struct event_filter *filter, if (!filter_type) return 0; - if (filter_type->filter->type != FILTER_ARG_BOOLEAN) + if (filter_type->filter->type != TEP_FILTER_ARG_BOOLEAN) return 0; switch (type) { - case FILTER_TRIVIAL_FALSE: + case TEP_FILTER_TRIVIAL_FALSE: return !filter_type->filter->boolean.value; - case FILTER_TRIVIAL_TRUE: + case TEP_FILTER_TRIVIAL_TRUE: return filter_type->filter->boolean.value; default: return 1; } } -static int test_filter(struct event_format *event, struct filter_arg *arg, +static int test_filter(struct tep_event_format *event, struct tep_filter_arg *arg, struct tep_record *record, enum tep_errno *err); static const char * -get_comm(struct event_format *event, struct tep_record *record) +get_comm(struct tep_event_format *event, struct tep_record *record) { const char *comm; int pid; @@ -1697,8 +1698,8 @@ get_comm(struct event_format *event, struct tep_record *record) } static unsigned long long -get_value(struct event_format *event, - struct format_field *field, struct tep_record *record) +get_value(struct tep_event_format *event, + struct tep_format_field *field, struct tep_record *record) { unsigned long long val; @@ -1716,7 +1717,7 @@ get_value(struct event_format *event, tep_read_number_field(field, record->data, &val); - if (!(field->flags & FIELD_IS_SIGNED)) + if (!(field->flags & TEP_FIELD_IS_SIGNED)) return val; switch (field->size) { @@ -1733,11 +1734,11 @@ get_value(struct event_format *event, } static unsigned long long -get_arg_value(struct event_format *event, struct filter_arg *arg, +get_arg_value(struct tep_event_format *event, struct tep_filter_arg *arg, struct tep_record *record, enum tep_errno *err); static unsigned long long -get_exp_value(struct event_format *event, struct filter_arg *arg, +get_exp_value(struct tep_event_format *event, struct tep_filter_arg *arg, struct tep_record *record, enum tep_errno *err) { unsigned long long lval, rval; @@ -1753,37 +1754,37 @@ get_exp_value(struct event_format *event, struct filter_arg *arg, } switch (arg->exp.type) { - case FILTER_EXP_ADD: + case TEP_FILTER_EXP_ADD: return lval + rval; - case FILTER_EXP_SUB: + case TEP_FILTER_EXP_SUB: return lval - rval; - case FILTER_EXP_MUL: + case TEP_FILTER_EXP_MUL: return lval * rval; - case FILTER_EXP_DIV: + case TEP_FILTER_EXP_DIV: return lval / rval; - case FILTER_EXP_MOD: + case TEP_FILTER_EXP_MOD: return lval % rval; - case FILTER_EXP_RSHIFT: + case TEP_FILTER_EXP_RSHIFT: return lval >> rval; - case FILTER_EXP_LSHIFT: + case TEP_FILTER_EXP_LSHIFT: return lval << rval; - case FILTER_EXP_AND: + case TEP_FILTER_EXP_AND: return lval & rval; - case FILTER_EXP_OR: + case TEP_FILTER_EXP_OR: return lval | rval; - case FILTER_EXP_XOR: + case TEP_FILTER_EXP_XOR: return lval ^ rval; - case FILTER_EXP_NOT: + case TEP_FILTER_EXP_NOT: default: if (!*err) *err = TEP_ERRNO__INVALID_EXP_TYPE; @@ -1792,21 +1793,21 @@ get_exp_value(struct event_format *event, struct filter_arg *arg, } static unsigned long long -get_arg_value(struct event_format *event, struct filter_arg *arg, +get_arg_value(struct tep_event_format *event, struct tep_filter_arg *arg, struct tep_record *record, enum tep_errno *err) { switch (arg->type) { - case FILTER_ARG_FIELD: + case TEP_FILTER_ARG_FIELD: return get_value(event, arg->field.field, record); - case FILTER_ARG_VALUE: - if (arg->value.type != FILTER_NUMBER) { + case TEP_FILTER_ARG_VALUE: + if (arg->value.type != TEP_FILTER_NUMBER) { if (!*err) *err = TEP_ERRNO__NOT_A_NUMBER; } return arg->value.val; - case FILTER_ARG_EXP: + case TEP_FILTER_ARG_EXP: return get_exp_value(event, arg, record, err); default: @@ -1816,7 +1817,7 @@ get_arg_value(struct event_format *event, struct filter_arg *arg, return 0; } -static int test_num(struct event_format *event, struct filter_arg *arg, +static int test_num(struct tep_event_format *event, struct tep_filter_arg *arg, struct tep_record *record, enum tep_errno *err) { unsigned long long lval, rval; @@ -1832,22 +1833,22 @@ static int test_num(struct event_format *event, struct filter_arg *arg, } switch (arg->num.type) { - case FILTER_CMP_EQ: + case TEP_FILTER_CMP_EQ: return lval == rval; - case FILTER_CMP_NE: + case TEP_FILTER_CMP_NE: return lval != rval; - case FILTER_CMP_GT: + case TEP_FILTER_CMP_GT: return lval > rval; - case FILTER_CMP_LT: + case TEP_FILTER_CMP_LT: return lval < rval; - case FILTER_CMP_GE: + case TEP_FILTER_CMP_GE: return lval >= rval; - case FILTER_CMP_LE: + case TEP_FILTER_CMP_LE: return lval <= rval; default: @@ -1857,9 +1858,9 @@ static int test_num(struct event_format *event, struct filter_arg *arg, } } -static const char *get_field_str(struct filter_arg *arg, struct tep_record *record) +static const char *get_field_str(struct tep_filter_arg *arg, struct tep_record *record) { - struct event_format *event; + struct tep_event_format *event; struct tep_handle *pevent; unsigned long long addr; const char *val = NULL; @@ -1867,11 +1868,11 @@ static const char *get_field_str(struct filter_arg *arg, struct tep_record *reco char hex[64]; /* If the field is not a string convert it */ - if (arg->str.field->flags & FIELD_IS_STRING) { + if (arg->str.field->flags & TEP_FIELD_IS_STRING) { val = record->data + arg->str.field->offset; size = arg->str.field->size; - if (arg->str.field->flags & FIELD_IS_DYNAMIC) { + if (arg->str.field->flags & TEP_FIELD_IS_DYNAMIC) { addr = *(unsigned int *)val; val = record->data + (addr & 0xffff); size = addr >> 16; @@ -1893,7 +1894,7 @@ static const char *get_field_str(struct filter_arg *arg, struct tep_record *reco pevent = event->pevent; addr = get_value(event, arg->str.field, record); - if (arg->str.field->flags & (FIELD_IS_POINTER | FIELD_IS_LONG)) + if (arg->str.field->flags & (TEP_FIELD_IS_POINTER | TEP_FIELD_IS_LONG)) /* convert to a kernel symbol */ val = tep_find_function(pevent, addr); @@ -1907,7 +1908,7 @@ static const char *get_field_str(struct filter_arg *arg, struct tep_record *reco return val; } -static int test_str(struct event_format *event, struct filter_arg *arg, +static int test_str(struct tep_event_format *event, struct tep_filter_arg *arg, struct tep_record *record, enum tep_errno *err) { const char *val; @@ -1918,17 +1919,17 @@ static int test_str(struct event_format *event, struct filter_arg *arg, val = get_field_str(arg, record); switch (arg->str.type) { - case FILTER_CMP_MATCH: + case TEP_FILTER_CMP_MATCH: return strcmp(val, arg->str.val) == 0; - case FILTER_CMP_NOT_MATCH: + case TEP_FILTER_CMP_NOT_MATCH: return strcmp(val, arg->str.val) != 0; - case FILTER_CMP_REGEX: + case TEP_FILTER_CMP_REGEX: /* Returns zero on match */ return !regexec(&arg->str.reg, val, 0, NULL, 0); - case FILTER_CMP_NOT_REGEX: + case TEP_FILTER_CMP_NOT_REGEX: return regexec(&arg->str.reg, val, 0, NULL, 0); default: @@ -1938,19 +1939,19 @@ static int test_str(struct event_format *event, struct filter_arg *arg, } } -static int test_op(struct event_format *event, struct filter_arg *arg, +static int test_op(struct tep_event_format *event, struct tep_filter_arg *arg, struct tep_record *record, enum tep_errno *err) { switch (arg->op.type) { - case FILTER_OP_AND: + case TEP_FILTER_OP_AND: return test_filter(event, arg->op.left, record, err) && test_filter(event, arg->op.right, record, err); - case FILTER_OP_OR: + case TEP_FILTER_OP_OR: return test_filter(event, arg->op.left, record, err) || test_filter(event, arg->op.right, record, err); - case FILTER_OP_NOT: + case TEP_FILTER_OP_NOT: return !test_filter(event, arg->op.right, record, err); default: @@ -1960,7 +1961,7 @@ static int test_op(struct event_format *event, struct filter_arg *arg, } } -static int test_filter(struct event_format *event, struct filter_arg *arg, +static int test_filter(struct tep_event_format *event, struct tep_filter_arg *arg, struct tep_record *record, enum tep_errno *err) { if (*err) { @@ -1971,22 +1972,22 @@ static int test_filter(struct event_format *event, struct filter_arg *arg, } switch (arg->type) { - case FILTER_ARG_BOOLEAN: + case TEP_FILTER_ARG_BOOLEAN: /* easy case */ return arg->boolean.value; - case FILTER_ARG_OP: + case TEP_FILTER_ARG_OP: return test_op(event, arg, record, err); - case FILTER_ARG_NUM: + case TEP_FILTER_ARG_NUM: return test_num(event, arg, record, err); - case FILTER_ARG_STR: + case TEP_FILTER_ARG_STR: return test_str(event, arg, record, err); - case FILTER_ARG_EXP: - case FILTER_ARG_VALUE: - case FILTER_ARG_FIELD: + case TEP_FILTER_ARG_EXP: + case TEP_FILTER_ARG_VALUE: + case TEP_FILTER_ARG_FIELD: /* * Expressions, fields and values evaluate * to true if they return non zero @@ -2008,9 +2009,9 @@ static int test_filter(struct event_format *event, struct filter_arg *arg, * Returns 1 if filter found for @event_id * otherwise 0; */ -int tep_event_filtered(struct event_filter *filter, int event_id) +int tep_event_filtered(struct tep_event_filter *filter, int event_id) { - struct filter_type *filter_type; + struct tep_filter_type *filter_type; if (!filter->filters) return 0; @@ -2032,11 +2033,11 @@ int tep_event_filtered(struct event_filter *filter, int event_id) * NO_FILTER - if no filters exist * otherwise - error occurred during test */ -enum tep_errno tep_filter_match(struct event_filter *filter, +enum tep_errno tep_filter_match(struct tep_event_filter *filter, struct tep_record *record) { struct tep_handle *pevent = filter->pevent; - struct filter_type *filter_type; + struct tep_filter_type *filter_type; int event_id; int ret; enum tep_errno err = 0; @@ -2059,7 +2060,7 @@ enum tep_errno tep_filter_match(struct event_filter *filter, return ret ? TEP_ERRNO__FILTER_MATCH : TEP_ERRNO__FILTER_MISS; } -static char *op_to_str(struct event_filter *filter, struct filter_arg *arg) +static char *op_to_str(struct tep_event_filter *filter, struct tep_filter_arg *arg) { char *str = NULL; char *left = NULL; @@ -2070,10 +2071,10 @@ static char *op_to_str(struct event_filter *filter, struct filter_arg *arg) int val; switch (arg->op.type) { - case FILTER_OP_AND: + case TEP_FILTER_OP_AND: op = "&&"; /* fall through */ - case FILTER_OP_OR: + case TEP_FILTER_OP_OR: if (!op) op = "||"; @@ -2094,8 +2095,8 @@ static char *op_to_str(struct event_filter *filter, struct filter_arg *arg) right_val = 0; if (left_val >= 0) { - if ((arg->op.type == FILTER_OP_AND && !left_val) || - (arg->op.type == FILTER_OP_OR && left_val)) { + if ((arg->op.type == TEP_FILTER_OP_AND && !left_val) || + (arg->op.type == TEP_FILTER_OP_OR && left_val)) { /* Just return left value */ str = left; left = NULL; @@ -2105,10 +2106,10 @@ static char *op_to_str(struct event_filter *filter, struct filter_arg *arg) /* just evaluate this. */ val = 0; switch (arg->op.type) { - case FILTER_OP_AND: + case TEP_FILTER_OP_AND: val = left_val && right_val; break; - case FILTER_OP_OR: + case TEP_FILTER_OP_OR: val = left_val || right_val; break; default: @@ -2119,8 +2120,8 @@ static char *op_to_str(struct event_filter *filter, struct filter_arg *arg) } } if (right_val >= 0) { - if ((arg->op.type == FILTER_OP_AND && !right_val) || - (arg->op.type == FILTER_OP_OR && right_val)) { + if ((arg->op.type == TEP_FILTER_OP_AND && !right_val) || + (arg->op.type == TEP_FILTER_OP_OR && right_val)) { /* Just return right value */ str = right; right = NULL; @@ -2135,7 +2136,7 @@ static char *op_to_str(struct event_filter *filter, struct filter_arg *arg) asprintf(&str, "(%s) %s (%s)", left, op, right); break; - case FILTER_OP_NOT: + case TEP_FILTER_OP_NOT: op = "!"; right = arg_to_str(filter, arg->op.right); if (!right) @@ -2163,7 +2164,7 @@ static char *op_to_str(struct event_filter *filter, struct filter_arg *arg) return str; } -static char *val_to_str(struct event_filter *filter, struct filter_arg *arg) +static char *val_to_str(struct tep_event_filter *filter, struct tep_filter_arg *arg) { char *str = NULL; @@ -2172,12 +2173,12 @@ static char *val_to_str(struct event_filter *filter, struct filter_arg *arg) return str; } -static char *field_to_str(struct event_filter *filter, struct filter_arg *arg) +static char *field_to_str(struct tep_event_filter *filter, struct tep_filter_arg *arg) { return strdup(arg->field.field->name); } -static char *exp_to_str(struct event_filter *filter, struct filter_arg *arg) +static char *exp_to_str(struct tep_event_filter *filter, struct tep_filter_arg *arg) { char *lstr; char *rstr; @@ -2190,34 +2191,34 @@ static char *exp_to_str(struct event_filter *filter, struct filter_arg *arg) goto out; switch (arg->exp.type) { - case FILTER_EXP_ADD: + case TEP_FILTER_EXP_ADD: op = "+"; break; - case FILTER_EXP_SUB: + case TEP_FILTER_EXP_SUB: op = "-"; break; - case FILTER_EXP_MUL: + case TEP_FILTER_EXP_MUL: op = "*"; break; - case FILTER_EXP_DIV: + case TEP_FILTER_EXP_DIV: op = "/"; break; - case FILTER_EXP_MOD: + case TEP_FILTER_EXP_MOD: op = "%"; break; - case FILTER_EXP_RSHIFT: + case TEP_FILTER_EXP_RSHIFT: op = ">>"; break; - case FILTER_EXP_LSHIFT: + case TEP_FILTER_EXP_LSHIFT: op = "<<"; break; - case FILTER_EXP_AND: + case TEP_FILTER_EXP_AND: op = "&"; break; - case FILTER_EXP_OR: + case TEP_FILTER_EXP_OR: op = "|"; break; - case FILTER_EXP_XOR: + case TEP_FILTER_EXP_XOR: op = "^"; break; default: @@ -2233,7 +2234,7 @@ out: return str; } -static char *num_to_str(struct event_filter *filter, struct filter_arg *arg) +static char *num_to_str(struct tep_event_filter *filter, struct tep_filter_arg *arg) { char *lstr; char *rstr; @@ -2246,26 +2247,26 @@ static char *num_to_str(struct event_filter *filter, struct filter_arg *arg) goto out; switch (arg->num.type) { - case FILTER_CMP_EQ: + case TEP_FILTER_CMP_EQ: op = "=="; /* fall through */ - case FILTER_CMP_NE: + case TEP_FILTER_CMP_NE: if (!op) op = "!="; /* fall through */ - case FILTER_CMP_GT: + case TEP_FILTER_CMP_GT: if (!op) op = ">"; /* fall through */ - case FILTER_CMP_LT: + case TEP_FILTER_CMP_LT: if (!op) op = "<"; /* fall through */ - case FILTER_CMP_GE: + case TEP_FILTER_CMP_GE: if (!op) op = ">="; /* fall through */ - case FILTER_CMP_LE: + case TEP_FILTER_CMP_LE: if (!op) op = "<="; @@ -2283,24 +2284,24 @@ out: return str; } -static char *str_to_str(struct event_filter *filter, struct filter_arg *arg) +static char *str_to_str(struct tep_event_filter *filter, struct tep_filter_arg *arg) { char *str = NULL; char *op = NULL; switch (arg->str.type) { - case FILTER_CMP_MATCH: + case TEP_FILTER_CMP_MATCH: op = "=="; /* fall through */ - case FILTER_CMP_NOT_MATCH: + case TEP_FILTER_CMP_NOT_MATCH: if (!op) op = "!="; /* fall through */ - case FILTER_CMP_REGEX: + case TEP_FILTER_CMP_REGEX: if (!op) op = "=~"; /* fall through */ - case FILTER_CMP_NOT_REGEX: + case TEP_FILTER_CMP_NOT_REGEX: if (!op) op = "!~"; @@ -2315,31 +2316,31 @@ static char *str_to_str(struct event_filter *filter, struct filter_arg *arg) return str; } -static char *arg_to_str(struct event_filter *filter, struct filter_arg *arg) +static char *arg_to_str(struct tep_event_filter *filter, struct tep_filter_arg *arg) { char *str = NULL; switch (arg->type) { - case FILTER_ARG_BOOLEAN: + case TEP_FILTER_ARG_BOOLEAN: asprintf(&str, arg->boolean.value ? "TRUE" : "FALSE"); return str; - case FILTER_ARG_OP: + case TEP_FILTER_ARG_OP: return op_to_str(filter, arg); - case FILTER_ARG_NUM: + case TEP_FILTER_ARG_NUM: return num_to_str(filter, arg); - case FILTER_ARG_STR: + case TEP_FILTER_ARG_STR: return str_to_str(filter, arg); - case FILTER_ARG_VALUE: + case TEP_FILTER_ARG_VALUE: return val_to_str(filter, arg); - case FILTER_ARG_FIELD: + case TEP_FILTER_ARG_FIELD: return field_to_str(filter, arg); - case FILTER_ARG_EXP: + case TEP_FILTER_ARG_EXP: return exp_to_str(filter, arg); default: @@ -2359,9 +2360,9 @@ static char *arg_to_str(struct event_filter *filter, struct filter_arg *arg) * NULL is returned if no filter is found or allocation failed. */ char * -tep_filter_make_string(struct event_filter *filter, int event_id) +tep_filter_make_string(struct tep_event_filter *filter, int event_id) { - struct filter_type *filter_type; + struct tep_filter_type *filter_type; if (!filter->filters) return NULL; @@ -2383,10 +2384,10 @@ tep_filter_make_string(struct event_filter *filter, int event_id) * 1 if the two filters hold the same content. * 0 if they do not. */ -int tep_filter_compare(struct event_filter *filter1, struct event_filter *filter2) +int tep_filter_compare(struct tep_event_filter *filter1, struct tep_event_filter *filter2) { - struct filter_type *filter_type1; - struct filter_type *filter_type2; + struct tep_filter_type *filter_type1; + struct tep_filter_type *filter_type2; char *str1, *str2; int result; int i; @@ -2409,8 +2410,8 @@ int tep_filter_compare(struct event_filter *filter1, struct event_filter *filter if (filter_type1->filter->type != filter_type2->filter->type) break; switch (filter_type1->filter->type) { - case FILTER_TRIVIAL_FALSE: - case FILTER_TRIVIAL_TRUE: + case TEP_FILTER_TRIVIAL_FALSE: + case TEP_FILTER_TRIVIAL_TRUE: /* trivial types just need the type compared */ continue; default: diff --git a/tools/lib/traceevent/plugin_function.c b/tools/lib/traceevent/plugin_function.c index 424747475d37..528acc75d81a 100644 --- a/tools/lib/traceevent/plugin_function.c +++ b/tools/lib/traceevent/plugin_function.c @@ -23,6 +23,7 @@ #include "event-parse.h" #include "event-utils.h" +#include "trace-seq.h" static struct func_stack { int size; @@ -123,7 +124,7 @@ static int add_and_get_index(const char *parent, const char *child, int cpu) } static int function_handler(struct trace_seq *s, struct tep_record *record, - struct event_format *event, void *context) + struct tep_event_format *event, void *context) { struct tep_handle *pevent = event->pevent; unsigned long long function; diff --git a/tools/lib/traceevent/plugin_hrtimer.c b/tools/lib/traceevent/plugin_hrtimer.c index b43bfec565d8..9aa05b4ca811 100644 --- a/tools/lib/traceevent/plugin_hrtimer.c +++ b/tools/lib/traceevent/plugin_hrtimer.c @@ -23,10 +23,11 @@ #include #include "event-parse.h" +#include "trace-seq.h" static int timer_expire_handler(struct trace_seq *s, struct tep_record *record, - struct event_format *event, void *context) + struct tep_event_format *event, void *context) { trace_seq_printf(s, "hrtimer="); @@ -46,7 +47,7 @@ static int timer_expire_handler(struct trace_seq *s, static int timer_start_handler(struct trace_seq *s, struct tep_record *record, - struct event_format *event, void *context) + struct tep_event_format *event, void *context) { trace_seq_printf(s, "hrtimer="); diff --git a/tools/lib/traceevent/plugin_jbd2.c b/tools/lib/traceevent/plugin_jbd2.c index 45a9acd19640..a5e34135dd6a 100644 --- a/tools/lib/traceevent/plugin_jbd2.c +++ b/tools/lib/traceevent/plugin_jbd2.c @@ -22,6 +22,7 @@ #include #include "event-parse.h" +#include "trace-seq.h" #define MINORBITS 20 #define MINORMASK ((1U << MINORBITS) - 1) diff --git a/tools/lib/traceevent/plugin_kmem.c b/tools/lib/traceevent/plugin_kmem.c index 73966b05abce..1beb4eaddfdf 100644 --- a/tools/lib/traceevent/plugin_kmem.c +++ b/tools/lib/traceevent/plugin_kmem.c @@ -22,11 +22,12 @@ #include #include "event-parse.h" +#include "trace-seq.h" static int call_site_handler(struct trace_seq *s, struct tep_record *record, - struct event_format *event, void *context) + struct tep_event_format *event, void *context) { - struct format_field *field; + struct tep_format_field *field; unsigned long long val, addr; void *data = record->data; const char *func; diff --git a/tools/lib/traceevent/plugin_kvm.c b/tools/lib/traceevent/plugin_kvm.c index 1d0d15906225..d13c22846fa9 100644 --- a/tools/lib/traceevent/plugin_kvm.c +++ b/tools/lib/traceevent/plugin_kvm.c @@ -23,6 +23,7 @@ #include #include "event-parse.h" +#include "trace-seq.h" #ifdef HAVE_UDIS86 @@ -248,7 +249,7 @@ static const char *find_exit_reason(unsigned isa, int val) } static int print_exit_reason(struct trace_seq *s, struct tep_record *record, - struct event_format *event, const char *field) + struct tep_event_format *event, const char *field) { unsigned long long isa; unsigned long long val; @@ -269,7 +270,7 @@ static int print_exit_reason(struct trace_seq *s, struct tep_record *record, } static int kvm_exit_handler(struct trace_seq *s, struct tep_record *record, - struct event_format *event, void *context) + struct tep_event_format *event, void *context) { unsigned long long info1 = 0, info2 = 0; @@ -292,7 +293,7 @@ static int kvm_exit_handler(struct trace_seq *s, struct tep_record *record, static int kvm_emulate_insn_handler(struct trace_seq *s, struct tep_record *record, - struct event_format *event, void *context) + struct tep_event_format *event, void *context) { unsigned long long rip, csbase, len, flags, failed; int llen; @@ -331,7 +332,7 @@ static int kvm_emulate_insn_handler(struct trace_seq *s, static int kvm_nested_vmexit_inject_handler(struct trace_seq *s, struct tep_record *record, - struct event_format *event, void *context) + struct tep_event_format *event, void *context) { if (print_exit_reason(s, record, event, "exit_code") < 0) return -1; @@ -345,7 +346,7 @@ static int kvm_nested_vmexit_inject_handler(struct trace_seq *s, struct tep_reco } static int kvm_nested_vmexit_handler(struct trace_seq *s, struct tep_record *record, - struct event_format *event, void *context) + struct tep_event_format *event, void *context) { tep_print_num_field(s, "rip %llx ", event, "rip", record, 1); @@ -371,7 +372,7 @@ union kvm_mmu_page_role { }; static int kvm_mmu_print_role(struct trace_seq *s, struct tep_record *record, - struct event_format *event, void *context) + struct tep_event_format *event, void *context) { unsigned long long val; static const char *access_str[] = { @@ -418,7 +419,7 @@ static int kvm_mmu_print_role(struct trace_seq *s, struct tep_record *record, static int kvm_mmu_get_page_handler(struct trace_seq *s, struct tep_record *record, - struct event_format *event, void *context) + struct tep_event_format *event, void *context) { unsigned long long val; diff --git a/tools/lib/traceevent/plugin_mac80211.c b/tools/lib/traceevent/plugin_mac80211.c index de50a5316203..da3855e7b86f 100644 --- a/tools/lib/traceevent/plugin_mac80211.c +++ b/tools/lib/traceevent/plugin_mac80211.c @@ -22,13 +22,14 @@ #include #include "event-parse.h" +#include "trace-seq.h" #define INDENT 65 -static void print_string(struct trace_seq *s, struct event_format *event, +static void print_string(struct trace_seq *s, struct tep_event_format *event, const char *name, const void *data) { - struct format_field *f = tep_find_field(event, name); + struct tep_format_field *f = tep_find_field(event, name); int offset; int length; @@ -59,7 +60,7 @@ static void print_string(struct trace_seq *s, struct event_format *event, static int drv_bss_info_changed(struct trace_seq *s, struct tep_record *record, - struct event_format *event, void *context) + struct tep_event_format *event, void *context) { void *data = record->data; diff --git a/tools/lib/traceevent/plugin_sched_switch.c b/tools/lib/traceevent/plugin_sched_switch.c index eecb4bd95c11..77882272672f 100644 --- a/tools/lib/traceevent/plugin_sched_switch.c +++ b/tools/lib/traceevent/plugin_sched_switch.c @@ -22,6 +22,7 @@ #include #include "event-parse.h" +#include "trace-seq.h" static void write_state(struct trace_seq *s, int val) { @@ -44,7 +45,7 @@ static void write_state(struct trace_seq *s, int val) trace_seq_putc(s, 'R'); } -static void write_and_save_comm(struct format_field *field, +static void write_and_save_comm(struct tep_format_field *field, struct tep_record *record, struct trace_seq *s, int pid) { @@ -66,9 +67,9 @@ static void write_and_save_comm(struct format_field *field, static int sched_wakeup_handler(struct trace_seq *s, struct tep_record *record, - struct event_format *event, void *context) + struct tep_event_format *event, void *context) { - struct format_field *field; + struct tep_format_field *field; unsigned long long val; if (tep_get_field_val(s, event, "pid", record, &val, 1)) @@ -95,9 +96,9 @@ static int sched_wakeup_handler(struct trace_seq *s, static int sched_switch_handler(struct trace_seq *s, struct tep_record *record, - struct event_format *event, void *context) + struct tep_event_format *event, void *context) { - struct format_field *field; + struct tep_format_field *field; unsigned long long val; if (tep_get_field_val(s, event, "prev_pid", record, &val, 1)) diff --git a/tools/lib/traceevent/plugin_scsi.c b/tools/lib/traceevent/plugin_scsi.c index 5ec346f6b842..4eba25cc1431 100644 --- a/tools/lib/traceevent/plugin_scsi.c +++ b/tools/lib/traceevent/plugin_scsi.c @@ -3,6 +3,7 @@ #include #include #include "event-parse.h" +#include "trace-seq.h" typedef unsigned long sector_t; typedef uint64_t u64; diff --git a/tools/lib/traceevent/plugin_xen.c b/tools/lib/traceevent/plugin_xen.c index b2acbd6e9c86..bc0496e4c296 100644 --- a/tools/lib/traceevent/plugin_xen.c +++ b/tools/lib/traceevent/plugin_xen.c @@ -3,6 +3,7 @@ #include #include #include "event-parse.h" +#include "trace-seq.h" #define __HYPERVISOR_set_trap_table 0 #define __HYPERVISOR_mmu_update 1 diff --git a/tools/lib/traceevent/tep_strerror.c b/tools/lib/traceevent/tep_strerror.c new file mode 100644 index 000000000000..4ac26445b2f6 --- /dev/null +++ b/tools/lib/traceevent/tep_strerror.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: LGPL-2.1 +#undef _GNU_SOURCE +#include +#include + +#include "event-parse.h" + +#undef _PE +#define _PE(code, str) str +static const char * const tep_error_str[] = { + TEP_ERRORS +}; +#undef _PE + +/* + * The tools so far have been using the strerror_r() GNU variant, that returns + * a string, be it the buffer passed or something else. + * + * But that, besides being tricky in cases where we expect that the function + * using strerror_r() returns the error formatted in a provided buffer (we have + * to check if it returned something else and copy that instead), breaks the + * build on systems not using glibc, like Alpine Linux, where musl libc is + * used. + * + * So, introduce yet another wrapper, str_error_r(), that has the GNU + * interface, but uses the portable XSI variant of strerror_r(), so that users + * rest asured that the provided buffer is used and it is what is returned. + */ +int tep_strerror(struct tep_handle *tep __maybe_unused, + enum tep_errno errnum, char *buf, size_t buflen) +{ + const char *msg; + int idx; + + if (!buflen) + return 0; + + if (errnum >= 0) { + int err = strerror_r(errnum, buf, buflen); + buf[buflen - 1] = 0; + return err; + } + + if (errnum <= __TEP_ERRNO__START || + errnum >= __TEP_ERRNO__END) + return -1; + + idx = errnum - __TEP_ERRNO__START - 1; + msg = tep_error_str[idx]; + snprintf(buf, buflen, "%s", msg); + + return 0; +} diff --git a/tools/lib/traceevent/trace-seq.c b/tools/lib/traceevent/trace-seq.c index e3bac4543d3b..8ff1d55954d1 100644 --- a/tools/lib/traceevent/trace-seq.c +++ b/tools/lib/traceevent/trace-seq.c @@ -3,6 +3,8 @@ * Copyright (C) 2009 Red Hat Inc, Steven Rostedt * */ +#include "trace-seq.h" + #include #include #include diff --git a/tools/lib/traceevent/trace-seq.h b/tools/lib/traceevent/trace-seq.h new file mode 100644 index 000000000000..d68ec69f8d1a --- /dev/null +++ b/tools/lib/traceevent/trace-seq.h @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: LGPL-2.1 +/* + * Copyright (C) 2009, 2010 Red Hat Inc, Steven Rostedt + * + */ + +#ifndef _TRACE_SEQ_H +#define _TRACE_SEQ_H + +#include +#include + +/* ----------------------- trace_seq ----------------------- */ + +#ifndef TRACE_SEQ_BUF_SIZE +#define TRACE_SEQ_BUF_SIZE 4096 +#endif + +enum trace_seq_fail { + TRACE_SEQ__GOOD, + TRACE_SEQ__BUFFER_POISONED, + TRACE_SEQ__MEM_ALLOC_FAILED, +}; + +/* + * Trace sequences are used to allow a function to call several other functions + * to create a string of data to use (up to a max of PAGE_SIZE). + */ + +struct trace_seq { + char *buffer; + unsigned int buffer_size; + unsigned int len; + unsigned int readpos; + enum trace_seq_fail state; +}; + +void trace_seq_init(struct trace_seq *s); +void trace_seq_reset(struct trace_seq *s); +void trace_seq_destroy(struct trace_seq *s); + +extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) + __attribute__ ((format (printf, 2, 0))); + +extern int trace_seq_puts(struct trace_seq *s, const char *str); +extern int trace_seq_putc(struct trace_seq *s, unsigned char c); + +extern void trace_seq_terminate(struct trace_seq *s); + +extern int trace_seq_do_fprintf(struct trace_seq *s, FILE *fp); +extern int trace_seq_do_printf(struct trace_seq *s); + +#endif /* _TRACE_SEQ_H */ diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt index 0cbd1ef8f86d..35bff92cc773 100644 --- a/tools/memory-model/Documentation/explanation.txt +++ b/tools/memory-model/Documentation/explanation.txt @@ -28,7 +28,8 @@ Explanation of the Linux-Kernel Memory Consistency Model 20. THE HAPPENS-BEFORE RELATION: hb 21. THE PROPAGATES-BEFORE RELATION: pb 22. RCU RELATIONS: rcu-link, gp, rscs, rcu-fence, and rb - 23. ODDS AND ENDS + 23. LOCKING + 24. ODDS AND ENDS @@ -1067,28 +1068,6 @@ allowing out-of-order writes like this to occur. The model avoided violating the write-write coherence rule by requiring the CPU not to send the W write to the memory subsystem at all!) -There is one last example of preserved program order in the LKMM: when -a load-acquire reads from an earlier store-release. For example: - - smp_store_release(&x, 123); - r1 = smp_load_acquire(&x); - -If the smp_load_acquire() ends up obtaining the 123 value that was -stored by the smp_store_release(), the LKMM says that the load must be -executed after the store; the store cannot be forwarded to the load. -This requirement does not arise from the operational model, but it -yields correct predictions on all architectures supported by the Linux -kernel, although for differing reasons. - -On some architectures, including x86 and ARMv8, it is true that the -store cannot be forwarded to the load. On others, including PowerPC -and ARMv7, smp_store_release() generates object code that starts with -a fence and smp_load_acquire() generates object code that ends with a -fence. The upshot is that even though the store may be forwarded to -the load, it is still true that any instruction preceding the store -will be executed before the load or any following instructions, and -the store will be executed before any instruction following the load. - AND THEN THERE WAS ALPHA ------------------------ @@ -1766,6 +1745,147 @@ before it does, and the critical section in P2 both starts after P1's grace period does and ends after it does. +LOCKING +------- + +The LKMM includes locking. In fact, there is special code for locking +in the formal model, added in order to make tools run faster. +However, this special code is intended to be more or less equivalent +to concepts we have already covered. A spinlock_t variable is treated +the same as an int, and spin_lock(&s) is treated almost the same as: + + while (cmpxchg_acquire(&s, 0, 1) != 0) + cpu_relax(); + +This waits until s is equal to 0 and then atomically sets it to 1, +and the read part of the cmpxchg operation acts as an acquire fence. +An alternate way to express the same thing would be: + + r = xchg_acquire(&s, 1); + +along with a requirement that at the end, r = 0. Similarly, +spin_trylock(&s) is treated almost the same as: + + return !cmpxchg_acquire(&s, 0, 1); + +which atomically sets s to 1 if it is currently equal to 0 and returns +true if it succeeds (the read part of the cmpxchg operation acts as an +acquire fence only if the operation is successful). spin_unlock(&s) +is treated almost the same as: + + smp_store_release(&s, 0); + +The "almost" qualifiers above need some explanation. In the LKMM, the +store-release in a spin_unlock() and the load-acquire which forms the +first half of the atomic rmw update in a spin_lock() or a successful +spin_trylock() -- we can call these things lock-releases and +lock-acquires -- have two properties beyond those of ordinary releases +and acquires. + +First, when a lock-acquire reads from a lock-release, the LKMM +requires that every instruction po-before the lock-release must +execute before any instruction po-after the lock-acquire. This would +naturally hold if the release and acquire operations were on different +CPUs, but the LKMM says it holds even when they are on the same CPU. +For example: + + int x, y; + spinlock_t s; + + P0() + { + int r1, r2; + + spin_lock(&s); + r1 = READ_ONCE(x); + spin_unlock(&s); + spin_lock(&s); + r2 = READ_ONCE(y); + spin_unlock(&s); + } + + P1() + { + WRITE_ONCE(y, 1); + smp_wmb(); + WRITE_ONCE(x, 1); + } + +Here the second spin_lock() reads from the first spin_unlock(), and +therefore the load of x must execute before the load of y. Thus we +cannot have r1 = 1 and r2 = 0 at the end (this is an instance of the +MP pattern). + +This requirement does not apply to ordinary release and acquire +fences, only to lock-related operations. For instance, suppose P0() +in the example had been written as: + + P0() + { + int r1, r2, r3; + + r1 = READ_ONCE(x); + smp_store_release(&s, 1); + r3 = smp_load_acquire(&s); + r2 = READ_ONCE(y); + } + +Then the CPU would be allowed to forward the s = 1 value from the +smp_store_release() to the smp_load_acquire(), executing the +instructions in the following order: + + r3 = smp_load_acquire(&s); // Obtains r3 = 1 + r2 = READ_ONCE(y); + r1 = READ_ONCE(x); + smp_store_release(&s, 1); // Value is forwarded + +and thus it could load y before x, obtaining r2 = 0 and r1 = 1. + +Second, when a lock-acquire reads from a lock-release, and some other +stores W and W' occur po-before the lock-release and po-after the +lock-acquire respectively, the LKMM requires that W must propagate to +each CPU before W' does. For example, consider: + + int x, y; + spinlock_t x; + + P0() + { + spin_lock(&s); + WRITE_ONCE(x, 1); + spin_unlock(&s); + } + + P1() + { + int r1; + + spin_lock(&s); + r1 = READ_ONCE(x); + WRITE_ONCE(y, 1); + spin_unlock(&s); + } + + P2() + { + int r2, r3; + + r2 = READ_ONCE(y); + smp_rmb(); + r3 = READ_ONCE(x); + } + +If r1 = 1 at the end then the spin_lock() in P1 must have read from +the spin_unlock() in P0. Hence the store to x must propagate to P2 +before the store to y does, so we cannot have r2 = 1 and r3 = 0. + +These two special requirements for lock-release and lock-acquire do +not arise from the operational model. Nevertheless, kernel developers +have come to expect and rely on them because they do hold on all +architectures supported by the Linux kernel, albeit for various +differing reasons. + + ODDS AND ENDS ------------- @@ -1831,26 +1951,6 @@ they behave as follows: events and the events preceding them against all po-later events. -The LKMM includes locking. In fact, there is special code for locking -in the formal model, added in order to make tools run faster. -However, this special code is intended to be exactly equivalent to -concepts we have already covered. A spinlock_t variable is treated -the same as an int, and spin_lock(&s) is treated the same as: - - while (cmpxchg_acquire(&s, 0, 1) != 0) - cpu_relax(); - -which waits until s is equal to 0 and then atomically sets it to 1, -and where the read part of the atomic update is also an acquire fence. -An alternate way to express the same thing would be: - - r = xchg_acquire(&s, 1); - -along with a requirement that at the end, r = 0. spin_unlock(&s) is -treated the same as: - - smp_store_release(&s, 0); - Interestingly, RCU and locking each introduce the possibility of deadlock. When faced with code sequences such as: diff --git a/tools/memory-model/Documentation/recipes.txt b/tools/memory-model/Documentation/recipes.txt index af72700cc20a..7fe8d7aa3029 100644 --- a/tools/memory-model/Documentation/recipes.txt +++ b/tools/memory-model/Documentation/recipes.txt @@ -311,7 +311,7 @@ The smp_wmb() macro orders prior stores against later stores, and the smp_rmb() macro orders prior loads against later loads. Therefore, if the final value of r0 is 1, the final value of r1 must also be 1. -The the xlog_state_switch_iclogs() function in fs/xfs/xfs_log.c contains +The xlog_state_switch_iclogs() function in fs/xfs/xfs_log.c contains the following write-side code fragment: log->l_curr_block -= log->l_logBBsize; diff --git a/tools/memory-model/README b/tools/memory-model/README index ee987ce20aae..acf9077cffaa 100644 --- a/tools/memory-model/README +++ b/tools/memory-model/README @@ -171,6 +171,12 @@ The Linux-kernel memory model has the following limitations: particular, the "THE PROGRAM ORDER RELATION: po AND po-loc" and "A WARNING" sections). + Note that this limitation in turn limits LKMM's ability to + accurately model address, control, and data dependencies. + For example, if the compiler can deduce the value of some variable + carrying a dependency, then the compiler can break that dependency + by substituting a constant of that value. + 2. Multiple access sizes for a single variable are not supported, and neither are misaligned or partially overlapping accesses. @@ -190,6 +196,36 @@ The Linux-kernel memory model has the following limitations: However, a substantial amount of support is provided for these operations, as shown in the linux-kernel.def file. + a. When rcu_assign_pointer() is passed NULL, the Linux + kernel provides no ordering, but LKMM models this + case as a store release. + + b. The "unless" RMW operations are not currently modeled: + atomic_long_add_unless(), atomic_add_unless(), + atomic_inc_unless_negative(), and + atomic_dec_unless_positive(). These can be emulated + in litmus tests, for example, by using atomic_cmpxchg(). + + c. The call_rcu() function is not modeled. It can be + emulated in litmus tests by adding another process that + invokes synchronize_rcu() and the body of the callback + function, with (for example) a release-acquire from + the site of the emulated call_rcu() to the beginning + of the additional process. + + d. The rcu_barrier() function is not modeled. It can be + emulated in litmus tests emulating call_rcu() via + (for example) a release-acquire from the end of each + additional call_rcu() process to the site of the + emulated rcu-barrier(). + + e. Sleepable RCU (SRCU) is not modeled. It can be + emulated, but perhaps not simply. + + f. Reader-writer locking is not modeled. It can be + emulated in litmus tests using atomic read-modify-write + operations. + The "herd7" tool has some additional limitations of its own, apart from the memory model: @@ -204,3 +240,6 @@ the memory model: Some of these limitations may be overcome in the future, but others are more likely to be addressed by incorporating the Linux-kernel memory model into other tools. + +Finally, please note that LKMM is subject to change as hardware, use cases, +and compilers evolve. diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat index 59b5cbe6b624..882fc33274ac 100644 --- a/tools/memory-model/linux-kernel.cat +++ b/tools/memory-model/linux-kernel.cat @@ -38,7 +38,7 @@ let strong-fence = mb | gp (* Release Acquire *) let acq-po = [Acquire] ; po ; [M] let po-rel = [M] ; po ; [Release] -let rfi-rel-acq = [Release] ; rfi ; [Acquire] +let po-unlock-rf-lock-po = po ; [UL] ; rf ; [LKR] ; po (**********************************) (* Fundamental coherence ordering *) @@ -60,13 +60,13 @@ let dep = addr | data let rwdep = (dep | ctrl) ; [W] let overwrite = co | fr let to-w = rwdep | (overwrite & int) -let to-r = addr | (dep ; rfi) | rfi-rel-acq +let to-r = addr | (dep ; rfi) let fence = strong-fence | wmb | po-rel | rmb | acq-po -let ppo = to-r | to-w | fence +let ppo = to-r | to-w | fence | (po-unlock-rf-lock-po & int) (* Propagation: Ordering from release operations and strong fences. *) let A-cumul(r) = rfe? ; r -let cumul-fence = A-cumul(strong-fence | po-rel) | wmb +let cumul-fence = A-cumul(strong-fence | po-rel) | wmb | po-unlock-rf-lock-po let prop = (overwrite & ext)? ; cumul-fence* ; rfe? (* diff --git a/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus b/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus index 0f749e419b34..094d58df7789 100644 --- a/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus +++ b/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus @@ -1,11 +1,10 @@ C ISA2+pooncelock+pooncelock+pombonce (* - * Result: Sometimes + * Result: Never * - * This test shows that the ordering provided by a lock-protected S - * litmus test (P0() and P1()) are not visible to external process P2(). - * This is likely to change soon. + * This test shows that write-write ordering provided by locks + * (in P0() and P1()) is visible to external process P2(). *) {} diff --git a/tools/memory-model/litmus-tests/README b/tools/memory-model/litmus-tests/README index 4581ec2d3c57..5ee08f129094 100644 --- a/tools/memory-model/litmus-tests/README +++ b/tools/memory-model/litmus-tests/README @@ -1,4 +1,6 @@ -This directory contains the following litmus tests: +============ +LITMUS TESTS +============ CoRR+poonceonce+Once.litmus Test of read-read coherence, that is, whether or not two @@ -36,7 +38,7 @@ IRIW+poonceonces+OnceOnce.litmus ISA2+pooncelock+pooncelock+pombonce.litmus Tests whether the ordering provided by a lock-protected S litmus test is visible to an external process whose accesses are - separated by smp_mb(). This addition of an external process to + separated by smp_mb(). This addition of an external process to S is otherwise known as ISA2. ISA2+poonceonces.litmus @@ -151,3 +153,101 @@ Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus A great many more litmus tests are available here: https://github.com/paulmckrcu/litmus + +================== +LITMUS TEST NAMING +================== + +Litmus tests are usually named based on their contents, which means that +looking at the name tells you what the litmus test does. The naming +scheme covers litmus tests having a single cycle that passes through +each process exactly once, so litmus tests not fitting this description +are named on an ad-hoc basis. + +The structure of a litmus-test name is the litmus-test class, a plus +sign ("+"), and one string for each process, separated by plus signs. +The end of the name is ".litmus". + +The litmus-test classes may be found in the infamous test6.pdf: +https://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test6.pdf +Each class defines the pattern of accesses and of the variables accessed. +For example, if the one process writes to a pair of variables, and +the other process reads from these same variables, the corresponding +litmus-test class is "MP" (message passing), which may be found on the +left-hand end of the second row of tests on page one of test6.pdf. + +The strings used to identify the actions carried out by each process are +complex due to a desire to have short(er) names. Thus, there is a tool to +generate these strings from a given litmus test's actions. For example, +consider the processes from SB+rfionceonce-poonceonces.litmus: + + P0(int *x, int *y) + { + int r1; + int r2; + + WRITE_ONCE(*x, 1); + r1 = READ_ONCE(*x); + r2 = READ_ONCE(*y); + } + + P1(int *x, int *y) + { + int r3; + int r4; + + WRITE_ONCE(*y, 1); + r3 = READ_ONCE(*y); + r4 = READ_ONCE(*x); + } + +The next step is to construct a space-separated list of descriptors, +interleaving descriptions of the relation between a pair of consecutive +accesses with descriptions of the second access in the pair. + +P0()'s WRITE_ONCE() is read by its first READ_ONCE(), which is a +reads-from link (rf) and internal to the P0() process. This is +"rfi", which is an abbreviation for "reads-from internal". Because +some of the tools string these abbreviations together with space +characters separating processes, the first character is capitalized, +resulting in "Rfi". + +P0()'s second access is a READ_ONCE(), as opposed to (for example) +smp_load_acquire(), so next is "Once". Thus far, we have "Rfi Once". + +P0()'s third access is also a READ_ONCE(), but to y rather than x. +This is related to P0()'s second access by program order ("po"), +to a different variable ("d"), and both accesses are reads ("RR"). +The resulting descriptor is "PodRR". Because P0()'s third access is +READ_ONCE(), we add another "Once" descriptor. + +A from-read ("fre") relation links P0()'s third to P1()'s first +access, and the resulting descriptor is "Fre". P1()'s first access is +WRITE_ONCE(), which as before gives the descriptor "Once". The string +thus far is thus "Rfi Once PodRR Once Fre Once". + +The remainder of P1() is similar to P0(), which means we add +"Rfi Once PodRR Once". Another fre links P1()'s last access to +P0()'s first access, which is WRITE_ONCE(), so we add "Fre Once". +The full string is thus: + + Rfi Once PodRR Once Fre Once Rfi Once PodRR Once Fre Once + +This string can be given to the "norm7" and "classify7" tools to +produce the name: + + $ norm7 -bell linux-kernel.bell \ + Rfi Once PodRR Once Fre Once Rfi Once PodRR Once Fre Once | \ + sed -e 's/:.*//g' + SB+rfionceonce-poonceonces + +Adding the ".litmus" suffix: SB+rfionceonce-poonceonces.litmus + +The descriptors that describe connections between consecutive accesses +within the cycle through a given litmus test can be provided by the herd +tool (Rfi, Po, Fre, and so on) or by the linux-kernel.bell file (Once, +Release, Acquire, and so on). + +To see the full list of descriptors, execute the following command: + + $ diyone7 -bell linux-kernel.bell -show edges diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 2928939b98ec..0414a0d52262 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -836,7 +836,7 @@ static int add_switch_table(struct objtool_file *file, struct instruction *insn, struct symbol *pfunc = insn->func->pfunc; unsigned int prev_offset = 0; - list_for_each_entry_from(rela, &file->rodata->rela->rela_list, list) { + list_for_each_entry_from(rela, &table->rela_sec->rela_list, list) { if (rela == next_table) break; @@ -926,6 +926,7 @@ static struct rela *find_switch_table(struct objtool_file *file, { struct rela *text_rela, *rodata_rela; struct instruction *orig_insn = insn; + struct section *rodata_sec; unsigned long table_offset; /* @@ -953,10 +954,13 @@ static struct rela *find_switch_table(struct objtool_file *file, /* look for a relocation which references .rodata */ text_rela = find_rela_by_dest_range(insn->sec, insn->offset, insn->len); - if (!text_rela || text_rela->sym != file->rodata->sym) + if (!text_rela || text_rela->sym->type != STT_SECTION || + !text_rela->sym->sec->rodata) continue; table_offset = text_rela->addend; + rodata_sec = text_rela->sym->sec; + if (text_rela->type == R_X86_64_PC32) table_offset += 4; @@ -964,10 +968,10 @@ static struct rela *find_switch_table(struct objtool_file *file, * Make sure the .rodata address isn't associated with a * symbol. gcc jump tables are anonymous data. */ - if (find_symbol_containing(file->rodata, table_offset)) + if (find_symbol_containing(rodata_sec, table_offset)) continue; - rodata_rela = find_rela_by_dest(file->rodata, table_offset); + rodata_rela = find_rela_by_dest(rodata_sec, table_offset); if (rodata_rela) { /* * Use of RIP-relative switch jumps is quite rare, and @@ -1052,7 +1056,7 @@ static int add_switch_table_alts(struct objtool_file *file) struct symbol *func; int ret; - if (!file->rodata || !file->rodata->rela) + if (!file->rodata) return 0; for_each_sec(file, sec) { @@ -1198,10 +1202,33 @@ static int read_retpoline_hints(struct objtool_file *file) return 0; } +static void mark_rodata(struct objtool_file *file) +{ + struct section *sec; + bool found = false; + + /* + * This searches for the .rodata section or multiple .rodata.func_name + * sections if -fdata-sections is being used. The .str.1.1 and .str.1.8 + * rodata sections are ignored as they don't contain jump tables. + */ + for_each_sec(file, sec) { + if (!strncmp(sec->name, ".rodata", 7) && + !strstr(sec->name, ".str1.")) { + sec->rodata = true; + found = true; + } + } + + file->rodata = found; +} + static int decode_sections(struct objtool_file *file) { int ret; + mark_rodata(file); + ret = decode_instructions(file); if (ret) return ret; @@ -2171,7 +2198,6 @@ int check(const char *_objname, bool orc) INIT_LIST_HEAD(&file.insn_list); hash_init(file.insn_hash); file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard"); - file.rodata = find_section_by_name(file.elf, ".rodata"); file.c_file = find_section_by_name(file.elf, ".comment"); file.ignore_unreachables = no_unreachable; file.hints = false; diff --git a/tools/objtool/check.h b/tools/objtool/check.h index 95700a2bcb7c..e6e8a655b556 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/check.h @@ -60,8 +60,8 @@ struct objtool_file { struct elf *elf; struct list_head insn_list; DECLARE_HASHTABLE(insn_hash, 16); - struct section *rodata, *whitelist; - bool ignore_unreachables, c_file, hints; + struct section *whitelist; + bool ignore_unreachables, c_file, hints, rodata; }; int check(const char *objname, bool orc); diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 7ec85d567598..6dbb9fae0f9d 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -301,7 +301,7 @@ static int read_symbols(struct elf *elf) if (sym->type != STT_FUNC) continue; sym->pfunc = sym->cfunc = sym; - coldstr = strstr(sym->name, ".cold."); + coldstr = strstr(sym->name, ".cold"); if (!coldstr) continue; @@ -379,6 +379,7 @@ static int read_relas(struct elf *elf) rela->offset = rela->rela.r_offset; symndx = GELF_R_SYM(rela->rela.r_info); rela->sym = find_symbol_by_index(elf, symndx); + rela->rela_sec = sec; if (!rela->sym) { WARN("can't find rela entry symbol %d for %s", symndx, sec->name); diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h index de5cd2ddded9..bc97ed86b9cd 100644 --- a/tools/objtool/elf.h +++ b/tools/objtool/elf.h @@ -48,7 +48,7 @@ struct section { char *name; int idx; unsigned int len; - bool changed, text; + bool changed, text, rodata; }; struct symbol { @@ -68,6 +68,7 @@ struct rela { struct list_head list; struct hlist_node hash; GElf_Rela rela; + struct section *rela_sec; struct symbol *sym; unsigned int type; unsigned long offset; diff --git a/tools/objtool/special.c b/tools/objtool/special.c index 84f001d52322..50af4e1274b3 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -30,9 +30,9 @@ #define EX_ORIG_OFFSET 0 #define EX_NEW_OFFSET 4 -#define JUMP_ENTRY_SIZE 24 +#define JUMP_ENTRY_SIZE 16 #define JUMP_ORIG_OFFSET 0 -#define JUMP_NEW_OFFSET 8 +#define JUMP_NEW_OFFSET 4 #define ALT_ENTRY_SIZE 13 #define ALT_ORIG_OFFSET 0 diff --git a/tools/pci/Build b/tools/pci/Build new file mode 100644 index 000000000000..c375aea21790 --- /dev/null +++ b/tools/pci/Build @@ -0,0 +1 @@ +pcitest-y += pcitest.o diff --git a/tools/pci/Makefile b/tools/pci/Makefile new file mode 100644 index 000000000000..46e4c2f318c9 --- /dev/null +++ b/tools/pci/Makefile @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: GPL-2.0 +include ../scripts/Makefile.include + +bindir ?= /usr/bin + +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(CURDIR))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +endif + +# Do not use make's built-in rules +# (this improves performance and avoids hard-to-debug behaviour); +MAKEFLAGS += -r + +CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include + +ALL_TARGETS := pcitest pcitest.sh +ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) + +all: $(ALL_PROGRAMS) + +export srctree OUTPUT CC LD CFLAGS +include $(srctree)/tools/build/Makefile.include + +# +# We need the following to be outside of kernel tree +# +$(OUTPUT)include/linux/: ../../include/uapi/linux/ + mkdir -p $(OUTPUT)include/linux/ 2>&1 || true + ln -sf $(CURDIR)/../../include/uapi/linux/pcitest.h $@ + +prepare: $(OUTPUT)include/linux/ + +PCITEST_IN := $(OUTPUT)pcitest-in.o +$(PCITEST_IN): prepare FORCE + $(Q)$(MAKE) $(build)=pcitest +$(OUTPUT)pcitest: $(PCITEST_IN) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ + +clean: + rm -f $(ALL_PROGRAMS) + rm -rf $(OUTPUT)include/ + find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete + +install: $(ALL_PROGRAMS) + install -d -m 755 $(DESTDIR)$(bindir); \ + for program in $(ALL_PROGRAMS); do \ + install $$program $(DESTDIR)$(bindir); \ + done + +FORCE: + +.PHONY: all install clean FORCE prepare diff --git a/tools/pci/pcitest.c b/tools/pci/pcitest.c index af146bb03b4d..ec4d51f3308b 100644 --- a/tools/pci/pcitest.c +++ b/tools/pci/pcitest.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include @@ -48,17 +47,15 @@ struct pci_test { unsigned long size; }; -static int run_test(struct pci_test *test) +static void run_test(struct pci_test *test) { long ret; int fd; - struct timespec start, end; - double time; fd = open(test->device, O_RDWR); if (fd < 0) { perror("can't open PCI Endpoint Test device"); - return fd; + return; } if (test->barnum >= 0 && test->barnum <= 5) { diff --git a/tools/perf/Documentation/build-xed.txt b/tools/perf/Documentation/build-xed.txt new file mode 100644 index 000000000000..6222c1e7231f --- /dev/null +++ b/tools/perf/Documentation/build-xed.txt @@ -0,0 +1,19 @@ + +For --xed the xed tool is needed. Here is how to install it: + + $ git clone https://github.com/intelxed/mbuild.git mbuild + $ git clone https://github.com/intelxed/xed + $ cd xed + $ ./mfile.py --share + $ ./mfile.py examples + $ sudo ./mfile.py --prefix=/usr/local install + $ sudo ldconfig + $ sudo cp obj/examples/xed /usr/local/bin + +Basic xed testing: + + $ xed | head -3 + ERROR: required argument(s) were missing + Copyright (C) 2017, Intel Corporation. All rights reserved. + XED version: [v10.0-328-g7d62c8c49b7b] + $ diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt index 76971d2e4164..115eaacc455f 100644 --- a/tools/perf/Documentation/intel-pt.txt +++ b/tools/perf/Documentation/intel-pt.txt @@ -106,7 +106,7 @@ in transaction, respectively. While it is possible to create scripts to analyze the data, an alternative approach is available to export the data to a sqlite or postgresql database. Refer to script export-to-sqlite.py or export-to-postgresql.py for more details, -and to script call-graph-from-sql.py for an example of using the database. +and to script exported-sql-viewer.py for an example of using the database. There is also script intel-pt-events.py which provides an example of how to unpack the raw data for power events and PTWRITE. diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt index a3abe04c779d..c2182cbabde3 100644 --- a/tools/perf/Documentation/itrace.txt +++ b/tools/perf/Documentation/itrace.txt @@ -11,10 +11,11 @@ l synthesize last branch entries (use with i or x) s skip initial number of events - The default is all events i.e. the same as --itrace=ibxwpe + The default is all events i.e. the same as --itrace=ibxwpe, + except for perf script where it is --itrace=ce - In addition, the period (default 100000) for instructions events - can be specified in units of: + In addition, the period (default 100000, except for perf script where it is 1) + for instructions events can be specified in units of: i instructions t ticks diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 236b9b97dfdb..667c14e56031 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -55,7 +55,6 @@ counted. The following modifiers exist: S - read sample value (PERF_SAMPLE_READ) D - pin the event to the PMU W - group is weak and will fallback to non-group if not schedulable, - only supported in 'perf stat' for now. The 'p' modifier can be used for specifying how precise the instruction address should be. The 'p' modifier can be specified multiple times: diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index afdafe2110a1..a2b37ce48094 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -383,6 +383,24 @@ include::itrace.txt[] will be printed. Each entry has function name and file/line. Enabled by default, disable with --no-inline. +--insn-trace:: + Show instruction stream for intel_pt traces. Combine with --xed to + show disassembly. + +--xed:: + Run xed disassembler on output. Requires installing the xed disassembler. + +--call-trace:: + Show call stream for intel_pt traces. The CPUs are interleaved, but + can be filtered with -C. + +--call-ret-trace:: + Show call and return stream for intel_pt traces. + +--graph-function:: + For itrace only show specified functions and their callees for + itrace. Multiple functions can be separated by comma. + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-script-perl[1], diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 114fda12aa49..808b664343c9 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -242,6 +242,16 @@ Default is to monitor all CPUS. --hierarchy:: Enable hierarchy output. +--overwrite:: + Enable this to use just the most recent records, which helps in high core count + machines such as Knights Landing/Mill, but right now is disabled by default as + the pausing used in this technique is leading to loss of metadata events such + as PERF_RECORD_MMAP which makes 'perf top' unable to resolve samples, leading + to lots of unknown samples appearing on the UI. Enable this if you are in such + machines and profiling a workload that doesn't creates short lived threads and/or + doesn't uses many executable mmap operations. Work is being planed to solve + this situation, till then, this will remain disabled by default. + --force:: Don't do ownership validation. diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index 115db9e06ecd..e113450503d2 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -171,6 +171,11 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. --kernel-syscall-graph:: Show the kernel callchains on the syscall exit path. +--max-events=N:: + Stop after processing N events. Note that strace-like events are considered + only at exit time or when a syscall is interrupted, i.e. in those cases this + option is equivalent to the number of lines printed. + --max-stack:: Set the stack depth limit when parsing the callchain, anything beyond the specified depth will be ignored. Note that at this point @@ -238,6 +243,68 @@ Trace syscalls, major and minor pagefaults: As you can see, there was major pagefault in python process, from CRYPTO_push_info_ routine which faulted somewhere in libcrypto.so. +Trace the first 4 open, openat or open_by_handle_at syscalls (in the future more syscalls may match here): + + $ perf trace -e open* --max-events 4 + [root@jouet perf]# trace -e open* --max-events 4 + 2272.992 ( 0.037 ms): gnome-shell/1370 openat(dfd: CWD, filename: /proc/self/stat) = 31 + 2277.481 ( 0.139 ms): gnome-shell/3039 openat(dfd: CWD, filename: /proc/self/stat) = 65 + 3026.398 ( 0.076 ms): gnome-shell/3039 openat(dfd: CWD, filename: /proc/self/stat) = 65 + 4294.665 ( 0.015 ms): sed/15879 openat(dfd: CWD, filename: /etc/ld.so.cache, flags: CLOEXEC) = 3 + $ + +Trace the first minor page fault when running a workload: + + # perf trace -F min --max-stack=7 --max-events 1 sleep 1 + 0.000 ( 0.000 ms): sleep/18006 minfault [__clear_user+0x1a] => 0x5626efa56080 (?k) + __clear_user ([kernel.kallsyms]) + load_elf_binary ([kernel.kallsyms]) + search_binary_handler ([kernel.kallsyms]) + __do_execve_file.isra.33 ([kernel.kallsyms]) + __x64_sys_execve ([kernel.kallsyms]) + do_syscall_64 ([kernel.kallsyms]) + entry_SYSCALL_64 ([kernel.kallsyms]) + # + +Trace the next min page page fault to take place on the first CPU: + + # perf trace -F min --call-graph=dwarf --max-events 1 --cpu 0 + 0.000 ( 0.000 ms): Web Content/17136 minfault [js::gc::Chunk::fetchNextDecommittedArena+0x4b] => 0x7fbe6181b000 (?.) + js::gc::FreeSpan::initAsEmpty (inlined) + js::gc::Arena::setAsNotAllocated (inlined) + js::gc::Chunk::fetchNextDecommittedArena (/usr/lib64/firefox/libxul.so) + js::gc::Chunk::allocateArena (/usr/lib64/firefox/libxul.so) + js::gc::GCRuntime::allocateArena (/usr/lib64/firefox/libxul.so) + js::gc::ArenaLists::allocateFromArena (/usr/lib64/firefox/libxul.so) + js::gc::GCRuntime::tryNewTenuredThing (inlined) + js::AllocateString (/usr/lib64/firefox/libxul.so) + js::Allocate (inlined) + JSThinInlineString::new_<(js::AllowGC)1> (inlined) + AllocateInlineString<(js::AllowGC)1, unsigned char> (inlined) + js::ConcatStrings<(js::AllowGC)1> (/usr/lib64/firefox/libxul.so) + [0x18b26e6bc2bd] (/tmp/perf-17136.map) + # + +Trace the next two sched:sched_switch events, four block:*_plug events, the +next block:*_unplug and the next three net:*dev_queue events, this last one +with a backtrace of at most 16 entries, system wide: + + # perf trace -e sched:*switch/nr=2/,block:*_plug/nr=4/,block:*_unplug/nr=1/,net:*dev_queue/nr=3,max-stack=16/ + 0.000 :0/0 sched:sched_switch:swapper/2:0 [120] S ==> rcu_sched:10 [120] + 0.015 rcu_sched/10 sched:sched_switch:rcu_sched:10 [120] R ==> swapper/2:0 [120] + 254.198 irq/50-iwlwifi/680 net:net_dev_queue:dev=wlp3s0 skbaddr=0xffff93498051f600 len=66 + __dev_queue_xmit ([kernel.kallsyms]) + 273.977 :0/0 net:net_dev_queue:dev=wlp3s0 skbaddr=0xffff93498051f600 len=78 + __dev_queue_xmit ([kernel.kallsyms]) + 274.007 :0/0 net:net_dev_queue:dev=wlp3s0 skbaddr=0xffff93498051ff00 len=78 + __dev_queue_xmit ([kernel.kallsyms]) + 2930.140 kworker/u16:58/2722 block:block_plug:[kworker/u16:58] + 2930.162 kworker/u16:58/2722 block:block_unplug:[kworker/u16:58] 1 + 4466.094 jbd2/dm-2-8/748 block:block_plug:[jbd2/dm-2-8] + 8050.123 kworker/u16:30/2694 block:block_plug:[kworker/u16:30] + 8050.271 kworker/u16:30/2694 block:block_plug:[kworker/u16:30] + # + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-script[1] diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index f6d1a03c7523..e30d20fb482d 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -833,7 +833,7 @@ ifndef NO_JVMTI JDIR=$(shell /usr/sbin/update-java-alternatives -l | head -1 | awk '{print $$3}') else ifneq (,$(wildcard /usr/sbin/alternatives)) - JDIR=$(shell alternatives --display java | tail -1 | cut -d' ' -f 5 | sed 's%/jre/bin/java.%%g') + JDIR=$(shell /usr/sbin/alternatives --display java | tail -1 | cut -d' ' -f 5 | sed 's%/jre/bin/java.%%g') endif endif ifndef JDIR diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 5224ade3d5af..d95655489f7e 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -1,4 +1,5 @@ include ../scripts/Makefile.include +include ../scripts/Makefile.arch # The default target of this Makefile is... all: @@ -385,6 +386,8 @@ export INSTALL SHELL_PATH SHELL = $(SHELL_PATH) linux_uapi_dir := $(srctree)/tools/include/uapi/linux +asm_generic_uapi_dir := $(srctree)/tools/include/uapi/asm-generic +arch_asm_uapi_dir := $(srctree)/tools/arch/$(SRCARCH)/include/uapi/asm/ beauty_outdir := $(OUTPUT)trace/beauty/generated beauty_ioctl_outdir := $(beauty_outdir)/ioctl @@ -460,6 +463,18 @@ madvise_behavior_tbl := $(srctree)/tools/perf/trace/beauty/madvise_behavior.sh $(madvise_behavior_array): $(madvise_hdr_dir)/mman-common.h $(madvise_behavior_tbl) $(Q)$(SHELL) '$(madvise_behavior_tbl)' $(madvise_hdr_dir) > $@ +mmap_flags_array := $(beauty_outdir)/mmap_flags_array.c +mmap_flags_tbl := $(srctree)/tools/perf/trace/beauty/mmap_flags.sh + +$(mmap_flags_array): $(asm_generic_uapi_dir)/mman.h $(asm_generic_uapi_dir)/mman-common.h $(arch_asm_uapi_dir)/mman.h $(mmap_flags_tbl) + $(Q)$(SHELL) '$(mmap_flags_tbl)' $(asm_generic_uapi_dir) $(arch_asm_uapi_dir) > $@ + +mount_flags_array := $(beauty_outdir)/mount_flags_array.c +mount_flags_tbl := $(srctree)/tools/perf/trace/beauty/mount_flags.sh + +$(mount_flags_array): $(linux_uapi_dir)/fs.h $(mount_flags_tbl) + $(Q)$(SHELL) '$(mount_flags_tbl)' $(linux_uapi_dir) > $@ + prctl_option_array := $(beauty_outdir)/prctl_option_array.c prctl_hdr_dir := $(srctree)/tools/include/uapi/linux/ prctl_option_tbl := $(srctree)/tools/perf/trace/beauty/prctl_option.sh @@ -577,6 +592,8 @@ prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders $(drm_ioc $(socket_ipproto_array) \ $(vhost_virtio_ioctl_array) \ $(madvise_behavior_array) \ + $(mmap_flags_array) \ + $(mount_flags_array) \ $(perf_ioctl_array) \ $(prctl_option_array) \ $(arch_errno_name_array) @@ -635,7 +652,7 @@ $(LIBPERF_IN): prepare FORCE $(LIB_FILE): $(LIBPERF_IN) $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIBPERF_IN) $(LIB_OBJS) -LIBTRACEEVENT_FLAGS += plugin_dir=$(plugindir_SQ) +LIBTRACEEVENT_FLAGS += plugin_dir=$(plugindir_SQ) 'EXTRA_CFLAGS=$(EXTRA_CFLAGS)' 'LDFLAGS=$(LDFLAGS)' $(LIBTRACEEVENT): FORCE $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) $(OUTPUT)libtraceevent.a @@ -779,7 +796,9 @@ endif ifndef NO_LIBBPF $(call QUIET_INSTALL, bpf-headers) \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf'; \ - $(INSTALL) include/bpf/*.h -t '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf' + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf/linux'; \ + $(INSTALL) include/bpf/*.h -t '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf'; \ + $(INSTALL) include/bpf/linux/*.h -t '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf/linux' $(call QUIET_INSTALL, bpf-examples) \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perf_examples_instdir_SQ)/bpf'; \ $(INSTALL) examples/bpf/*.c -t '$(DESTDIR_SQ)$(perf_examples_instdir_SQ)/bpf' @@ -861,6 +880,8 @@ clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clea $(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c \ $(OUTPUT)pmu-events/pmu-events.c \ $(OUTPUT)$(madvise_behavior_array) \ + $(OUTPUT)$(mmap_flags_array) \ + $(OUTPUT)$(mount_flags_array) \ $(OUTPUT)$(drm_ioctl_array) \ $(OUTPUT)$(pkey_alloc_access_rights_array) \ $(OUTPUT)$(sndrv_ctl_ioctl_array) \ diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c index 6688977e4ac7..76c6345a57d5 100644 --- a/tools/perf/arch/arm64/annotate/instructions.c +++ b/tools/perf/arch/arm64/annotate/instructions.c @@ -8,6 +8,63 @@ struct arm64_annotate { jump_insn; }; +static int arm64_mov__parse(struct arch *arch __maybe_unused, + struct ins_operands *ops, + struct map_symbol *ms __maybe_unused) +{ + char *s = strchr(ops->raw, ','), *target, *endptr; + + if (s == NULL) + return -1; + + *s = '\0'; + ops->source.raw = strdup(ops->raw); + *s = ','; + + if (ops->source.raw == NULL) + return -1; + + target = ++s; + ops->target.raw = strdup(target); + if (ops->target.raw == NULL) + goto out_free_source; + + ops->target.addr = strtoull(target, &endptr, 16); + if (endptr == target) + goto out_free_target; + + s = strchr(endptr, '<'); + if (s == NULL) + goto out_free_target; + endptr = strchr(s + 1, '>'); + if (endptr == NULL) + goto out_free_target; + + *endptr = '\0'; + *s = ' '; + ops->target.name = strdup(s); + *s = '<'; + *endptr = '>'; + if (ops->target.name == NULL) + goto out_free_target; + + return 0; + +out_free_target: + zfree(&ops->target.raw); +out_free_source: + zfree(&ops->source.raw); + return -1; +} + +static int mov__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops); + +static struct ins_ops arm64_mov_ops = { + .parse = arm64_mov__parse, + .scnprintf = mov__scnprintf, +}; + static struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const char *name) { struct arm64_annotate *arm = arch->priv; @@ -21,7 +78,7 @@ static struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const else if (!strcmp(name, "ret")) ops = &ret_ops; else - return NULL; + ops = &arm64_mov_ops; arch__associate_ins_ops(arch, name, ops); return ops; diff --git a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl index 2dbb8cade048..c88fd32563eb 100755 --- a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl +++ b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl @@ -23,7 +23,7 @@ create_table_from_c() { local sc nr last_sc - create_table_exe=`mktemp /tmp/create-table-XXXXXX` + create_table_exe=`mktemp ${TMPDIR:-/tmp}/create-table-XXXXXX` { diff --git a/tools/perf/arch/powerpc/util/book3s_hv_exits.h b/tools/perf/arch/powerpc/util/book3s_hv_exits.h index 853b95d1e139..2011376c7ab5 100644 --- a/tools/perf/arch/powerpc/util/book3s_hv_exits.h +++ b/tools/perf/arch/powerpc/util/book3s_hv_exits.h @@ -15,7 +15,6 @@ {0x400, "INST_STORAGE"}, \ {0x480, "INST_SEGMENT"}, \ {0x500, "EXTERNAL"}, \ - {0x501, "EXTERNAL_LEVEL"}, \ {0x502, "EXTERNAL_HV"}, \ {0x600, "ALIGNMENT"}, \ {0x700, "PROGRAM"}, \ diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c index cee4e2f7c057..de0dd66dbb48 100644 --- a/tools/perf/arch/s390/annotate/instructions.c +++ b/tools/perf/arch/s390/annotate/instructions.c @@ -100,8 +100,6 @@ out_free_source: return -1; } -static int mov__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops); static struct ins_ops s390_mov_ops = { .parse = s390_mov__parse, diff --git a/tools/perf/arch/sparc/Makefile b/tools/perf/arch/sparc/Makefile index 7fbca175099e..275dea7ff59a 100644 --- a/tools/perf/arch/sparc/Makefile +++ b/tools/perf/arch/sparc/Makefile @@ -1,3 +1,5 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 endif + +PERF_HAVE_JITDUMP := 1 diff --git a/tools/perf/arch/sparc/annotate/instructions.c b/tools/perf/arch/sparc/annotate/instructions.c new file mode 100644 index 000000000000..2614c010c235 --- /dev/null +++ b/tools/perf/arch/sparc/annotate/instructions.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 + +static int is_branch_cond(const char *cond) +{ + if (cond[0] == '\0') + return 1; + + if (cond[0] == 'a' && cond[1] == '\0') + return 1; + + if (cond[0] == 'c' && + (cond[1] == 'c' || cond[1] == 's') && + cond[2] == '\0') + return 1; + + if (cond[0] == 'e' && + (cond[1] == '\0' || + (cond[1] == 'q' && cond[2] == '\0'))) + return 1; + + if (cond[0] == 'g' && + (cond[1] == '\0' || + (cond[1] == 't' && cond[2] == '\0') || + (cond[1] == 'e' && cond[2] == '\0') || + (cond[1] == 'e' && cond[2] == 'u' && cond[3] == '\0'))) + return 1; + + if (cond[0] == 'l' && + (cond[1] == '\0' || + (cond[1] == 't' && cond[2] == '\0') || + (cond[1] == 'u' && cond[2] == '\0') || + (cond[1] == 'e' && cond[2] == '\0') || + (cond[1] == 'e' && cond[2] == 'u' && cond[3] == '\0'))) + return 1; + + if (cond[0] == 'n' && + (cond[1] == '\0' || + (cond[1] == 'e' && cond[2] == '\0') || + (cond[1] == 'z' && cond[2] == '\0') || + (cond[1] == 'e' && cond[2] == 'g' && cond[3] == '\0'))) + return 1; + + if (cond[0] == 'b' && + cond[1] == 'p' && + cond[2] == 'o' && + cond[3] == 's' && + cond[4] == '\0') + return 1; + + if (cond[0] == 'v' && + (cond[1] == 'c' || cond[1] == 's') && + cond[2] == '\0') + return 1; + + if (cond[0] == 'b' && + cond[1] == 'z' && + cond[2] == '\0') + return 1; + + return 0; +} + +static int is_branch_reg_cond(const char *cond) +{ + if ((cond[0] == 'n' || cond[0] == 'l') && + cond[1] == 'z' && + cond[2] == '\0') + return 1; + + if (cond[0] == 'z' && + cond[1] == '\0') + return 1; + + if ((cond[0] == 'g' || cond[0] == 'l') && + cond[1] == 'e' && + cond[2] == 'z' && + cond[3] == '\0') + return 1; + + if (cond[0] == 'g' && + cond[1] == 'z' && + cond[2] == '\0') + return 1; + + return 0; +} + +static int is_branch_float_cond(const char *cond) +{ + if (cond[0] == '\0') + return 1; + + if ((cond[0] == 'a' || cond[0] == 'e' || + cond[0] == 'z' || cond[0] == 'g' || + cond[0] == 'l' || cond[0] == 'n' || + cond[0] == 'o' || cond[0] == 'u') && + cond[1] == '\0') + return 1; + + if (((cond[0] == 'g' && cond[1] == 'e') || + (cond[0] == 'l' && (cond[1] == 'e' || + cond[1] == 'g')) || + (cond[0] == 'n' && (cond[1] == 'e' || + cond[1] == 'z')) || + (cond[0] == 'u' && (cond[1] == 'e' || + cond[1] == 'g' || + cond[1] == 'l'))) && + cond[2] == '\0') + return 1; + + if (cond[0] == 'u' && + (cond[1] == 'g' || cond[1] == 'l') && + cond[2] == 'e' && + cond[3] == '\0') + return 1; + + return 0; +} + +static struct ins_ops *sparc__associate_instruction_ops(struct arch *arch, const char *name) +{ + struct ins_ops *ops = NULL; + + if (!strcmp(name, "call") || + !strcmp(name, "jmp") || + !strcmp(name, "jmpl")) { + ops = &call_ops; + } else if (!strcmp(name, "ret") || + !strcmp(name, "retl") || + !strcmp(name, "return")) { + ops = &ret_ops; + } else if (!strcmp(name, "mov")) { + ops = &mov_ops; + } else { + if (name[0] == 'c' && + (name[1] == 'w' || name[1] == 'x')) + name += 2; + + if (name[0] == 'b') { + const char *cond = name + 1; + + if (cond[0] == 'r') { + if (is_branch_reg_cond(cond + 1)) + ops = &jump_ops; + } else if (is_branch_cond(cond)) { + ops = &jump_ops; + } + } else if (name[0] == 'f' && name[1] == 'b') { + if (is_branch_float_cond(name + 2)) + ops = &jump_ops; + } + } + + if (ops) + arch__associate_ins_ops(arch, name, ops); + + return ops; +} + +static int sparc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) +{ + if (!arch->initialized) { + arch->initialized = true; + arch->associate_instruction_ops = sparc__associate_instruction_ops; + arch->objdump.comment_char = '#'; + } + + return 0; +} diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 830481b8db26..93d679eaf1f4 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -283,12 +283,11 @@ out_put: return ret; } -static int process_feature_event(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session) +static int process_feature_event(struct perf_session *session, + union perf_event *event) { if (event->feat.feat_id < HEADER_LAST_FEATURE) - return perf_event__process_feature(tool, event, session); + return perf_event__process_feature(session, event); return 0; } diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index a3b346359ba0..eda41673c4f3 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -86,12 +86,10 @@ static int perf_event__drop_oe(struct perf_tool *tool __maybe_unused, } #endif -static int perf_event__repipe_op2_synth(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session - __maybe_unused) +static int perf_event__repipe_op2_synth(struct perf_session *session, + union perf_event *event) { - return perf_event__repipe_synth(tool, event); + return perf_event__repipe_synth(session->tool, event); } static int perf_event__repipe_attr(struct perf_tool *tool, @@ -133,10 +131,10 @@ static int copy_bytes(struct perf_inject *inject, int fd, off_t size) return 0; } -static s64 perf_event__repipe_auxtrace(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session) +static s64 perf_event__repipe_auxtrace(struct perf_session *session, + union perf_event *event) { + struct perf_tool *tool = session->tool; struct perf_inject *inject = container_of(tool, struct perf_inject, tool); int ret; @@ -174,9 +172,8 @@ static s64 perf_event__repipe_auxtrace(struct perf_tool *tool, #else static s64 -perf_event__repipe_auxtrace(struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_session *session __maybe_unused) +perf_event__repipe_auxtrace(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused) { pr_err("AUX area tracing not supported\n"); return -EINVAL; @@ -362,26 +359,24 @@ static int perf_event__repipe_exit(struct perf_tool *tool, return err; } -static int perf_event__repipe_tracing_data(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session) +static int perf_event__repipe_tracing_data(struct perf_session *session, + union perf_event *event) { int err; - perf_event__repipe_synth(tool, event); - err = perf_event__process_tracing_data(tool, event, session); + perf_event__repipe_synth(session->tool, event); + err = perf_event__process_tracing_data(session, event); return err; } -static int perf_event__repipe_id_index(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session) +static int perf_event__repipe_id_index(struct perf_session *session, + union perf_event *event) { int err; - perf_event__repipe_synth(tool, event); - err = perf_event__process_id_index(tool, event, session); + perf_event__repipe_synth(session->tool, event); + err = perf_event__process_id_index(session, event); return err; } @@ -803,7 +798,8 @@ int cmd_inject(int argc, const char **argv) "kallsyms pathname"), OPT_BOOLEAN('f', "force", &data.force, "don't complain, do it"), OPT_CALLBACK_OPTARG(0, "itrace", &inject.itrace_synth_opts, - NULL, "opts", "Instruction Tracing options", + NULL, "opts", "Instruction Tracing options\n" + ITRACE_HELP, itrace_parse_synth_opts), OPT_BOOLEAN(0, "strip", &inject.strip, "strip non-synthesized events (use with --itrace)"), diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 22ebeb92ac51..488779bc4c8d 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -106,9 +106,12 @@ static bool switch_output_time(struct record *rec) trigger_is_ready(&switch_output_trigger); } -static int record__write(struct record *rec, void *bf, size_t size) +static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused, + void *bf, size_t size) { - if (perf_data__write(rec->session->data, bf, size) < 0) { + struct perf_data_file *file = &rec->session->data->file; + + if (perf_data_file__write(file, bf, size) < 0) { pr_err("failed to write perf data, error: %m\n"); return -1; } @@ -127,15 +130,15 @@ static int process_synthesized_event(struct perf_tool *tool, struct machine *machine __maybe_unused) { struct record *rec = container_of(tool, struct record, tool); - return record__write(rec, event, event->header.size); + return record__write(rec, NULL, event, event->header.size); } -static int record__pushfn(void *to, void *bf, size_t size) +static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size) { struct record *rec = to; rec->samples++; - return record__write(rec, bf, size); + return record__write(rec, map, bf, size); } static volatile int done; @@ -170,6 +173,7 @@ static void record__sig_exit(void) #ifdef HAVE_AUXTRACE_SUPPORT static int record__process_auxtrace(struct perf_tool *tool, + struct perf_mmap *map, union perf_event *event, void *data1, size_t len1, void *data2, size_t len2) { @@ -197,21 +201,21 @@ static int record__process_auxtrace(struct perf_tool *tool, if (padding) padding = 8 - padding; - record__write(rec, event, event->header.size); - record__write(rec, data1, len1); + record__write(rec, map, event, event->header.size); + record__write(rec, map, data1, len1); if (len2) - record__write(rec, data2, len2); - record__write(rec, &pad, padding); + record__write(rec, map, data2, len2); + record__write(rec, map, &pad, padding); return 0; } static int record__auxtrace_mmap_read(struct record *rec, - struct auxtrace_mmap *mm) + struct perf_mmap *map) { int ret; - ret = auxtrace_mmap__read(mm, rec->itr, &rec->tool, + ret = auxtrace_mmap__read(map, rec->itr, &rec->tool, record__process_auxtrace); if (ret < 0) return ret; @@ -223,11 +227,11 @@ static int record__auxtrace_mmap_read(struct record *rec, } static int record__auxtrace_mmap_read_snapshot(struct record *rec, - struct auxtrace_mmap *mm) + struct perf_mmap *map) { int ret; - ret = auxtrace_mmap__read_snapshot(mm, rec->itr, &rec->tool, + ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool, record__process_auxtrace, rec->opts.auxtrace_snapshot_size); if (ret < 0) @@ -245,13 +249,12 @@ static int record__auxtrace_read_snapshot_all(struct record *rec) int rc = 0; for (i = 0; i < rec->evlist->nr_mmaps; i++) { - struct auxtrace_mmap *mm = - &rec->evlist->mmap[i].auxtrace_mmap; + struct perf_mmap *map = &rec->evlist->mmap[i]; - if (!mm->base) + if (!map->auxtrace_mmap.base) continue; - if (record__auxtrace_mmap_read_snapshot(rec, mm) != 0) { + if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) { rc = -1; goto out; } @@ -295,7 +298,7 @@ static int record__auxtrace_init(struct record *rec) static inline int record__auxtrace_mmap_read(struct record *rec __maybe_unused, - struct auxtrace_mmap *mm __maybe_unused) + struct perf_mmap *map __maybe_unused) { return 0; } @@ -388,7 +391,12 @@ try_again: ui__warning("%s\n", msg); goto try_again; } - + if ((errno == EINVAL || errno == EBADF) && + pos->leader != pos && + pos->weak_group) { + pos = perf_evlist__reset_weak_group(evlist, pos); + goto try_again; + } rc = -errno; perf_evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg)); @@ -529,17 +537,17 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli return 0; for (i = 0; i < evlist->nr_mmaps; i++) { - struct auxtrace_mmap *mm = &maps[i].auxtrace_mmap; + struct perf_mmap *map = &maps[i]; - if (maps[i].base) { - if (perf_mmap__push(&maps[i], rec, record__pushfn) != 0) { + if (map->base) { + if (perf_mmap__push(map, rec, record__pushfn) != 0) { rc = -1; goto out; } } - if (mm->base && !rec->opts.auxtrace_snapshot_mode && - record__auxtrace_mmap_read(rec, mm) != 0) { + if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode && + record__auxtrace_mmap_read(rec, map) != 0) { rc = -1; goto out; } @@ -550,7 +558,7 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli * at least one event. */ if (bytes_written != rec->bytes_written) - rc = record__write(rec, &finished_round_event, sizeof(finished_round_event)); + rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event)); if (overwrite) perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY); @@ -589,6 +597,9 @@ static void record__init_features(struct record *rec) if (!rec->opts.full_auxtrace) perf_header__clear_feat(&session->header, HEADER_AUXTRACE); + if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns)) + perf_header__clear_feat(&session->header, HEADER_CLOCKID); + perf_header__clear_feat(&session->header, HEADER_STAT); } @@ -758,7 +769,7 @@ static int record__synthesize(struct record *rec, bool tail) * We need to synthesize events first, because some * features works on top of them (on report side). */ - err = perf_event__synthesize_attrs(tool, session, + err = perf_event__synthesize_attrs(tool, rec->evlist, process_synthesized_event); if (err < 0) { pr_err("Couldn't synthesize attrs.\n"); @@ -894,6 +905,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) record__init_features(rec); + if (rec->opts.use_clockid && rec->opts.clockid_res_ns) + session->header.env.clockid_res_ns = rec->opts.clockid_res_ns; + if (forks) { err = perf_evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe, @@ -1334,6 +1348,19 @@ static const struct clockid_map clockids[] = { CLOCKID_END, }; +static int get_clockid_res(clockid_t clk_id, u64 *res_ns) +{ + struct timespec res; + + *res_ns = 0; + if (!clock_getres(clk_id, &res)) + *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC; + else + pr_warning("WARNING: Failed to determine specified clock resolution.\n"); + + return 0; +} + static int parse_clockid(const struct option *opt, const char *str, int unset) { struct record_opts *opts = (struct record_opts *)opt->value; @@ -1357,7 +1384,7 @@ static int parse_clockid(const struct option *opt, const char *str, int unset) /* if its a number, we're done */ if (sscanf(str, "%d", &opts->clockid) == 1) - return 0; + return get_clockid_res(opts->clockid, &opts->clockid_res_ns); /* allow a "CLOCK_" prefix to the name */ if (!strncasecmp(str, "CLOCK_", 6)) @@ -1366,7 +1393,8 @@ static int parse_clockid(const struct option *opt, const char *str, int unset) for (cm = clockids; cm->name; cm++) { if (!strcasecmp(str, cm->name)) { opts->clockid = cm->clockid; - return 0; + return get_clockid_res(opts->clockid, + &opts->clockid_res_ns); } } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 76e12bcd1765..257c9c18cb7e 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -201,14 +201,13 @@ static void setup_forced_leader(struct report *report, perf_evlist__force_leader(evlist); } -static int process_feature_event(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session __maybe_unused) +static int process_feature_event(struct perf_session *session, + union perf_event *event) { - struct report *rep = container_of(tool, struct report, tool); + struct report *rep = container_of(session->tool, struct report, tool); if (event->feat.feat_id < HEADER_LAST_FEATURE) - return perf_event__process_feature(tool, event, session); + return perf_event__process_feature(session, event); if (event->feat.feat_id != HEADER_LAST_FEATURE) { pr_err("failed: wrong feature ID: %" PRIu64 "\n", @@ -981,6 +980,7 @@ int cmd_report(int argc, const char **argv) .id_index = perf_event__process_id_index, .auxtrace_info = perf_event__process_auxtrace_info, .auxtrace = perf_event__process_auxtrace, + .event_update = perf_event__process_event_update, .feature = process_feature_event, .ordered_events = true, .ordering_requires_timestamps = true, @@ -1105,7 +1105,7 @@ int cmd_report(int argc, const char **argv) OPT_CALLBACK(0, "percentage", NULL, "relative|absolute", "how to display percentage of filtered entries", parse_filter_percentage), OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts", - "Instruction Tracing options", + "Instruction Tracing options\n" ITRACE_HELP, itrace_parse_synth_opts), OPT_BOOLEAN(0, "full-source-path", &srcline_full_filename, "Show full source file name path for source lines"), diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index ba481d73f910..b5bc85bd0bbe 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -44,6 +44,7 @@ #include #include #include +#include #include "sane_ctype.h" @@ -406,9 +407,10 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel, PERF_OUTPUT_WEIGHT)) return -EINVAL; - if (PRINT_FIELD(SYM) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) { + if (PRINT_FIELD(SYM) && + !(evsel->attr.sample_type & (PERF_SAMPLE_IP|PERF_SAMPLE_ADDR))) { pr_err("Display of symbols requested but neither sample IP nor " - "sample address\nis selected. Hence, no addresses to convert " + "sample address\navailable. Hence, no addresses to convert " "to symbols.\n"); return -EINVAL; } @@ -417,10 +419,9 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel, "selected.\n"); return -EINVAL; } - if (PRINT_FIELD(DSO) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR) && - !PRINT_FIELD(BRSTACK) && !PRINT_FIELD(BRSTACKSYM) && !PRINT_FIELD(BRSTACKOFF)) { - pr_err("Display of DSO requested but no address to convert. Select\n" - "sample IP, sample address, brstack, brstacksym, or brstackoff.\n"); + if (PRINT_FIELD(DSO) && + !(evsel->attr.sample_type & (PERF_SAMPLE_IP|PERF_SAMPLE_ADDR))) { + pr_err("Display of DSO requested but no address to convert.\n"); return -EINVAL; } if (PRINT_FIELD(SRCLINE) && !PRINT_FIELD(IP)) { @@ -912,7 +913,7 @@ static int grab_bb(u8 *buffer, u64 start, u64 end, static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en, struct perf_insn *x, u8 *inbuf, int len, - int insn, FILE *fp) + int insn, FILE *fp, int *total_cycles) { int printed = fprintf(fp, "\t%016" PRIx64 "\t%-30s\t#%s%s%s%s", ip, dump_insn(x, ip, inbuf, len, NULL), @@ -921,7 +922,8 @@ static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en, en->flags.in_tx ? " INTX" : "", en->flags.abort ? " ABORT" : ""); if (en->flags.cycles) { - printed += fprintf(fp, " %d cycles", en->flags.cycles); + *total_cycles += en->flags.cycles; + printed += fprintf(fp, " %d cycles [%d]", en->flags.cycles, *total_cycles); if (insn) printed += fprintf(fp, " %.2f IPC", (float)insn / en->flags.cycles); } @@ -978,6 +980,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, u8 buffer[MAXBB]; unsigned off; struct symbol *lastsym = NULL; + int total_cycles = 0; if (!(br && br->nr)) return 0; @@ -998,7 +1001,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, printed += ip__fprintf_sym(br->entries[nr - 1].from, thread, x.cpumode, x.cpu, &lastsym, attr, fp); printed += ip__fprintf_jump(br->entries[nr - 1].from, &br->entries[nr - 1], - &x, buffer, len, 0, fp); + &x, buffer, len, 0, fp, &total_cycles); } /* Print all blocks */ @@ -1026,7 +1029,8 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, printed += ip__fprintf_sym(ip, thread, x.cpumode, x.cpu, &lastsym, attr, fp); if (ip == end) { - printed += ip__fprintf_jump(ip, &br->entries[i], &x, buffer + off, len - off, insn, fp); + printed += ip__fprintf_jump(ip, &br->entries[i], &x, buffer + off, len - off, insn, fp, + &total_cycles); break; } else { printed += fprintf(fp, "\t%016" PRIx64 "\t%s\n", ip, @@ -1104,6 +1108,35 @@ out: return printed; } +static const char *resolve_branch_sym(struct perf_sample *sample, + struct perf_evsel *evsel, + struct thread *thread, + struct addr_location *al, + u64 *ip) +{ + struct addr_location addr_al; + struct perf_event_attr *attr = &evsel->attr; + const char *name = NULL; + + if (sample->flags & (PERF_IP_FLAG_CALL | PERF_IP_FLAG_TRACE_BEGIN)) { + if (sample_addr_correlates_sym(attr)) { + thread__resolve(thread, &addr_al, sample); + if (addr_al.sym) + name = addr_al.sym->name; + else + *ip = sample->addr; + } else { + *ip = sample->addr; + } + } else if (sample->flags & (PERF_IP_FLAG_RETURN | PERF_IP_FLAG_TRACE_END)) { + if (al->sym) + name = al->sym->name; + else + *ip = sample->ip; + } + return name; +} + static int perf_sample__fprintf_callindent(struct perf_sample *sample, struct perf_evsel *evsel, struct thread *thread, @@ -1111,10 +1144,10 @@ static int perf_sample__fprintf_callindent(struct perf_sample *sample, { struct perf_event_attr *attr = &evsel->attr; size_t depth = thread_stack__depth(thread); - struct addr_location addr_al; const char *name = NULL; static int spacing; int len = 0; + int dlen = 0; u64 ip = 0; /* @@ -1124,21 +1157,12 @@ static int perf_sample__fprintf_callindent(struct perf_sample *sample, if (thread->ts && sample->flags & PERF_IP_FLAG_RETURN) depth += 1; - if (sample->flags & (PERF_IP_FLAG_CALL | PERF_IP_FLAG_TRACE_BEGIN)) { - if (sample_addr_correlates_sym(attr)) { - thread__resolve(thread, &addr_al, sample); - if (addr_al.sym) - name = addr_al.sym->name; - else - ip = sample->addr; - } else { - ip = sample->addr; - } - } else if (sample->flags & (PERF_IP_FLAG_RETURN | PERF_IP_FLAG_TRACE_END)) { - if (al->sym) - name = al->sym->name; - else - ip = sample->ip; + name = resolve_branch_sym(sample, evsel, thread, al, &ip); + + if (PRINT_FIELD(DSO) && !(PRINT_FIELD(IP) || PRINT_FIELD(ADDR))) { + dlen += fprintf(fp, "("); + dlen += map__fprintf_dsoname(al->map, fp); + dlen += fprintf(fp, ")\t"); } if (name) @@ -1159,7 +1183,7 @@ static int perf_sample__fprintf_callindent(struct perf_sample *sample, if (len < spacing) len += fprintf(fp, "%*s", spacing - len, ""); - return len; + return len + dlen; } static int perf_sample__fprintf_insn(struct perf_sample *sample, @@ -1255,6 +1279,18 @@ static struct { {0, NULL} }; +static const char *sample_flags_to_name(u32 flags) +{ + int i; + + for (i = 0; sample_flags[i].name ; i++) { + if (sample_flags[i].flags == flags) + return sample_flags[i].name; + } + + return NULL; +} + static int perf_sample__fprintf_flags(u32 flags, FILE *fp) { const char *chars = PERF_IP_FLAG_CHARS; @@ -1264,11 +1300,20 @@ static int perf_sample__fprintf_flags(u32 flags, FILE *fp) char str[33]; int i, pos = 0; - for (i = 0; sample_flags[i].name ; i++) { - if (sample_flags[i].flags == (flags & ~PERF_IP_FLAG_IN_TX)) { - name = sample_flags[i].name; - break; - } + name = sample_flags_to_name(flags & ~PERF_IP_FLAG_IN_TX); + if (name) + return fprintf(fp, " %-15s%4s ", name, in_tx ? "(x)" : ""); + + if (flags & PERF_IP_FLAG_TRACE_BEGIN) { + name = sample_flags_to_name(flags & ~(PERF_IP_FLAG_IN_TX | PERF_IP_FLAG_TRACE_BEGIN)); + if (name) + return fprintf(fp, " tr strt %-7s%4s ", name, in_tx ? "(x)" : ""); + } + + if (flags & PERF_IP_FLAG_TRACE_END) { + name = sample_flags_to_name(flags & ~(PERF_IP_FLAG_IN_TX | PERF_IP_FLAG_TRACE_END)); + if (name) + return fprintf(fp, " tr end %-7s%4s ", name, in_tx ? "(x)" : ""); } for (i = 0; i < n; i++, flags >>= 1) { @@ -1281,10 +1326,7 @@ static int perf_sample__fprintf_flags(u32 flags, FILE *fp) } str[pos] = 0; - if (name) - return fprintf(fp, " %-7s%4s ", name, in_tx ? "(x)" : ""); - - return fprintf(fp, " %-11s ", str); + return fprintf(fp, " %-19s ", str); } struct printer_data { @@ -1544,7 +1586,8 @@ struct metric_ctx { FILE *fp; }; -static void script_print_metric(void *ctx, const char *color, +static void script_print_metric(struct perf_stat_config *config __maybe_unused, + void *ctx, const char *color, const char *fmt, const char *unit, double val) { @@ -1562,7 +1605,8 @@ static void script_print_metric(void *ctx, const char *color, fprintf(mctx->fp, " %s\n", unit); } -static void script_new_line(void *ctx) +static void script_new_line(struct perf_stat_config *config __maybe_unused, + void *ctx) { struct metric_ctx *mctx = ctx; @@ -1608,7 +1652,7 @@ static void perf_sample__fprint_metric(struct perf_script *script, evsel_script(evsel)->val = val; if (evsel_script(evsel->leader)->gnum == evsel->leader->nr_members) { for_each_group_member (ev2, evsel->leader) { - perf_stat__print_shadow_stats(ev2, + perf_stat__print_shadow_stats(&stat_config, ev2, evsel_script(ev2)->val, sample->cpu, &ctx, @@ -1619,6 +1663,47 @@ static void perf_sample__fprint_metric(struct perf_script *script, } } +static bool show_event(struct perf_sample *sample, + struct perf_evsel *evsel, + struct thread *thread, + struct addr_location *al) +{ + int depth = thread_stack__depth(thread); + + if (!symbol_conf.graph_function) + return true; + + if (thread->filter) { + if (depth <= thread->filter_entry_depth) { + thread->filter = false; + return false; + } + return true; + } else { + const char *s = symbol_conf.graph_function; + u64 ip; + const char *name = resolve_branch_sym(sample, evsel, thread, al, + &ip); + unsigned nlen; + + if (!name) + return false; + nlen = strlen(name); + while (*s) { + unsigned len = strcspn(s, ","); + if (nlen == len && !strncmp(name, s, len)) { + thread->filter = true; + thread->filter_entry_depth = depth; + return true; + } + s += len; + if (*s == ',') + s++; + } + return false; + } +} + static void process_event(struct perf_script *script, struct perf_sample *sample, struct perf_evsel *evsel, struct addr_location *al, @@ -1633,6 +1718,9 @@ static void process_event(struct perf_script *script, if (output[type].fields == 0) return; + if (!show_event(sample, evsel, thread, al)) + return; + ++es->samples; perf_sample__fprintf_start(sample, thread, evsel, @@ -1710,6 +1798,9 @@ static void process_event(struct perf_script *script, if (PRINT_FIELD(METRIC)) perf_sample__fprint_metric(script, thread, evsel, sample, fp); + + if (verbose) + fflush(fp); } static struct scripting_ops *scripting_ops; @@ -2489,6 +2580,8 @@ parse: output[j].fields &= ~all_output_options[i].field; else output[j].fields |= all_output_options[i].field; + output[j].user_set = true; + output[j].wildcard_set = true; } } } else { @@ -2499,7 +2592,8 @@ parse: rc = -EINVAL; goto out; } - output[type].fields |= all_output_options[i].field; + output[type].user_set = true; + output[type].wildcard_set = true; } } @@ -2963,9 +3057,8 @@ static void script__setup_sample_type(struct perf_script *script) } } -static int process_stat_round_event(struct perf_tool *tool __maybe_unused, - union perf_event *event, - struct perf_session *session) +static int process_stat_round_event(struct perf_session *session, + union perf_event *event) { struct stat_round_event *round = &event->stat_round; struct perf_evsel *counter; @@ -2979,9 +3072,8 @@ static int process_stat_round_event(struct perf_tool *tool __maybe_unused, return 0; } -static int process_stat_config_event(struct perf_tool *tool __maybe_unused, - union perf_event *event, - struct perf_session *session __maybe_unused) +static int process_stat_config_event(struct perf_session *session __maybe_unused, + union perf_event *event) { perf_event__read_stat_config(&stat_config, &event->stat_config); return 0; @@ -3007,10 +3099,10 @@ static int set_maps(struct perf_script *script) } static -int process_thread_map_event(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session __maybe_unused) +int process_thread_map_event(struct perf_session *session, + union perf_event *event) { + struct perf_tool *tool = session->tool; struct perf_script *script = container_of(tool, struct perf_script, tool); if (script->threads) { @@ -3026,10 +3118,10 @@ int process_thread_map_event(struct perf_tool *tool, } static -int process_cpu_map_event(struct perf_tool *tool __maybe_unused, - union perf_event *event, - struct perf_session *session __maybe_unused) +int process_cpu_map_event(struct perf_session *session, + union perf_event *event) { + struct perf_tool *tool = session->tool; struct perf_script *script = container_of(tool, struct perf_script, tool); if (script->cpus) { @@ -3044,21 +3136,21 @@ int process_cpu_map_event(struct perf_tool *tool __maybe_unused, return set_maps(script); } -static int process_feature_event(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session) +static int process_feature_event(struct perf_session *session, + union perf_event *event) { if (event->feat.feat_id < HEADER_LAST_FEATURE) - return perf_event__process_feature(tool, event, session); + return perf_event__process_feature(session, event); return 0; } #ifdef HAVE_AUXTRACE_SUPPORT -static int perf_script__process_auxtrace_info(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session) +static int perf_script__process_auxtrace_info(struct perf_session *session, + union perf_event *event) { - int ret = perf_event__process_auxtrace_info(tool, event, session); + struct perf_tool *tool = session->tool; + + int ret = perf_event__process_auxtrace_info(session, event); if (ret == 0) { struct perf_script *script = container_of(tool, struct perf_script, tool); @@ -3072,6 +3164,44 @@ static int perf_script__process_auxtrace_info(struct perf_tool *tool, #define perf_script__process_auxtrace_info 0 #endif +static int parse_insn_trace(const struct option *opt __maybe_unused, + const char *str __maybe_unused, + int unset __maybe_unused) +{ + parse_output_fields(NULL, "+insn,-event,-period", 0); + itrace_parse_synth_opts(opt, "i0ns", 0); + nanosecs = true; + return 0; +} + +static int parse_xed(const struct option *opt __maybe_unused, + const char *str __maybe_unused, + int unset __maybe_unused) +{ + force_pager("xed -F insn: -A -64 | less"); + return 0; +} + +static int parse_call_trace(const struct option *opt __maybe_unused, + const char *str __maybe_unused, + int unset __maybe_unused) +{ + parse_output_fields(NULL, "-ip,-addr,-event,-period,+callindent", 0); + itrace_parse_synth_opts(opt, "cewp", 0); + nanosecs = true; + return 0; +} + +static int parse_callret_trace(const struct option *opt __maybe_unused, + const char *str __maybe_unused, + int unset __maybe_unused) +{ + parse_output_fields(NULL, "-ip,-addr,-event,-period,+callindent,+flags", 0); + itrace_parse_synth_opts(opt, "crewp", 0); + nanosecs = true; + return 0; +} + int cmd_script(int argc, const char **argv) { bool show_full_info = false; @@ -3081,7 +3211,10 @@ int cmd_script(int argc, const char **argv) char *rec_script_path = NULL; char *rep_script_path = NULL; struct perf_session *session; - struct itrace_synth_opts itrace_synth_opts = { .set = false, }; + struct itrace_synth_opts itrace_synth_opts = { + .set = false, + .default_no_sample = true, + }; char *script_path = NULL; const char **__argv; int i, j, err = 0; @@ -3156,6 +3289,16 @@ int cmd_script(int argc, const char **argv) "system-wide collection from all CPUs"), OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]", "only consider these symbols"), + OPT_CALLBACK_OPTARG(0, "insn-trace", &itrace_synth_opts, NULL, NULL, + "Decode instructions from itrace", parse_insn_trace), + OPT_CALLBACK_OPTARG(0, "xed", NULL, NULL, NULL, + "Run xed disassembler on output", parse_xed), + OPT_CALLBACK_OPTARG(0, "call-trace", &itrace_synth_opts, NULL, NULL, + "Decode calls from from itrace", parse_call_trace), + OPT_CALLBACK_OPTARG(0, "call-ret-trace", &itrace_synth_opts, NULL, NULL, + "Decode calls and returns from itrace", parse_callret_trace), + OPT_STRING(0, "graph-function", &symbol_conf.graph_function, "symbol[,symbol...]", + "Only print symbols and callees with --call-trace/--call-ret-trace"), OPT_STRING(0, "stop-bt", &symbol_conf.bt_stop_list_str, "symbol[,symbol...]", "Stop display of callgraph at these symbols"), OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"), @@ -3193,7 +3336,7 @@ int cmd_script(int argc, const char **argv) OPT_BOOLEAN(0, "ns", &nanosecs, "Use 9 decimal places when displaying time"), OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts", - "Instruction Tracing options", + "Instruction Tracing options\n" ITRACE_HELP, itrace_parse_synth_opts), OPT_BOOLEAN(0, "full-source-path", &srcline_full_filename, "Show full source file name path for source lines"), @@ -3389,8 +3532,10 @@ int cmd_script(int argc, const char **argv) exit(-1); } - if (!script_name) + if (!script_name) { setup_pager(); + use_browser = 0; + } session = perf_session__new(&data, false, &script.tool); if (session == NULL) @@ -3411,7 +3556,8 @@ int cmd_script(int argc, const char **argv) script.session = session; script__setup_sample_type(&script); - if (output[PERF_TYPE_HARDWARE].fields & PERF_OUTPUT_CALLINDENT) + if ((output[PERF_TYPE_HARDWARE].fields & PERF_OUTPUT_CALLINDENT) || + symbol_conf.graph_function) itrace_synth_opts.thread_stack = true; session->itrace_synth_opts = &itrace_synth_opts; diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index d097b5b47eb8..a635abfa77b6 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -88,8 +88,6 @@ #include "sane_ctype.h" #define DEFAULT_SEPARATOR " " -#define CNTR_NOT_SUPPORTED "" -#define CNTR_NOT_COUNTED "" #define FREEZE_ON_SMI_PATH "devices/cpu/freeze_on_smi" static void print_counters(struct timespec *ts, int argc, const char **argv); @@ -137,54 +135,30 @@ static const char *smi_cost_attrs = { static struct perf_evlist *evsel_list; -static struct rblist metric_events; - static struct target target = { .uid = UINT_MAX, }; -typedef int (*aggr_get_id_t)(struct cpu_map *m, int cpu); - #define METRIC_ONLY_LEN 20 -static int run_count = 1; -static bool no_inherit = false; static volatile pid_t child_pid = -1; -static bool null_run = false; static int detailed_run = 0; static bool transaction_run; static bool topdown_run = false; static bool smi_cost = false; static bool smi_reset = false; -static bool big_num = true; static int big_num_opt = -1; -static const char *csv_sep = NULL; -static bool csv_output = false; static bool group = false; static const char *pre_cmd = NULL; static const char *post_cmd = NULL; static bool sync_run = false; -static unsigned int initial_delay = 0; -static unsigned int unit_width = 4; /* strlen("unit") */ static bool forever = false; -static bool metric_only = false; static bool force_metric_only = false; -static bool no_merge = false; -static bool walltime_run_table = false; static struct timespec ref_time; -static struct cpu_map *aggr_map; -static aggr_get_id_t aggr_get_id; static bool append_file; static bool interval_count; -static bool interval_clear; static const char *output_name; static int output_fd; -static int print_free_counters_hint; -static int print_mixed_hw_group_error; -static u64 *walltime_run; -static bool ru_display = false; -static struct rusage ru_data; -static unsigned int metric_only_len = METRIC_ONLY_LEN; struct perf_stat { bool record; @@ -204,15 +178,15 @@ static struct perf_stat perf_stat; static volatile int done = 0; static struct perf_stat_config stat_config = { - .aggr_mode = AGGR_GLOBAL, - .scale = true, + .aggr_mode = AGGR_GLOBAL, + .scale = true, + .unit_width = 4, /* strlen("unit") */ + .run_count = 1, + .metric_only_len = METRIC_ONLY_LEN, + .walltime_nsecs_stats = &walltime_nsecs_stats, + .big_num = true, }; -static bool is_duration_time(struct perf_evsel *evsel) -{ - return !strcmp(evsel->name, "duration_time"); -} - static inline void diff_timespec(struct timespec *r, struct timespec *a, struct timespec *b) { @@ -236,66 +210,6 @@ static void perf_stat__reset_stats(void) perf_stat__reset_shadow_per_stat(&stat_config.stats[i]); } -static int create_perf_stat_counter(struct perf_evsel *evsel) -{ - struct perf_event_attr *attr = &evsel->attr; - struct perf_evsel *leader = evsel->leader; - - if (stat_config.scale) { - attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | - PERF_FORMAT_TOTAL_TIME_RUNNING; - } - - /* - * The event is part of non trivial group, let's enable - * the group read (for leader) and ID retrieval for all - * members. - */ - if (leader->nr_members > 1) - attr->read_format |= PERF_FORMAT_ID|PERF_FORMAT_GROUP; - - attr->inherit = !no_inherit; - - /* - * Some events get initialized with sample_(period/type) set, - * like tracepoints. Clear it up for counting. - */ - attr->sample_period = 0; - - /* - * But set sample_type to PERF_SAMPLE_IDENTIFIER, which should be harmless - * while avoiding that older tools show confusing messages. - * - * However for pipe sessions we need to keep it zero, - * because script's perf_evsel__check_attr is triggered - * by attr->sample_type != 0, and we can't run it on - * stat sessions. - */ - if (!(STAT_RECORD && perf_stat.data.is_pipe)) - attr->sample_type = PERF_SAMPLE_IDENTIFIER; - - /* - * Disabling all counters initially, they will be enabled - * either manually by us or by kernel via enable_on_exec - * set later. - */ - if (perf_evsel__is_group_leader(evsel)) { - attr->disabled = 1; - - /* - * In case of initial_delay we enable tracee - * events manually. - */ - if (target__none(&target) && !initial_delay) - attr->enable_on_exec = 1; - } - - if (target__has_cpu(&target) && !target__has_per_thread(&target)) - return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)); - - return perf_evsel__open_per_thread(evsel, evsel_list->threads); -} - static int process_synthesized_event(struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, @@ -428,15 +342,15 @@ static void process_interval(void) static void enable_counters(void) { - if (initial_delay) - usleep(initial_delay * USEC_PER_MSEC); + if (stat_config.initial_delay) + usleep(stat_config.initial_delay * USEC_PER_MSEC); /* * We need to enable counters only if: * - we don't have tracee (attaching to task or cpu) * - we have initial delay configured */ - if (!target__none(&target) || initial_delay) + if (!target__none(&target) || stat_config.initial_delay) perf_evlist__enable(evsel_list); } @@ -464,109 +378,31 @@ static void workload_exec_failed_signal(int signo __maybe_unused, siginfo_t *inf workload_exec_errno = info->si_value.sival_int; } -static int perf_stat_synthesize_config(bool is_pipe) -{ - int err; - - if (is_pipe) { - err = perf_event__synthesize_attrs(NULL, perf_stat.session, - process_synthesized_event); - if (err < 0) { - pr_err("Couldn't synthesize attrs.\n"); - return err; - } - } - - err = perf_event__synthesize_extra_attr(NULL, - evsel_list, - process_synthesized_event, - is_pipe); - - err = perf_event__synthesize_thread_map2(NULL, evsel_list->threads, - process_synthesized_event, - NULL); - if (err < 0) { - pr_err("Couldn't synthesize thread map.\n"); - return err; - } - - err = perf_event__synthesize_cpu_map(NULL, evsel_list->cpus, - process_synthesized_event, NULL); - if (err < 0) { - pr_err("Couldn't synthesize thread map.\n"); - return err; - } - - err = perf_event__synthesize_stat_config(NULL, &stat_config, - process_synthesized_event, NULL); - if (err < 0) { - pr_err("Couldn't synthesize config.\n"); - return err; - } - - return 0; -} - -#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) - -static int __store_counter_ids(struct perf_evsel *counter) -{ - int cpu, thread; - - for (cpu = 0; cpu < xyarray__max_x(counter->fd); cpu++) { - for (thread = 0; thread < xyarray__max_y(counter->fd); - thread++) { - int fd = FD(counter, cpu, thread); - - if (perf_evlist__id_add_fd(evsel_list, counter, - cpu, thread, fd) < 0) - return -1; - } - } - - return 0; -} - -static int store_counter_ids(struct perf_evsel *counter) -{ - struct cpu_map *cpus = counter->cpus; - struct thread_map *threads = counter->threads; - - if (perf_evsel__alloc_id(counter, cpus->nr, threads->nr)) - return -ENOMEM; - - return __store_counter_ids(counter); -} - static bool perf_evsel__should_store_id(struct perf_evsel *counter) { return STAT_RECORD || counter->attr.read_format & PERF_FORMAT_ID; } -static struct perf_evsel *perf_evsel__reset_weak_group(struct perf_evsel *evsel) +static bool is_target_alive(struct target *_target, + struct thread_map *threads) { - struct perf_evsel *c2, *leader; - bool is_open = true; + struct stat st; + int i; - leader = evsel->leader; - pr_debug("Weak group for %s/%d failed\n", - leader->name, leader->nr_members); + if (!target__has_task(_target)) + return true; - /* - * for_each_group_member doesn't work here because it doesn't - * include the first entry. - */ - evlist__for_each_entry(evsel_list, c2) { - if (c2 == evsel) - is_open = false; - if (c2->leader == leader) { - if (is_open) - perf_evsel__close(c2); - c2->leader = c2; - c2->nr_members = 0; - } + for (i = 0; i < threads->nr; i++) { + char path[PATH_MAX]; + + scnprintf(path, PATH_MAX, "%s/%d", procfs__mountpoint(), + threads->map[i].pid); + + if (!stat(path, &st)) + return true; } - return leader; + + return false; } static int __run_perf_stat(int argc, const char **argv, int run_idx) @@ -609,13 +445,13 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) evlist__for_each_entry(evsel_list, counter) { try_again: - if (create_perf_stat_counter(counter) < 0) { + if (create_perf_stat_counter(counter, &stat_config, &target) < 0) { /* Weak group failed. Reset the group. */ if ((errno == EINVAL || errno == EBADF) && counter->leader != counter && counter->weak_group) { - counter = perf_evsel__reset_weak_group(counter); + counter = perf_evlist__reset_weak_group(evsel_list, counter); goto try_again; } @@ -664,11 +500,11 @@ try_again: counter->supported = true; l = strlen(counter->unit); - if (l > unit_width) - unit_width = l; + if (l > stat_config.unit_width) + stat_config.unit_width = l; if (perf_evsel__should_store_id(counter) && - store_counter_ids(counter)) + perf_evsel__store_ids(counter, evsel_list)) return -1; } @@ -699,7 +535,8 @@ try_again: if (err < 0) return err; - err = perf_stat_synthesize_config(is_pipe); + err = perf_stat_synthesize_config(&stat_config, NULL, evsel_list, + process_synthesized_event, is_pipe); if (err < 0) return err; } @@ -724,7 +561,7 @@ try_again: break; } } - wait4(child_pid, &status, 0, &ru_data); + wait4(child_pid, &status, 0, &stat_config.ru_data); if (workload_exec_errno) { const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg)); @@ -738,6 +575,8 @@ try_again: enable_counters(); while (!done) { nanosleep(&ts, NULL); + if (!is_target_alive(&target, evsel_list->threads)) + break; if (timeout) break; if (interval) { @@ -752,8 +591,8 @@ try_again: t1 = rdclock(); - if (walltime_run_table) - walltime_run[run_idx] = t1 - t0; + if (stat_config.walltime_run_table) + stat_config.walltime_run[run_idx] = t1 - t0; update_stats(&walltime_nsecs_stats, t1 - t0); @@ -795,1105 +634,14 @@ static int run_perf_stat(int argc, const char **argv, int run_idx) return ret; } -static void print_running(u64 run, u64 ena) -{ - if (csv_output) { - fprintf(stat_config.output, "%s%" PRIu64 "%s%.2f", - csv_sep, - run, - csv_sep, - ena ? 100.0 * run / ena : 100.0); - } else if (run != ena) { - fprintf(stat_config.output, " (%.2f%%)", 100.0 * run / ena); - } -} - -static void print_noise_pct(double total, double avg) -{ - double pct = rel_stddev_stats(total, avg); - - if (csv_output) - fprintf(stat_config.output, "%s%.2f%%", csv_sep, pct); - else if (pct) - fprintf(stat_config.output, " ( +-%6.2f%% )", pct); -} - -static void print_noise(struct perf_evsel *evsel, double avg) -{ - struct perf_stat_evsel *ps; - - if (run_count == 1) - return; - - ps = evsel->stats; - print_noise_pct(stddev_stats(&ps->res_stats[0]), avg); -} - -static void aggr_printout(struct perf_evsel *evsel, int id, int nr) -{ - switch (stat_config.aggr_mode) { - case AGGR_CORE: - fprintf(stat_config.output, "S%d-C%*d%s%*d%s", - cpu_map__id_to_socket(id), - csv_output ? 0 : -8, - cpu_map__id_to_cpu(id), - csv_sep, - csv_output ? 0 : 4, - nr, - csv_sep); - break; - case AGGR_SOCKET: - fprintf(stat_config.output, "S%*d%s%*d%s", - csv_output ? 0 : -5, - id, - csv_sep, - csv_output ? 0 : 4, - nr, - csv_sep); - break; - case AGGR_NONE: - fprintf(stat_config.output, "CPU%*d%s", - csv_output ? 0 : -4, - perf_evsel__cpus(evsel)->map[id], csv_sep); - break; - case AGGR_THREAD: - fprintf(stat_config.output, "%*s-%*d%s", - csv_output ? 0 : 16, - thread_map__comm(evsel->threads, id), - csv_output ? 0 : -8, - thread_map__pid(evsel->threads, id), - csv_sep); - break; - case AGGR_GLOBAL: - case AGGR_UNSET: - default: - break; - } -} - -struct outstate { - FILE *fh; - bool newline; - const char *prefix; - int nfields; - int id, nr; - struct perf_evsel *evsel; -}; - -#define METRIC_LEN 35 - -static void new_line_std(void *ctx) -{ - struct outstate *os = ctx; - - os->newline = true; -} - -static void do_new_line_std(struct outstate *os) -{ - fputc('\n', os->fh); - fputs(os->prefix, os->fh); - aggr_printout(os->evsel, os->id, os->nr); - if (stat_config.aggr_mode == AGGR_NONE) - fprintf(os->fh, " "); - fprintf(os->fh, " "); -} - -static void print_metric_std(void *ctx, const char *color, const char *fmt, - const char *unit, double val) -{ - struct outstate *os = ctx; - FILE *out = os->fh; - int n; - bool newline = os->newline; - - os->newline = false; - - if (unit == NULL || fmt == NULL) { - fprintf(out, "%-*s", METRIC_LEN, ""); - return; - } - - if (newline) - do_new_line_std(os); - - n = fprintf(out, " # "); - if (color) - n += color_fprintf(out, color, fmt, val); - else - n += fprintf(out, fmt, val); - fprintf(out, " %-*s", METRIC_LEN - n - 1, unit); -} - -static void new_line_csv(void *ctx) -{ - struct outstate *os = ctx; - int i; - - fputc('\n', os->fh); - if (os->prefix) - fprintf(os->fh, "%s%s", os->prefix, csv_sep); - aggr_printout(os->evsel, os->id, os->nr); - for (i = 0; i < os->nfields; i++) - fputs(csv_sep, os->fh); -} - -static void print_metric_csv(void *ctx, - const char *color __maybe_unused, - const char *fmt, const char *unit, double val) -{ - struct outstate *os = ctx; - FILE *out = os->fh; - char buf[64], *vals, *ends; - - if (unit == NULL || fmt == NULL) { - fprintf(out, "%s%s", csv_sep, csv_sep); - return; - } - snprintf(buf, sizeof(buf), fmt, val); - ends = vals = ltrim(buf); - while (isdigit(*ends) || *ends == '.') - ends++; - *ends = 0; - while (isspace(*unit)) - unit++; - fprintf(out, "%s%s%s%s", csv_sep, vals, csv_sep, unit); -} - -/* Filter out some columns that don't work well in metrics only mode */ - -static bool valid_only_metric(const char *unit) -{ - if (!unit) - return false; - if (strstr(unit, "/sec") || - strstr(unit, "hz") || - strstr(unit, "Hz") || - strstr(unit, "CPUs utilized")) - return false; - return true; -} - -static const char *fixunit(char *buf, struct perf_evsel *evsel, - const char *unit) -{ - if (!strncmp(unit, "of all", 6)) { - snprintf(buf, 1024, "%s %s", perf_evsel__name(evsel), - unit); - return buf; - } - return unit; -} - -static void print_metric_only(void *ctx, const char *color, const char *fmt, - const char *unit, double val) -{ - struct outstate *os = ctx; - FILE *out = os->fh; - char buf[1024], str[1024]; - unsigned mlen = metric_only_len; - - if (!valid_only_metric(unit)) - return; - unit = fixunit(buf, os->evsel, unit); - if (mlen < strlen(unit)) - mlen = strlen(unit) + 1; - - if (color) - mlen += strlen(color) + sizeof(PERF_COLOR_RESET) - 1; - - color_snprintf(str, sizeof(str), color ?: "", fmt, val); - fprintf(out, "%*s ", mlen, str); -} - -static void print_metric_only_csv(void *ctx, const char *color __maybe_unused, - const char *fmt, - const char *unit, double val) -{ - struct outstate *os = ctx; - FILE *out = os->fh; - char buf[64], *vals, *ends; - char tbuf[1024]; - - if (!valid_only_metric(unit)) - return; - unit = fixunit(tbuf, os->evsel, unit); - snprintf(buf, sizeof buf, fmt, val); - ends = vals = ltrim(buf); - while (isdigit(*ends) || *ends == '.') - ends++; - *ends = 0; - fprintf(out, "%s%s", vals, csv_sep); -} - -static void new_line_metric(void *ctx __maybe_unused) -{ -} - -static void print_metric_header(void *ctx, const char *color __maybe_unused, - const char *fmt __maybe_unused, - const char *unit, double val __maybe_unused) -{ - struct outstate *os = ctx; - char tbuf[1024]; - - if (!valid_only_metric(unit)) - return; - unit = fixunit(tbuf, os->evsel, unit); - if (csv_output) - fprintf(os->fh, "%s%s", unit, csv_sep); - else - fprintf(os->fh, "%*s ", metric_only_len, unit); -} - -static int first_shadow_cpu(struct perf_evsel *evsel, int id) -{ - int i; - - if (!aggr_get_id) - return 0; - - if (stat_config.aggr_mode == AGGR_NONE) - return id; - - if (stat_config.aggr_mode == AGGR_GLOBAL) - return 0; - - for (i = 0; i < perf_evsel__nr_cpus(evsel); i++) { - int cpu2 = perf_evsel__cpus(evsel)->map[i]; - - if (aggr_get_id(evsel_list->cpus, cpu2) == id) - return cpu2; - } - return 0; -} - -static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg) -{ - FILE *output = stat_config.output; - double sc = evsel->scale; - const char *fmt; - - if (csv_output) { - fmt = floor(sc) != sc ? "%.2f%s" : "%.0f%s"; - } else { - if (big_num) - fmt = floor(sc) != sc ? "%'18.2f%s" : "%'18.0f%s"; - else - fmt = floor(sc) != sc ? "%18.2f%s" : "%18.0f%s"; - } - - aggr_printout(evsel, id, nr); - - fprintf(output, fmt, avg, csv_sep); - - if (evsel->unit) - fprintf(output, "%-*s%s", - csv_output ? 0 : unit_width, - evsel->unit, csv_sep); - - fprintf(output, "%-*s", csv_output ? 0 : 25, perf_evsel__name(evsel)); - - if (evsel->cgrp) - fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); -} - -static bool is_mixed_hw_group(struct perf_evsel *counter) -{ - struct perf_evlist *evlist = counter->evlist; - u32 pmu_type = counter->attr.type; - struct perf_evsel *pos; - - if (counter->nr_members < 2) - return false; - - evlist__for_each_entry(evlist, pos) { - /* software events can be part of any hardware group */ - if (pos->attr.type == PERF_TYPE_SOFTWARE) - continue; - if (pmu_type == PERF_TYPE_SOFTWARE) { - pmu_type = pos->attr.type; - continue; - } - if (pmu_type != pos->attr.type) - return true; - } - - return false; -} - -static void printout(int id, int nr, struct perf_evsel *counter, double uval, - char *prefix, u64 run, u64 ena, double noise, - struct runtime_stat *st) -{ - struct perf_stat_output_ctx out; - struct outstate os = { - .fh = stat_config.output, - .prefix = prefix ? prefix : "", - .id = id, - .nr = nr, - .evsel = counter, - }; - print_metric_t pm = print_metric_std; - void (*nl)(void *); - - if (metric_only) { - nl = new_line_metric; - if (csv_output) - pm = print_metric_only_csv; - else - pm = print_metric_only; - } else - nl = new_line_std; - - if (csv_output && !metric_only) { - static int aggr_fields[] = { - [AGGR_GLOBAL] = 0, - [AGGR_THREAD] = 1, - [AGGR_NONE] = 1, - [AGGR_SOCKET] = 2, - [AGGR_CORE] = 2, - }; - - pm = print_metric_csv; - nl = new_line_csv; - os.nfields = 3; - os.nfields += aggr_fields[stat_config.aggr_mode]; - if (counter->cgrp) - os.nfields++; - } - if (run == 0 || ena == 0 || counter->counts->scaled == -1) { - if (metric_only) { - pm(&os, NULL, "", "", 0); - return; - } - aggr_printout(counter, id, nr); - - fprintf(stat_config.output, "%*s%s", - csv_output ? 0 : 18, - counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, - csv_sep); - - if (counter->supported) { - print_free_counters_hint = 1; - if (is_mixed_hw_group(counter)) - print_mixed_hw_group_error = 1; - } - - fprintf(stat_config.output, "%-*s%s", - csv_output ? 0 : unit_width, - counter->unit, csv_sep); - - fprintf(stat_config.output, "%*s", - csv_output ? 0 : -25, - perf_evsel__name(counter)); - - if (counter->cgrp) - fprintf(stat_config.output, "%s%s", - csv_sep, counter->cgrp->name); - - if (!csv_output) - pm(&os, NULL, NULL, "", 0); - print_noise(counter, noise); - print_running(run, ena); - if (csv_output) - pm(&os, NULL, NULL, "", 0); - return; - } - - if (!metric_only) - abs_printout(id, nr, counter, uval); - - out.print_metric = pm; - out.new_line = nl; - out.ctx = &os; - out.force_header = false; - - if (csv_output && !metric_only) { - print_noise(counter, noise); - print_running(run, ena); - } - - perf_stat__print_shadow_stats(counter, uval, - first_shadow_cpu(counter, id), - &out, &metric_events, st); - if (!csv_output && !metric_only) { - print_noise(counter, noise); - print_running(run, ena); - } -} - -static void aggr_update_shadow(void) -{ - int cpu, s2, id, s; - u64 val; - struct perf_evsel *counter; - - for (s = 0; s < aggr_map->nr; s++) { - id = aggr_map->map[s]; - evlist__for_each_entry(evsel_list, counter) { - val = 0; - for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { - s2 = aggr_get_id(evsel_list->cpus, cpu); - if (s2 != id) - continue; - val += perf_counts(counter->counts, cpu, 0)->val; - } - perf_stat__update_shadow_stats(counter, val, - first_shadow_cpu(counter, id), - &rt_stat); - } - } -} - -static void uniquify_event_name(struct perf_evsel *counter) -{ - char *new_name; - char *config; - - if (counter->uniquified_name || - !counter->pmu_name || !strncmp(counter->name, counter->pmu_name, - strlen(counter->pmu_name))) - return; - - config = strchr(counter->name, '/'); - if (config) { - if (asprintf(&new_name, - "%s%s", counter->pmu_name, config) > 0) { - free(counter->name); - counter->name = new_name; - } - } else { - if (asprintf(&new_name, - "%s [%s]", counter->name, counter->pmu_name) > 0) { - free(counter->name); - counter->name = new_name; - } - } - - counter->uniquified_name = true; -} - -static void collect_all_aliases(struct perf_evsel *counter, - void (*cb)(struct perf_evsel *counter, void *data, - bool first), - void *data) -{ - struct perf_evsel *alias; - - alias = list_prepare_entry(counter, &(evsel_list->entries), node); - list_for_each_entry_continue (alias, &evsel_list->entries, node) { - if (strcmp(perf_evsel__name(alias), perf_evsel__name(counter)) || - alias->scale != counter->scale || - alias->cgrp != counter->cgrp || - strcmp(alias->unit, counter->unit) || - perf_evsel__is_clock(alias) != perf_evsel__is_clock(counter)) - break; - alias->merged_stat = true; - cb(alias, data, false); - } -} - -static bool collect_data(struct perf_evsel *counter, - void (*cb)(struct perf_evsel *counter, void *data, - bool first), - void *data) -{ - if (counter->merged_stat) - return false; - cb(counter, data, true); - if (no_merge) - uniquify_event_name(counter); - else if (counter->auto_merge_stats) - collect_all_aliases(counter, cb, data); - return true; -} - -struct aggr_data { - u64 ena, run, val; - int id; - int nr; - int cpu; -}; - -static void aggr_cb(struct perf_evsel *counter, void *data, bool first) -{ - struct aggr_data *ad = data; - int cpu, s2; - - for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { - struct perf_counts_values *counts; - - s2 = aggr_get_id(perf_evsel__cpus(counter), cpu); - if (s2 != ad->id) - continue; - if (first) - ad->nr++; - counts = perf_counts(counter->counts, cpu, 0); - /* - * When any result is bad, make them all to give - * consistent output in interval mode. - */ - if (counts->ena == 0 || counts->run == 0 || - counter->counts->scaled == -1) { - ad->ena = 0; - ad->run = 0; - break; - } - ad->val += counts->val; - ad->ena += counts->ena; - ad->run += counts->run; - } -} - -static void print_aggr(char *prefix) -{ - FILE *output = stat_config.output; - struct perf_evsel *counter; - int s, id, nr; - double uval; - u64 ena, run, val; - bool first; - - if (!(aggr_map || aggr_get_id)) - return; - - aggr_update_shadow(); - - /* - * With metric_only everything is on a single line. - * Without each counter has its own line. - */ - for (s = 0; s < aggr_map->nr; s++) { - struct aggr_data ad; - if (prefix && metric_only) - fprintf(output, "%s", prefix); - - ad.id = id = aggr_map->map[s]; - first = true; - evlist__for_each_entry(evsel_list, counter) { - if (is_duration_time(counter)) - continue; - - ad.val = ad.ena = ad.run = 0; - ad.nr = 0; - if (!collect_data(counter, aggr_cb, &ad)) - continue; - nr = ad.nr; - ena = ad.ena; - run = ad.run; - val = ad.val; - if (first && metric_only) { - first = false; - aggr_printout(counter, id, nr); - } - if (prefix && !metric_only) - fprintf(output, "%s", prefix); - - uval = val * counter->scale; - printout(id, nr, counter, uval, prefix, run, ena, 1.0, - &rt_stat); - if (!metric_only) - fputc('\n', output); - } - if (metric_only) - fputc('\n', output); - } -} - -static int cmp_val(const void *a, const void *b) -{ - return ((struct perf_aggr_thread_value *)b)->val - - ((struct perf_aggr_thread_value *)a)->val; -} - -static struct perf_aggr_thread_value *sort_aggr_thread( - struct perf_evsel *counter, - int nthreads, int ncpus, - int *ret) -{ - int cpu, thread, i = 0; - double uval; - struct perf_aggr_thread_value *buf; - - buf = calloc(nthreads, sizeof(struct perf_aggr_thread_value)); - if (!buf) - return NULL; - - for (thread = 0; thread < nthreads; thread++) { - u64 ena = 0, run = 0, val = 0; - - for (cpu = 0; cpu < ncpus; cpu++) { - val += perf_counts(counter->counts, cpu, thread)->val; - ena += perf_counts(counter->counts, cpu, thread)->ena; - run += perf_counts(counter->counts, cpu, thread)->run; - } - - uval = val * counter->scale; - - /* - * Skip value 0 when enabling --per-thread globally, - * otherwise too many 0 output. - */ - if (uval == 0.0 && target__has_per_thread(&target)) - continue; - - buf[i].counter = counter; - buf[i].id = thread; - buf[i].uval = uval; - buf[i].val = val; - buf[i].run = run; - buf[i].ena = ena; - i++; - } - - qsort(buf, i, sizeof(struct perf_aggr_thread_value), cmp_val); - - if (ret) - *ret = i; - - return buf; -} - -static void print_aggr_thread(struct perf_evsel *counter, char *prefix) -{ - FILE *output = stat_config.output; - int nthreads = thread_map__nr(counter->threads); - int ncpus = cpu_map__nr(counter->cpus); - int thread, sorted_threads, id; - struct perf_aggr_thread_value *buf; - - buf = sort_aggr_thread(counter, nthreads, ncpus, &sorted_threads); - if (!buf) { - perror("cannot sort aggr thread"); - return; - } - - for (thread = 0; thread < sorted_threads; thread++) { - if (prefix) - fprintf(output, "%s", prefix); - - id = buf[thread].id; - if (stat_config.stats) - printout(id, 0, buf[thread].counter, buf[thread].uval, - prefix, buf[thread].run, buf[thread].ena, 1.0, - &stat_config.stats[id]); - else - printout(id, 0, buf[thread].counter, buf[thread].uval, - prefix, buf[thread].run, buf[thread].ena, 1.0, - &rt_stat); - fputc('\n', output); - } - - free(buf); -} - -struct caggr_data { - double avg, avg_enabled, avg_running; -}; - -static void counter_aggr_cb(struct perf_evsel *counter, void *data, - bool first __maybe_unused) -{ - struct caggr_data *cd = data; - struct perf_stat_evsel *ps = counter->stats; - - cd->avg += avg_stats(&ps->res_stats[0]); - cd->avg_enabled += avg_stats(&ps->res_stats[1]); - cd->avg_running += avg_stats(&ps->res_stats[2]); -} - -/* - * Print out the results of a single counter: - * aggregated counts in system-wide mode - */ -static void print_counter_aggr(struct perf_evsel *counter, char *prefix) -{ - FILE *output = stat_config.output; - double uval; - struct caggr_data cd = { .avg = 0.0 }; - - if (!collect_data(counter, counter_aggr_cb, &cd)) - return; - - if (prefix && !metric_only) - fprintf(output, "%s", prefix); - - uval = cd.avg * counter->scale; - printout(-1, 0, counter, uval, prefix, cd.avg_running, cd.avg_enabled, - cd.avg, &rt_stat); - if (!metric_only) - fprintf(output, "\n"); -} - -static void counter_cb(struct perf_evsel *counter, void *data, - bool first __maybe_unused) -{ - struct aggr_data *ad = data; - - ad->val += perf_counts(counter->counts, ad->cpu, 0)->val; - ad->ena += perf_counts(counter->counts, ad->cpu, 0)->ena; - ad->run += perf_counts(counter->counts, ad->cpu, 0)->run; -} - -/* - * Print out the results of a single counter: - * does not use aggregated count in system-wide - */ -static void print_counter(struct perf_evsel *counter, char *prefix) -{ - FILE *output = stat_config.output; - u64 ena, run, val; - double uval; - int cpu; - - for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { - struct aggr_data ad = { .cpu = cpu }; - - if (!collect_data(counter, counter_cb, &ad)) - return; - val = ad.val; - ena = ad.ena; - run = ad.run; - - if (prefix) - fprintf(output, "%s", prefix); - - uval = val * counter->scale; - printout(cpu, 0, counter, uval, prefix, run, ena, 1.0, - &rt_stat); - - fputc('\n', output); - } -} - -static void print_no_aggr_metric(char *prefix) -{ - int cpu; - int nrcpus = 0; - struct perf_evsel *counter; - u64 ena, run, val; - double uval; - - nrcpus = evsel_list->cpus->nr; - for (cpu = 0; cpu < nrcpus; cpu++) { - bool first = true; - - if (prefix) - fputs(prefix, stat_config.output); - evlist__for_each_entry(evsel_list, counter) { - if (is_duration_time(counter)) - continue; - if (first) { - aggr_printout(counter, cpu, 0); - first = false; - } - val = perf_counts(counter->counts, cpu, 0)->val; - ena = perf_counts(counter->counts, cpu, 0)->ena; - run = perf_counts(counter->counts, cpu, 0)->run; - - uval = val * counter->scale; - printout(cpu, 0, counter, uval, prefix, run, ena, 1.0, - &rt_stat); - } - fputc('\n', stat_config.output); - } -} - -static int aggr_header_lens[] = { - [AGGR_CORE] = 18, - [AGGR_SOCKET] = 12, - [AGGR_NONE] = 6, - [AGGR_THREAD] = 24, - [AGGR_GLOBAL] = 0, -}; - -static const char *aggr_header_csv[] = { - [AGGR_CORE] = "core,cpus,", - [AGGR_SOCKET] = "socket,cpus", - [AGGR_NONE] = "cpu,", - [AGGR_THREAD] = "comm-pid,", - [AGGR_GLOBAL] = "" -}; - -static void print_metric_headers(const char *prefix, bool no_indent) -{ - struct perf_stat_output_ctx out; - struct perf_evsel *counter; - struct outstate os = { - .fh = stat_config.output - }; - - if (prefix) - fprintf(stat_config.output, "%s", prefix); - - if (!csv_output && !no_indent) - fprintf(stat_config.output, "%*s", - aggr_header_lens[stat_config.aggr_mode], ""); - if (csv_output) { - if (stat_config.interval) - fputs("time,", stat_config.output); - fputs(aggr_header_csv[stat_config.aggr_mode], - stat_config.output); - } - - /* Print metrics headers only */ - evlist__for_each_entry(evsel_list, counter) { - if (is_duration_time(counter)) - continue; - os.evsel = counter; - out.ctx = &os; - out.print_metric = print_metric_header; - out.new_line = new_line_metric; - out.force_header = true; - os.evsel = counter; - perf_stat__print_shadow_stats(counter, 0, - 0, - &out, - &metric_events, - &rt_stat); - } - fputc('\n', stat_config.output); -} - -static void print_interval(char *prefix, struct timespec *ts) -{ - FILE *output = stat_config.output; - static int num_print_interval; - - if (interval_clear) - puts(CONSOLE_CLEAR); - - sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep); - - if ((num_print_interval == 0 && !csv_output) || interval_clear) { - switch (stat_config.aggr_mode) { - case AGGR_SOCKET: - fprintf(output, "# time socket cpus"); - if (!metric_only) - fprintf(output, " counts %*s events\n", unit_width, "unit"); - break; - case AGGR_CORE: - fprintf(output, "# time core cpus"); - if (!metric_only) - fprintf(output, " counts %*s events\n", unit_width, "unit"); - break; - case AGGR_NONE: - fprintf(output, "# time CPU "); - if (!metric_only) - fprintf(output, " counts %*s events\n", unit_width, "unit"); - break; - case AGGR_THREAD: - fprintf(output, "# time comm-pid"); - if (!metric_only) - fprintf(output, " counts %*s events\n", unit_width, "unit"); - break; - case AGGR_GLOBAL: - default: - fprintf(output, "# time"); - if (!metric_only) - fprintf(output, " counts %*s events\n", unit_width, "unit"); - case AGGR_UNSET: - break; - } - } - - if ((num_print_interval == 0 || interval_clear) && metric_only) - print_metric_headers(" ", true); - if (++num_print_interval == 25) - num_print_interval = 0; -} - -static void print_header(int argc, const char **argv) -{ - FILE *output = stat_config.output; - int i; - - fflush(stdout); - - if (!csv_output) { - fprintf(output, "\n"); - fprintf(output, " Performance counter stats for "); - if (target.system_wide) - fprintf(output, "\'system wide"); - else if (target.cpu_list) - fprintf(output, "\'CPU(s) %s", target.cpu_list); - else if (!target__has_task(&target)) { - fprintf(output, "\'%s", argv ? argv[0] : "pipe"); - for (i = 1; argv && (i < argc); i++) - fprintf(output, " %s", argv[i]); - } else if (target.pid) - fprintf(output, "process id \'%s", target.pid); - else - fprintf(output, "thread id \'%s", target.tid); - - fprintf(output, "\'"); - if (run_count > 1) - fprintf(output, " (%d runs)", run_count); - fprintf(output, ":\n\n"); - } -} - -static int get_precision(double num) -{ - if (num > 1) - return 0; - - return lround(ceil(-log10(num))); -} - -static void print_table(FILE *output, int precision, double avg) -{ - char tmp[64]; - int idx, indent = 0; - - scnprintf(tmp, 64, " %17.*f", precision, avg); - while (tmp[indent] == ' ') - indent++; - - fprintf(output, "%*s# Table of individual measurements:\n", indent, ""); - - for (idx = 0; idx < run_count; idx++) { - double run = (double) walltime_run[idx] / NSEC_PER_SEC; - int h, n = 1 + abs((int) (100.0 * (run - avg)/run) / 5); - - fprintf(output, " %17.*f (%+.*f) ", - precision, run, precision, run - avg); - - for (h = 0; h < n; h++) - fprintf(output, "#"); - - fprintf(output, "\n"); - } - - fprintf(output, "\n%*s# Final result:\n", indent, ""); -} - -static double timeval2double(struct timeval *t) -{ - return t->tv_sec + (double) t->tv_usec/USEC_PER_SEC; -} - -static void print_footer(void) -{ - double avg = avg_stats(&walltime_nsecs_stats) / NSEC_PER_SEC; - FILE *output = stat_config.output; - int n; - - if (!null_run) - fprintf(output, "\n"); - - if (run_count == 1) { - fprintf(output, " %17.9f seconds time elapsed", avg); - - if (ru_display) { - double ru_utime = timeval2double(&ru_data.ru_utime); - double ru_stime = timeval2double(&ru_data.ru_stime); - - fprintf(output, "\n\n"); - fprintf(output, " %17.9f seconds user\n", ru_utime); - fprintf(output, " %17.9f seconds sys\n", ru_stime); - } - } else { - double sd = stddev_stats(&walltime_nsecs_stats) / NSEC_PER_SEC; - /* - * Display at most 2 more significant - * digits than the stddev inaccuracy. - */ - int precision = get_precision(sd) + 2; - - if (walltime_run_table) - print_table(output, precision, avg); - - fprintf(output, " %17.*f +- %.*f seconds time elapsed", - precision, avg, precision, sd); - - print_noise_pct(sd, avg); - } - fprintf(output, "\n\n"); - - if (print_free_counters_hint && - sysctl__read_int("kernel/nmi_watchdog", &n) >= 0 && - n > 0) - fprintf(output, -"Some events weren't counted. Try disabling the NMI watchdog:\n" -" echo 0 > /proc/sys/kernel/nmi_watchdog\n" -" perf stat ...\n" -" echo 1 > /proc/sys/kernel/nmi_watchdog\n"); - - if (print_mixed_hw_group_error) - fprintf(output, - "The events in group usually have to be from " - "the same PMU. Try reorganizing the group.\n"); -} - static void print_counters(struct timespec *ts, int argc, const char **argv) { - int interval = stat_config.interval; - struct perf_evsel *counter; - char buf[64], *prefix = NULL; - /* Do not print anything if we record to the pipe. */ if (STAT_RECORD && perf_stat.data.is_pipe) return; - if (interval) - print_interval(prefix = buf, ts); - else - print_header(argc, argv); - - if (metric_only) { - static int num_print_iv; - - if (num_print_iv == 0 && !interval) - print_metric_headers(prefix, false); - if (num_print_iv++ == 25) - num_print_iv = 0; - if (stat_config.aggr_mode == AGGR_GLOBAL && prefix) - fprintf(stat_config.output, "%s", prefix); - } - - switch (stat_config.aggr_mode) { - case AGGR_CORE: - case AGGR_SOCKET: - print_aggr(prefix); - break; - case AGGR_THREAD: - evlist__for_each_entry(evsel_list, counter) { - if (is_duration_time(counter)) - continue; - print_aggr_thread(counter, prefix); - } - break; - case AGGR_GLOBAL: - evlist__for_each_entry(evsel_list, counter) { - if (is_duration_time(counter)) - continue; - print_counter_aggr(counter, prefix); - } - if (metric_only) - fputc('\n', stat_config.output); - break; - case AGGR_NONE: - if (metric_only) - print_no_aggr_metric(prefix); - else { - evlist__for_each_entry(evsel_list, counter) { - if (is_duration_time(counter)) - continue; - print_counter(counter, prefix); - } - } - break; - case AGGR_UNSET: - default: - break; - } - - if (!interval && !csv_output) - print_footer(); - - fflush(stat_config.output); + perf_evlist__print_counters(evsel_list, &stat_config, &target, + ts, argc, argv); } static volatile int signr = -1; @@ -1950,7 +698,7 @@ static int enable_metric_only(const struct option *opt __maybe_unused, const char *s __maybe_unused, int unset) { force_metric_only = true; - metric_only = !unset; + stat_config.metric_only = !unset; return 0; } @@ -1958,7 +706,7 @@ static int parse_metric_groups(const struct option *opt, const char *str, int unset __maybe_unused) { - return metricgroup__parse_groups(opt, str, &metric_events); + return metricgroup__parse_groups(opt, str, &stat_config.metric_events); } static const struct option stat_options[] = { @@ -1969,7 +717,7 @@ static const struct option stat_options[] = { parse_events_option), OPT_CALLBACK(0, "filter", &evsel_list, "filter", "event filter", parse_filter), - OPT_BOOLEAN('i', "no-inherit", &no_inherit, + OPT_BOOLEAN('i', "no-inherit", &stat_config.no_inherit, "child tasks do not inherit counters"), OPT_STRING('p', "pid", &target.pid, "pid", "stat events on existing process id"), @@ -1982,11 +730,11 @@ static const struct option stat_options[] = { OPT_BOOLEAN('c', "scale", &stat_config.scale, "scale/normalize counters"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), - OPT_INTEGER('r', "repeat", &run_count, + OPT_INTEGER('r', "repeat", &stat_config.run_count, "repeat command and print average + stddev (max: 100, forever: 0)"), - OPT_BOOLEAN(0, "table", &walltime_run_table, + OPT_BOOLEAN(0, "table", &stat_config.walltime_run_table, "display details about each run (only with -r option)"), - OPT_BOOLEAN('n', "null", &null_run, + OPT_BOOLEAN('n', "null", &stat_config.null_run, "null run - dont start any counters"), OPT_INCR('d', "detailed", &detailed_run, "detailed run - start a lot of events"), @@ -1999,8 +747,8 @@ static const struct option stat_options[] = { "list of cpus to monitor in system-wide"), OPT_SET_UINT('A', "no-aggr", &stat_config.aggr_mode, "disable CPU count aggregation", AGGR_NONE), - OPT_BOOLEAN(0, "no-merge", &no_merge, "Do not merge identical named events"), - OPT_STRING('x', "field-separator", &csv_sep, "separator", + OPT_BOOLEAN(0, "no-merge", &stat_config.no_merge, "Do not merge identical named events"), + OPT_STRING('x', "field-separator", &stat_config.csv_sep, "separator", "print counts with custom separator"), OPT_CALLBACK('G', "cgroup", &evsel_list, "name", "monitor event in cgroup name only", parse_cgroups), @@ -2017,7 +765,7 @@ static const struct option stat_options[] = { "(overhead is possible for values <= 100ms)"), OPT_INTEGER(0, "interval-count", &stat_config.times, "print counts for fixed number of times"), - OPT_BOOLEAN(0, "interval-clear", &interval_clear, + OPT_BOOLEAN(0, "interval-clear", &stat_config.interval_clear, "clear screen in between new interval"), OPT_UINTEGER(0, "timeout", &stat_config.timeout, "stop workload and print counts after a timeout period in ms (>= 10ms)"), @@ -2027,9 +775,9 @@ static const struct option stat_options[] = { "aggregate counts per physical processor core", AGGR_CORE), OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode, "aggregate counts per thread", AGGR_THREAD), - OPT_UINTEGER('D', "delay", &initial_delay, + OPT_UINTEGER('D', "delay", &stat_config.initial_delay, "ms to wait before starting measurement after program start"), - OPT_CALLBACK_NOOPT(0, "metric-only", &metric_only, NULL, + OPT_CALLBACK_NOOPT(0, "metric-only", &stat_config.metric_only, NULL, "Only print computed metrics. No raw values", enable_metric_only), OPT_BOOLEAN(0, "topdown", &topdown_run, "measure topdown level 1 statistics"), @@ -2041,12 +789,14 @@ static const struct option stat_options[] = { OPT_END() }; -static int perf_stat__get_socket(struct cpu_map *map, int cpu) +static int perf_stat__get_socket(struct perf_stat_config *config __maybe_unused, + struct cpu_map *map, int cpu) { return cpu_map__get_socket(map, cpu, NULL); } -static int perf_stat__get_core(struct cpu_map *map, int cpu) +static int perf_stat__get_core(struct perf_stat_config *config __maybe_unused, + struct cpu_map *map, int cpu) { return cpu_map__get_core(map, cpu, NULL); } @@ -2063,9 +813,8 @@ static int cpu_map__get_max(struct cpu_map *map) return max; } -static struct cpu_map *cpus_aggr_map; - -static int perf_stat__get_aggr(aggr_get_id_t get_id, struct cpu_map *map, int idx) +static int perf_stat__get_aggr(struct perf_stat_config *config, + aggr_get_id_t get_id, struct cpu_map *map, int idx) { int cpu; @@ -2074,20 +823,22 @@ static int perf_stat__get_aggr(aggr_get_id_t get_id, struct cpu_map *map, int id cpu = map->map[idx]; - if (cpus_aggr_map->map[cpu] == -1) - cpus_aggr_map->map[cpu] = get_id(map, idx); + if (config->cpus_aggr_map->map[cpu] == -1) + config->cpus_aggr_map->map[cpu] = get_id(config, map, idx); - return cpus_aggr_map->map[cpu]; + return config->cpus_aggr_map->map[cpu]; } -static int perf_stat__get_socket_cached(struct cpu_map *map, int idx) +static int perf_stat__get_socket_cached(struct perf_stat_config *config, + struct cpu_map *map, int idx) { - return perf_stat__get_aggr(perf_stat__get_socket, map, idx); + return perf_stat__get_aggr(config, perf_stat__get_socket, map, idx); } -static int perf_stat__get_core_cached(struct cpu_map *map, int idx) +static int perf_stat__get_core_cached(struct perf_stat_config *config, + struct cpu_map *map, int idx) { - return perf_stat__get_aggr(perf_stat__get_core, map, idx); + return perf_stat__get_aggr(config, perf_stat__get_core, map, idx); } static int perf_stat_init_aggr_mode(void) @@ -2096,18 +847,18 @@ static int perf_stat_init_aggr_mode(void) switch (stat_config.aggr_mode) { case AGGR_SOCKET: - if (cpu_map__build_socket_map(evsel_list->cpus, &aggr_map)) { + if (cpu_map__build_socket_map(evsel_list->cpus, &stat_config.aggr_map)) { perror("cannot build socket map"); return -1; } - aggr_get_id = perf_stat__get_socket_cached; + stat_config.aggr_get_id = perf_stat__get_socket_cached; break; case AGGR_CORE: - if (cpu_map__build_core_map(evsel_list->cpus, &aggr_map)) { + if (cpu_map__build_core_map(evsel_list->cpus, &stat_config.aggr_map)) { perror("cannot build core map"); return -1; } - aggr_get_id = perf_stat__get_core_cached; + stat_config.aggr_get_id = perf_stat__get_core_cached; break; case AGGR_NONE: case AGGR_GLOBAL: @@ -2123,16 +874,16 @@ static int perf_stat_init_aggr_mode(void) * the aggregation translate cpumap. */ nr = cpu_map__get_max(evsel_list->cpus); - cpus_aggr_map = cpu_map__empty_new(nr + 1); - return cpus_aggr_map ? 0 : -ENOMEM; + stat_config.cpus_aggr_map = cpu_map__empty_new(nr + 1); + return stat_config.cpus_aggr_map ? 0 : -ENOMEM; } static void perf_stat__exit_aggr_mode(void) { - cpu_map__put(aggr_map); - cpu_map__put(cpus_aggr_map); - aggr_map = NULL; - cpus_aggr_map = NULL; + cpu_map__put(stat_config.aggr_map); + cpu_map__put(stat_config.cpus_aggr_map); + stat_config.aggr_map = NULL; + stat_config.cpus_aggr_map = NULL; } static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, int idx) @@ -2190,12 +941,14 @@ static int perf_env__build_core_map(struct perf_env *env, struct cpu_map *cpus, return cpu_map__build_map(cpus, corep, perf_env__get_core, env); } -static int perf_stat__get_socket_file(struct cpu_map *map, int idx) +static int perf_stat__get_socket_file(struct perf_stat_config *config __maybe_unused, + struct cpu_map *map, int idx) { return perf_env__get_socket(map, idx, &perf_stat.session->header.env); } -static int perf_stat__get_core_file(struct cpu_map *map, int idx) +static int perf_stat__get_core_file(struct perf_stat_config *config __maybe_unused, + struct cpu_map *map, int idx) { return perf_env__get_core(map, idx, &perf_stat.session->header.env); } @@ -2206,18 +959,18 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st) switch (stat_config.aggr_mode) { case AGGR_SOCKET: - if (perf_env__build_socket_map(env, evsel_list->cpus, &aggr_map)) { + if (perf_env__build_socket_map(env, evsel_list->cpus, &stat_config.aggr_map)) { perror("cannot build socket map"); return -1; } - aggr_get_id = perf_stat__get_socket_file; + stat_config.aggr_get_id = perf_stat__get_socket_file; break; case AGGR_CORE: - if (perf_env__build_core_map(env, evsel_list->cpus, &aggr_map)) { + if (perf_env__build_core_map(env, evsel_list->cpus, &stat_config.aggr_map)) { perror("cannot build core map"); return -1; } - aggr_get_id = perf_stat__get_core_file; + stat_config.aggr_get_id = perf_stat__get_core_file; break; case AGGR_NONE: case AGGR_GLOBAL: @@ -2401,7 +1154,7 @@ static int add_default_attributes(void) struct parse_events_error errinfo; /* Set attrs if no event is selected and !null_run: */ - if (null_run) + if (stat_config.null_run) return 0; if (transaction_run) { @@ -2414,7 +1167,7 @@ static int add_default_attributes(void) struct option opt = { .value = &evsel_list }; return metricgroup__parse_groups(&opt, "transaction", - &metric_events); + &stat_config.metric_events); } if (pmu_have_event("cpu", "cycles-ct") && @@ -2452,7 +1205,7 @@ static int add_default_attributes(void) if (pmu_have_event("msr", "aperf") && pmu_have_event("msr", "smi")) { if (!force_metric_only) - metric_only = true; + stat_config.metric_only = true; err = parse_events(evsel_list, smi_cost_attrs, &errinfo); } else { fprintf(stderr, "To measure SMI cost, it needs " @@ -2483,7 +1236,7 @@ static int add_default_attributes(void) } if (!force_metric_only) - metric_only = true; + stat_config.metric_only = true; if (topdown_filter_events(topdown_attrs, &str, arch_topdown_check_group(&warn)) < 0) { pr_err("Out of memory\n"); @@ -2580,7 +1333,7 @@ static int __cmd_record(int argc, const char **argv) if (output_name) data->file.path = output_name; - if (run_count != 1 || forever) { + if (stat_config.run_count != 1 || forever) { pr_err("Cannot use -r option with perf stat record.\n"); return -1; } @@ -2599,9 +1352,8 @@ static int __cmd_record(int argc, const char **argv) return argc; } -static int process_stat_round_event(struct perf_tool *tool __maybe_unused, - union perf_event *event, - struct perf_session *session) +static int process_stat_round_event(struct perf_session *session, + union perf_event *event) { struct stat_round_event *stat_round = &event->stat_round; struct perf_evsel *counter; @@ -2626,10 +1378,10 @@ static int process_stat_round_event(struct perf_tool *tool __maybe_unused, } static -int process_stat_config_event(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session __maybe_unused) +int process_stat_config_event(struct perf_session *session, + union perf_event *event) { + struct perf_tool *tool = session->tool; struct perf_stat *st = container_of(tool, struct perf_stat, tool); perf_event__read_stat_config(&stat_config, &event->stat_config); @@ -2669,10 +1421,10 @@ static int set_maps(struct perf_stat *st) } static -int process_thread_map_event(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session __maybe_unused) +int process_thread_map_event(struct perf_session *session, + union perf_event *event) { + struct perf_tool *tool = session->tool; struct perf_stat *st = container_of(tool, struct perf_stat, tool); if (st->threads) { @@ -2688,10 +1440,10 @@ int process_thread_map_event(struct perf_tool *tool, } static -int process_cpu_map_event(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session __maybe_unused) +int process_cpu_map_event(struct perf_session *session, + union perf_event *event) { + struct perf_tool *tool = session->tool; struct perf_stat *st = container_of(tool, struct perf_stat, tool); struct cpu_map *cpus; @@ -2853,12 +1605,12 @@ int cmd_stat(int argc, const char **argv) perf_stat__collect_metric_expr(evsel_list); perf_stat__init_shadow_stats(); - if (csv_sep) { - csv_output = true; - if (!strcmp(csv_sep, "\\t")) - csv_sep = "\t"; + if (stat_config.csv_sep) { + stat_config.csv_output = true; + if (!strcmp(stat_config.csv_sep, "\\t")) + stat_config.csv_sep = "\t"; } else - csv_sep = DEFAULT_SEPARATOR; + stat_config.csv_sep = DEFAULT_SEPARATOR; if (argc && !strncmp(argv[0], "rec", 3)) { argc = __cmd_record(argc, argv); @@ -2883,17 +1635,17 @@ int cmd_stat(int argc, const char **argv) goto out; } - if (metric_only && stat_config.aggr_mode == AGGR_THREAD) { + if (stat_config.metric_only && stat_config.aggr_mode == AGGR_THREAD) { fprintf(stderr, "--metric-only is not supported with --per-thread\n"); goto out; } - if (metric_only && run_count > 1) { + if (stat_config.metric_only && stat_config.run_count > 1) { fprintf(stderr, "--metric-only is not supported with -r\n"); goto out; } - if (walltime_run_table && run_count <= 1) { + if (stat_config.walltime_run_table && stat_config.run_count <= 1) { fprintf(stderr, "--table is only supported with -r\n"); parse_options_usage(stat_usage, stat_options, "r", 1); parse_options_usage(NULL, stat_options, "table", 0); @@ -2931,7 +1683,7 @@ int cmd_stat(int argc, const char **argv) /* * let the spreadsheet do the pretty-printing */ - if (csv_output) { + if (stat_config.csv_output) { /* User explicitly passed -B? */ if (big_num_opt == 1) { fprintf(stderr, "-B option not supported with -x\n"); @@ -2939,9 +1691,9 @@ int cmd_stat(int argc, const char **argv) parse_options_usage(NULL, stat_options, "x", 1); goto out; } else /* Nope, so disable big number formatting */ - big_num = false; + stat_config.big_num = false; } else if (big_num_opt == 0) /* User passed --no-big-num */ - big_num = false; + stat_config.big_num = false; setup_system_wide(argc); @@ -2949,21 +1701,21 @@ int cmd_stat(int argc, const char **argv) * Display user/system times only for single * run and when there's specified tracee. */ - if ((run_count == 1) && target__none(&target)) - ru_display = true; + if ((stat_config.run_count == 1) && target__none(&target)) + stat_config.ru_display = true; - if (run_count < 0) { + if (stat_config.run_count < 0) { pr_err("Run count must be a positive number\n"); parse_options_usage(stat_usage, stat_options, "r", 1); goto out; - } else if (run_count == 0) { + } else if (stat_config.run_count == 0) { forever = true; - run_count = 1; + stat_config.run_count = 1; } - if (walltime_run_table) { - walltime_run = zalloc(run_count * sizeof(walltime_run[0])); - if (!walltime_run) { + if (stat_config.walltime_run_table) { + stat_config.walltime_run = zalloc(stat_config.run_count * sizeof(stat_config.walltime_run[0])); + if (!stat_config.walltime_run) { pr_err("failed to setup -r option"); goto out; } @@ -3065,6 +1817,17 @@ int cmd_stat(int argc, const char **argv) if (perf_stat_init_aggr_mode()) goto out; + /* + * Set sample_type to PERF_SAMPLE_IDENTIFIER, which should be harmless + * while avoiding that older tools show confusing messages. + * + * However for pipe sessions we need to keep it zero, + * because script's perf_evsel__check_attr is triggered + * by attr->sample_type != 0, and we can't run it on + * stat sessions. + */ + stat_config.identifier = !(STAT_RECORD && perf_stat.data.is_pipe); + /* * We dont want to block the signals - that would cause * child tasks to inherit that and Ctrl-C would not work. @@ -3079,8 +1842,8 @@ int cmd_stat(int argc, const char **argv) signal(SIGABRT, skip_signal); status = 0; - for (run_idx = 0; forever || run_idx < run_count; run_idx++) { - if (run_count != 1 && verbose > 0) + for (run_idx = 0; forever || run_idx < stat_config.run_count; run_idx++) { + if (stat_config.run_count != 1 && verbose > 0) fprintf(output, "[ perf stat: executing run #%d ... ]\n", run_idx + 1); @@ -3132,7 +1895,7 @@ int cmd_stat(int argc, const char **argv) perf_stat__exit_aggr_mode(); perf_evlist__free_stats(evsel_list); out: - free(walltime_run); + free(stat_config.walltime_run); if (smi_cost && smi_reset) sysfs__write_int(FREEZE_ON_SMI_PATH, 0); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index d21d8751e749..aa0c73e57924 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1134,11 +1134,6 @@ static int __cmd_top(struct perf_top *top) if (!target__none(&opts->target)) perf_evlist__enable(top->evlist); - /* Wait for a minimal set of events before starting the snapshot */ - perf_evlist__poll(top->evlist, 100); - - perf_top__mmap_read(top); - ret = -1; if (pthread_create(&thread, NULL, (use_browser > 0 ? display_thread_tui : display_thread), top)) { @@ -1156,6 +1151,11 @@ static int __cmd_top(struct perf_top *top) } } + /* Wait for a minimal set of events before starting the snapshot */ + perf_evlist__poll(top->evlist, 100); + + perf_top__mmap_read(top); + while (!done) { u64 hits = top->samples; @@ -1257,7 +1257,14 @@ int cmd_top(int argc, const char **argv) .uses_mmap = true, }, .proc_map_timeout = 500, - .overwrite = 1, + /* + * FIXME: This will lose PERF_RECORD_MMAP and other metadata + * when we pause, fix that and reenable. Probably using a + * separate evlist with a dummy event, i.e. a non-overwrite + * ring buffer just for metadata events, while PERF_RECORD_SAMPLE + * stays in overwrite mode. -acme + * */ + .overwrite = 0, }, .max_stack = sysctl__max_stack(), .annotation_opts = annotation__default_options, @@ -1372,6 +1379,8 @@ int cmd_top(int argc, const char **argv) "Show raw trace event output (do not use print fmt or plugins)"), OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy, "Show entries in a hierarchy"), + OPT_BOOLEAN(0, "overwrite", &top.record_opts.overwrite, + "Use a backward ring buffer, default: no"), OPT_BOOLEAN(0, "force", &symbol_conf.force, "don't complain, do it"), OPT_UINTEGER(0, "num-thread-synthesize", &top.nr_threads_synthesize, "number of thread to run event synthesize"), @@ -1420,6 +1429,9 @@ int cmd_top(int argc, const char **argv) } } + if (opts->branch_stack && callchain_param.enabled) + symbol_conf.show_branchflag_count = true; + sort__mode = SORT_MODE__TOP; /* display thread wants entries to be collapsed in a different tree */ perf_hpp_list.need_collapse = 1; diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 22ab8e67c760..835619476370 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -89,6 +89,8 @@ struct trace { u64 base_time; FILE *output; unsigned long nr_events; + unsigned long nr_events_printed; + unsigned long max_events; struct strlist *ev_qualifier; struct { size_t nr; @@ -106,6 +108,7 @@ struct trace { } stats; unsigned int max_stack; unsigned int min_stack; + bool raw_augmented_syscalls; bool not_ev_qualifier; bool live; bool full_time; @@ -181,7 +184,7 @@ static int __tp_field__init_uint(struct tp_field *field, int size, int offset, b return 0; } -static int tp_field__init_uint(struct tp_field *field, struct format_field *format_field, bool needs_swap) +static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap) { return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap); } @@ -198,7 +201,7 @@ static int __tp_field__init_ptr(struct tp_field *field, int offset) return 0; } -static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field) +static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field) { return __tp_field__init_ptr(field, format_field->offset); } @@ -214,7 +217,7 @@ static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel, struct tp_field *field, const char *name) { - struct format_field *format_field = perf_evsel__field(evsel, name); + struct tep_format_field *format_field = perf_evsel__field(evsel, name); if (format_field == NULL) return -1; @@ -230,7 +233,7 @@ static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel, struct tp_field *field, const char *name) { - struct format_field *format_field = perf_evsel__field(evsel, name); + struct tep_format_field *format_field = perf_evsel__field(evsel, name); if (format_field == NULL) return -1; @@ -288,6 +291,13 @@ static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel) return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)); } +static int perf_evsel__init_augmented_syscall_tp_ret(struct perf_evsel *evsel) +{ + struct syscall_tp *sc = evsel->priv; + + return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap); +} + static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler) { evsel->priv = malloc(sizeof(struct syscall_tp)); @@ -498,16 +508,6 @@ static const char *clockid[] = { }; static DEFINE_STRARRAY(clockid); -static const char *socket_families[] = { - "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM", - "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI", - "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC", - "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC", - "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF", - "ALG", "NFC", "VSOCK", -}; -static DEFINE_STRARRAY(socket_families); - static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size, struct syscall_arg *arg) { @@ -615,6 +615,7 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, struct syscall_arg_fmt { size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg); + unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val); void *parm; const char *name; bool show_zero; @@ -631,6 +632,8 @@ static struct syscall_fmt { } syscall_fmts[] = { { .name = "access", .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, }, + { .name = "bind", + .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ }, }, }, { .name = "bpf", .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, }, { .name = "brk", .hexret = true, @@ -645,6 +648,8 @@ static struct syscall_fmt { [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, }, { .name = "close", .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, }, + { .name = "connect", + .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ }, }, }, { .name = "epoll_ctl", .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, }, { .name = "eventfd2", @@ -722,6 +727,10 @@ static struct syscall_fmt { .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, }, + { .name = "mount", + .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ }, + [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */ + .mask_val = SCAMV_MOUNT_FLAGS, /* flags */ }, }, }, { .name = "mprotect", .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, }, @@ -801,7 +810,8 @@ static struct syscall_fmt { { .name = "sendmsg", .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, { .name = "sendto", - .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, + .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, + [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, }, { .name = "set_tid_address", .errpid = true, }, { .name = "setitimer", .arg = { [0] = STRARRAY(which, itimers), }, }, @@ -830,6 +840,8 @@ static struct syscall_fmt { .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, { .name = "tkill", .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, + { .name = "umount2", .alias = "umount", + .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, }, { .name = "uname", .alias = "newuname", }, { .name = "unlinkat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, @@ -853,16 +865,30 @@ static struct syscall_fmt *syscall_fmt__find(const char *name) return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp); } +static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias) +{ + int i, nmemb = ARRAY_SIZE(syscall_fmts); + + for (i = 0; i < nmemb; ++i) { + if (syscall_fmts[i].alias && strcmp(syscall_fmts[i].alias, alias) == 0) + return &syscall_fmts[i]; + } + + return NULL; +} + /* * is_exit: is this "exit" or "exit_group"? * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter. + * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc. */ struct syscall { - struct event_format *tp_format; + struct tep_event_format *tp_format; int nr_args; + int args_size; bool is_exit; bool is_open; - struct format_field *args; + struct tep_format_field *args; const char *name; struct syscall_fmt *fmt; struct syscall_arg_fmt *arg_fmt; @@ -1095,11 +1121,21 @@ static void thread__set_filename_pos(struct thread *thread, const char *bf, ttrace->filename.entry_str_pos = bf - ttrace->entry_str; } +static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size) +{ + struct augmented_arg *augmented_arg = arg->augmented.args; + + return scnprintf(bf, size, "%.*s", augmented_arg->size, augmented_arg->value); +} + static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, struct syscall_arg *arg) { unsigned long ptr = arg->val; + if (arg->augmented.args) + return syscall_arg__scnprintf_augmented_string(arg, bf, size); + if (!arg->trace->vfs_getname) return scnprintf(bf, size, "%#x", ptr); @@ -1142,11 +1178,9 @@ static void sig_handler(int sig) interrupted = sig == SIGINT; } -static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread, - u64 duration, bool duration_calculated, u64 tstamp, FILE *fp) +static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp) { - size_t printed = trace__fprintf_tstamp(trace, tstamp, fp); - printed += fprintf_duration(duration, duration_calculated, fp); + size_t printed = 0; if (trace->multiple_threads) { if (trace->show_comm) @@ -1157,6 +1191,14 @@ static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thre return printed; } +static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread, + u64 duration, bool duration_calculated, u64 tstamp, FILE *fp) +{ + size_t printed = trace__fprintf_tstamp(trace, tstamp, fp); + printed += fprintf_duration(duration, duration_calculated, fp); + return printed + trace__fprintf_comm_tid(trace, thread, fp); +} + static int trace__process_event(struct trace *trace, struct machine *machine, union perf_event *event, struct perf_sample *sample) { @@ -1258,10 +1300,12 @@ static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args) static int syscall__set_arg_fmts(struct syscall *sc) { - struct format_field *field; + struct tep_format_field *field, *last_field = NULL; int idx = 0, len; for (field = sc->args; field; field = field->next, ++idx) { + last_field = field; + if (sc->fmt && sc->fmt->arg[idx].scnprintf) continue; @@ -1270,7 +1314,7 @@ static int syscall__set_arg_fmts(struct syscall *sc) strcmp(field->name, "path") == 0 || strcmp(field->name, "pathname") == 0)) sc->arg_fmt[idx].scnprintf = SCA_FILENAME; - else if (field->flags & FIELD_IS_POINTER) + else if (field->flags & TEP_FIELD_IS_POINTER) sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex; else if (strcmp(field->type, "pid_t") == 0) sc->arg_fmt[idx].scnprintf = SCA_PID; @@ -1292,6 +1336,9 @@ static int syscall__set_arg_fmts(struct syscall *sc) } } + if (last_field) + sc->args_size = last_field->offset + last_field->size; + return 0; } @@ -1459,6 +1506,19 @@ static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size, return scnprintf(bf, size, "arg%d: ", arg->idx); } +/* + * Check if the value is in fact zero, i.e. mask whatever needs masking, such + * as mount 'flags' argument that needs ignoring some magic flag, see comment + * in tools/perf/trace/beauty/mount_flags.c + */ +static unsigned long syscall__mask_val(struct syscall *sc, struct syscall_arg *arg, unsigned long val) +{ + if (sc->arg_fmt && sc->arg_fmt[arg->idx].mask_val) + return sc->arg_fmt[arg->idx].mask_val(arg, val); + + return val; +} + static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size, struct syscall_arg *arg, unsigned long val) { @@ -1472,14 +1532,18 @@ static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size, } static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, - unsigned char *args, struct trace *trace, - struct thread *thread) + unsigned char *args, void *augmented_args, int augmented_args_size, + struct trace *trace, struct thread *thread) { size_t printed = 0; unsigned long val; u8 bit = 1; struct syscall_arg arg = { .args = args, + .augmented = { + .size = augmented_args_size, + .args = augmented_args, + }, .idx = 0, .mask = 0, .trace = trace, @@ -1495,7 +1559,7 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, ttrace->ret_scnprintf = NULL; if (sc->args != NULL) { - struct format_field *field; + struct tep_format_field *field; for (field = sc->args; field; field = field->next, ++arg.idx, bit <<= 1) { @@ -1503,6 +1567,11 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, continue; val = syscall_arg__val(&arg, arg.idx); + /* + * Some syscall args need some mask, most don't and + * return val untouched. + */ + val = syscall__mask_val(sc, &arg, val); /* * Suppress this argument if its value is zero and @@ -1634,6 +1703,8 @@ static int trace__printf_interrupted_entry(struct trace *trace) printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str); ttrace->entry_pending = false; + ++trace->nr_events_printed; + return printed; } @@ -1654,6 +1725,32 @@ static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel, return printed; } +static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, bool raw_augmented) +{ + void *augmented_args = NULL; + /* + * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter + * and there we get all 6 syscall args plus the tracepoint common + * fields (sizeof(long)) and the syscall_nr (another long). So we check + * if that is the case and if so don't look after the sc->args_size, + * but always after the full raw_syscalls:sys_enter payload, which is + * fixed. + * + * We'll revisit this later to pass s->args_size to the BPF augmenter + * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it + * copies only what we need for each syscall, like what happens when we + * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace + * traffic to just what is needed for each syscall. + */ + int args_size = raw_augmented ? (8 * (int)sizeof(long)) : sc->args_size; + + *augmented_args_size = sample->raw_size - args_size; + if (*augmented_args_size > 0) + augmented_args = sample->raw_data + args_size; + + return augmented_args; +} + static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample) @@ -1663,6 +1760,8 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, size_t printed = 0; struct thread *thread; int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; + int augmented_args_size = 0; + void *augmented_args = NULL; struct syscall *sc = trace__syscall_info(trace, evsel, id); struct thread_trace *ttrace; @@ -1686,13 +1785,24 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) trace__printf_interrupted_entry(trace); - + /* + * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible + * arguments, even if the syscall being handled, say "openat", uses only 4 arguments + * this breaks syscall__augmented_args() check for augmented args, as we calculate + * syscall->args_size using each syscalls:sys_enter_NAME tracefs format file, + * so when handling, say the openat syscall, we end up getting 6 args for the + * raw_syscalls:sys_enter event, when we expected just 4, we end up mistakenly + * thinking that the extra 2 u64 args are the augmented filename, so just check + * here and avoid using augmented syscalls when the evsel is the raw_syscalls one. + */ + if (evsel != trace->syscalls.events.sys_enter) + augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls); ttrace->entry_time = sample->time; msg = ttrace->entry_str; printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name); printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed, - args, trace, thread); + args, augmented_args, augmented_args_size, trace, thread); if (sc->is_exit) { if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) { @@ -1723,7 +1833,8 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evse int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; struct syscall *sc = trace__syscall_info(trace, evsel, id); char msg[1024]; - void *args; + void *args, *augmented_args = NULL; + int augmented_args_size; if (sc == NULL) return -1; @@ -1738,7 +1849,8 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evse goto out_put; args = perf_evsel__sc_tp_ptr(evsel, args, sample); - syscall__scnprintf_args(sc, msg, sizeof(msg), args, trace, thread); + augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls); + syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread); fprintf(trace->output, "%s", msg); err = 0; out_put: @@ -1754,12 +1866,14 @@ static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evse int max_stack = evsel->attr.sample_max_stack ? evsel->attr.sample_max_stack : trace->max_stack; + int err; - if (machine__resolve(trace->host, &al, sample) < 0 || - thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack)) + if (machine__resolve(trace->host, &al, sample) < 0) return -1; - return 0; + err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack); + addr_location__put(&al); + return err; } static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample) @@ -1884,6 +1998,13 @@ errno_print: { fputc('\n', trace->output); + /* + * We only consider an 'event' for the sake of --max-events a non-filtered + * sys_enter + sys_exit and other tracepoint events. + */ + if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX) + interrupted = true; + if (callchain_ret > 0) trace__fprintf_callchain(trace, sample); else if (callchain_ret < 0) @@ -2016,13 +2137,25 @@ static void bpf_output__fprintf(struct trace *trace, { binary__fprintf(sample->raw_data, sample->raw_size, 8, bpf_output__printer, NULL, trace->output); + ++trace->nr_events_printed; } static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample) { + struct thread *thread; int callchain_ret = 0; + /* + * Check if we called perf_evsel__disable(evsel) due to, for instance, + * this event's max_events having been hit and this is an entry coming + * from the ring buffer that we should discard, since the max events + * have already been considered/printed. + */ + if (evsel->disabled) + return 0; + + thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); if (sample->callchain) { callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); @@ -2039,22 +2172,47 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, if (trace->trace_syscalls) fprintf(trace->output, "( ): "); + if (thread) + trace__fprintf_comm_tid(trace, thread, trace->output); + + if (evsel == trace->syscalls.events.augmented) { + int id = perf_evsel__sc_tp_uint(evsel, id, sample); + struct syscall *sc = trace__syscall_info(trace, evsel, id); + + if (sc) { + fprintf(trace->output, "%s(", sc->name); + trace__fprintf_sys_enter(trace, evsel, sample); + fputc(')', trace->output); + goto newline; + } + + /* + * XXX: Not having the associated syscall info or not finding/adding + * the thread should never happen, but if it does... + * fall thru and print it as a bpf_output event. + */ + } + fprintf(trace->output, "%s:", evsel->name); if (perf_evsel__is_bpf_output(evsel)) { - if (evsel == trace->syscalls.events.augmented) - trace__fprintf_sys_enter(trace, evsel, sample); - else - bpf_output__fprintf(trace, sample); + bpf_output__fprintf(trace, sample); } else if (evsel->tp_format) { if (strncmp(evsel->tp_format->name, "sys_enter_", 10) || trace__fprintf_sys_enter(trace, evsel, sample)) { event_format__fprintf(evsel->tp_format, sample->cpu, sample->raw_data, sample->raw_size, trace->output); + ++trace->nr_events_printed; + + if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) { + perf_evsel__disable(evsel); + perf_evsel__close(evsel); + } } } +newline: fprintf(trace->output, "\n"); if (callchain_ret > 0) @@ -2062,6 +2220,7 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, else if (callchain_ret < 0) pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); out: + thread__put(thread); return 0; } @@ -2148,6 +2307,8 @@ static int trace__pgfault(struct trace *trace, trace__fprintf_callchain(trace, sample); else if (callchain_ret < 0) pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); + + ++trace->nr_events_printed; out: err = 0; out_put: @@ -2325,6 +2486,9 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st tracepoint_handler handler = evsel->handler; handler(trace, evsel, event, sample); } + + if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX) + interrupted = true; } static int trace__add_syscall_newtp(struct trace *trace) @@ -2629,7 +2793,7 @@ next_event: int timeout = done ? 100 : -1; if (!draining && perf_evlist__poll(evlist, timeout) > 0) { - if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0) + if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0) draining = true; goto again; @@ -3061,6 +3225,7 @@ static int trace__parse_events_option(const struct option *opt, const char *str, int len = strlen(str) + 1, err = -1, list, idx; char *strace_groups_dir = system_path(STRACE_GROUPS_DIR); char group_name[PATH_MAX]; + struct syscall_fmt *fmt; if (strace_groups_dir == NULL) return -1; @@ -3078,12 +3243,19 @@ static int trace__parse_events_option(const struct option *opt, const char *str, if (syscalltbl__id(trace->sctbl, s) >= 0 || syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) { list = 1; + goto do_concat; + } + + fmt = syscall_fmt__find_by_alias(s); + if (fmt != NULL) { + list = 1; + s = fmt->name; } else { path__join(group_name, sizeof(group_name), strace_groups_dir, s); if (access(group_name, R_OK) == 0) list = 1; } - +do_concat: if (lists[list]) { sprintf(lists[list] + strlen(lists[list]), ",%s", s); } else { @@ -3172,6 +3344,7 @@ int cmd_trace(int argc, const char **argv) .trace_syscalls = false, .kernel_syscallchains = false, .max_stack = UINT_MAX, + .max_events = ULONG_MAX, }; const char *output_name = NULL; const struct option trace_options[] = { @@ -3224,6 +3397,8 @@ int cmd_trace(int argc, const char **argv) &record_parse_callchain_opt), OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains, "Show the kernel callchains on the syscall exit path"), + OPT_ULONG(0, "max-events", &trace.max_events, + "Set the maximum number of events to print, exit after that is reached. "), OPT_UINTEGER(0, "min-stack", &trace.min_stack, "Set the minimum stack depth when parsing the callchain, " "anything below the specified depth will be ignored."), @@ -3276,12 +3451,8 @@ int cmd_trace(int argc, const char **argv) goto out; } - if (evsel) { - if (perf_evsel__init_augmented_syscall_tp(evsel) || - perf_evsel__init_augmented_syscall_tp_args(evsel)) - goto out; + if (evsel) trace.syscalls.events.augmented = evsel; - } err = bpf__setup_stdout(trace.evlist); if (err) { @@ -3326,6 +3497,42 @@ int cmd_trace(int argc, const char **argv) } } + /* + * If we are augmenting syscalls, then combine what we put in the + * __augmented_syscalls__ BPF map with what is in the + * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF, + * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit. + * + * We'll switch to look at two BPF maps, one for sys_enter and the + * other for sys_exit when we start augmenting the sys_exit paths with + * buffers that are being copied from kernel to userspace, think 'read' + * syscall. + */ + if (trace.syscalls.events.augmented) { + evsel = trace.syscalls.events.augmented; + + if (perf_evsel__init_augmented_syscall_tp(evsel) || + perf_evsel__init_augmented_syscall_tp_args(evsel)) + goto out; + evsel->handler = trace__sys_enter; + + evlist__for_each_entry(trace.evlist, evsel) { + bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0; + + if (raw_syscalls_sys_exit) { + trace.raw_augmented_syscalls = true; + goto init_augmented_syscall_tp; + } + + if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) { +init_augmented_syscall_tp: + perf_evsel__init_augmented_syscall_tp(evsel); + perf_evsel__init_augmented_syscall_tp_ret(evsel); + evsel->handler = trace__sys_exit; + } + } + } + if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) return trace__record(&trace, argc-1, &argv[1]); diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 466540ee8ea7..9531f7bd7d9b 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -5,6 +5,7 @@ HEADERS=' include/uapi/drm/drm.h include/uapi/drm/i915_drm.h include/uapi/linux/fcntl.h +include/uapi/linux/fs.h include/uapi/linux/kcmp.h include/uapi/linux/kvm.h include/uapi/linux/in.h @@ -14,6 +15,7 @@ include/uapi/linux/sched.h include/uapi/linux/stat.h include/uapi/linux/vhost.h include/uapi/sound/asound.h +include/linux/bits.h include/linux/hash.h include/uapi/linux/hw_breakpoint.h arch/x86/include/asm/disabled-features.h diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt index 2d0caf20ff3a..bc6c585f74fc 100644 --- a/tools/perf/command-list.txt +++ b/tools/perf/command-list.txt @@ -30,3 +30,4 @@ perf-test mainporcelain common perf-timechart mainporcelain common perf-top mainporcelain common perf-trace mainporcelain audit +perf-version mainporcelain common diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/examples/bpf/augmented_raw_syscalls.c new file mode 100644 index 000000000000..90a19336310b --- /dev/null +++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Augment the raw_syscalls tracepoints with the contents of the pointer arguments. + * + * Test it with: + * + * perf trace -e tools/perf/examples/bpf/augmented_raw_syscalls.c cat /etc/passwd > /dev/null + * + * This exactly matches what is marshalled into the raw_syscall:sys_enter + * payload expected by the 'perf trace' beautifiers. + * + * For now it just uses the existing tracepoint augmentation code in 'perf + * trace', in the next csets we'll hook up these with the sys_enter/sys_exit + * code that will combine entry/exit in a strace like way. + */ + +#include +#include + +/* bpf-output associated map */ +struct bpf_map SEC("maps") __augmented_syscalls__ = { + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(u32), + .max_entries = __NR_CPUS__, +}; + +struct syscall_enter_args { + unsigned long long common_tp_fields; + long syscall_nr; + unsigned long args[6]; +}; + +struct syscall_exit_args { + unsigned long long common_tp_fields; + long syscall_nr; + long ret; +}; + +struct augmented_filename { + unsigned int size; + int reserved; + char value[256]; +}; + +#define SYS_OPEN 2 +#define SYS_OPENAT 257 + +SEC("raw_syscalls:sys_enter") +int sys_enter(struct syscall_enter_args *args) +{ + struct { + struct syscall_enter_args args; + struct augmented_filename filename; + } augmented_args; + unsigned int len = sizeof(augmented_args); + const void *filename_arg = NULL; + + probe_read(&augmented_args.args, sizeof(augmented_args.args), args); + /* + * Yonghong and Edward Cree sayz: + * + * https://www.spinics.net/lists/netdev/msg531645.html + * + * >> R0=inv(id=0) R1=inv2 R6=ctx(id=0,off=0,imm=0) R7=inv64 R10=fp0,call_-1 + * >> 10: (bf) r1 = r6 + * >> 11: (07) r1 += 16 + * >> 12: (05) goto pc+2 + * >> 15: (79) r3 = *(u64 *)(r1 +0) + * >> dereference of modified ctx ptr R1 off=16 disallowed + * > Aha, we at least got a different error message this time. + * > And indeed llvm has done that optimisation, rather than the more obvious + * > 11: r3 = *(u64 *)(r1 +16) + * > because it wants to have lots of reads share a single insn. You may be able + * > to defeat that optimisation by adding compiler barriers, idk. Maybe someone + * > with llvm knowledge can figure out how to stop it (ideally, llvm would know + * > when it's generating for bpf backend and not do that). -O0? ¯\_(ツ)_/¯ + * + * The optimization mostly likes below: + * + * br1: + * ... + * r1 += 16 + * goto merge + * br2: + * ... + * r1 += 20 + * goto merge + * merge: + * *(u64 *)(r1 + 0) + * + * The compiler tries to merge common loads. There is no easy way to + * stop this compiler optimization without turning off a lot of other + * optimizations. The easiest way is to add barriers: + * + * __asm__ __volatile__("": : :"memory") + * + * after the ctx memory access to prevent their down stream merging. + */ + switch (augmented_args.args.syscall_nr) { + case SYS_OPEN: filename_arg = (const void *)args->args[0]; + __asm__ __volatile__("": : :"memory"); + break; + case SYS_OPENAT: filename_arg = (const void *)args->args[1]; + break; + } + + if (filename_arg != NULL) { + augmented_args.filename.reserved = 0; + augmented_args.filename.size = probe_read_str(&augmented_args.filename.value, + sizeof(augmented_args.filename.value), + filename_arg); + if (augmented_args.filename.size < sizeof(augmented_args.filename.value)) { + len -= sizeof(augmented_args.filename.value) - augmented_args.filename.size; + len &= sizeof(augmented_args.filename.value) - 1; + } + } else { + len = sizeof(augmented_args.args); + } + + perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, &augmented_args, len); + return 0; +} + +SEC("raw_syscalls:sys_exit") +int sys_exit(struct syscall_exit_args *args) +{ + return 1; /* 0 as soon as we start copying data returned by the kernel, e.g. 'read' */ +} + +license(GPL); diff --git a/tools/perf/examples/bpf/augmented_syscalls.c b/tools/perf/examples/bpf/augmented_syscalls.c index 69a31386d8cd..2ae44813ef2d 100644 --- a/tools/perf/examples/bpf/augmented_syscalls.c +++ b/tools/perf/examples/bpf/augmented_syscalls.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Augment the openat syscall with the contents of the filename pointer argument. + * Augment syscalls with the contents of the pointer arguments. * * Test it with: * @@ -10,15 +10,14 @@ * the last one should be the one for '/etc/passwd'. * * This matches what is marshalled into the raw_syscall:sys_enter payload - * expected by the 'perf trace' beautifiers, and can be used by them unmodified, - * which will be done as that feature is implemented in the next csets, for now - * it will appear in a dump done by the default tracepoint handler in 'perf trace', - * that uses bpf_output__fprintf() to just dump those contents, as done with - * the bpf-output event associated with the __bpf_output__ map declared in - * tools/perf/include/bpf/stdio.h. + * expected by the 'perf trace' beautifiers, and can be used by them, that will + * check if perf_sample->raw_data is more than what is expected for each + * syscalls:sys_{enter,exit}_SYSCALL tracepoint, uing the extra data as the + * contents of pointer arguments. */ #include +#include struct bpf_map SEC("maps") __augmented_syscalls__ = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, @@ -27,6 +26,44 @@ struct bpf_map SEC("maps") __augmented_syscalls__ = { .max_entries = __NR_CPUS__, }; +struct syscall_exit_args { + unsigned long long common_tp_fields; + long syscall_nr; + long ret; +}; + +struct augmented_filename { + unsigned int size; + int reserved; + char value[256]; +}; + +#define augmented_filename_syscall(syscall) \ +struct augmented_enter_##syscall##_args { \ + struct syscall_enter_##syscall##_args args; \ + struct augmented_filename filename; \ +}; \ +int syscall_enter(syscall)(struct syscall_enter_##syscall##_args *args) \ +{ \ + struct augmented_enter_##syscall##_args augmented_args = { .filename.reserved = 0, }; \ + unsigned int len = sizeof(augmented_args); \ + probe_read(&augmented_args.args, sizeof(augmented_args.args), args); \ + augmented_args.filename.size = probe_read_str(&augmented_args.filename.value, \ + sizeof(augmented_args.filename.value), \ + args->filename_ptr); \ + if (augmented_args.filename.size < sizeof(augmented_args.filename.value)) { \ + len -= sizeof(augmented_args.filename.value) - augmented_args.filename.size; \ + len &= sizeof(augmented_args.filename.value) - 1; \ + } \ + perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, \ + &augmented_args, len); \ + return 0; \ +} \ +int syscall_exit(syscall)(struct syscall_exit_args *args) \ +{ \ + return 1; /* 0 as soon as we start copying data returned by the kernel, e.g. 'read' */ \ +} + struct syscall_enter_openat_args { unsigned long long common_tp_fields; long syscall_nr; @@ -36,20 +73,101 @@ struct syscall_enter_openat_args { long mode; }; -struct augmented_enter_openat_args { - struct syscall_enter_openat_args args; - char filename[64]; +augmented_filename_syscall(openat); + +struct syscall_enter_open_args { + unsigned long long common_tp_fields; + long syscall_nr; + char *filename_ptr; + long flags; + long mode; +}; + +augmented_filename_syscall(open); + +struct syscall_enter_inotify_add_watch_args { + unsigned long long common_tp_fields; + long syscall_nr; + long fd; + char *filename_ptr; + long mask; +}; + +augmented_filename_syscall(inotify_add_watch); + +struct statbuf; + +struct syscall_enter_newstat_args { + unsigned long long common_tp_fields; + long syscall_nr; + char *filename_ptr; + struct stat *statbuf; }; -int syscall_enter(openat)(struct syscall_enter_openat_args *args) -{ - struct augmented_enter_openat_args augmented_args; +augmented_filename_syscall(newstat); + +#ifndef _K_SS_MAXSIZE +#define _K_SS_MAXSIZE 128 +#endif - probe_read(&augmented_args.args, sizeof(augmented_args.args), args); - probe_read_str(&augmented_args.filename, sizeof(augmented_args.filename), args->filename_ptr); - perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, - &augmented_args, sizeof(augmented_args)); - return 1; +#define augmented_sockaddr_syscall(syscall) \ +struct augmented_enter_##syscall##_args { \ + struct syscall_enter_##syscall##_args args; \ + struct sockaddr_storage addr; \ +}; \ +int syscall_enter(syscall)(struct syscall_enter_##syscall##_args *args) \ +{ \ + struct augmented_enter_##syscall##_args augmented_args; \ + unsigned long addrlen = sizeof(augmented_args.addr); \ + probe_read(&augmented_args.args, sizeof(augmented_args.args), args); \ +/* FIXME_CLANG_OPTIMIZATION_THAT_ACCESSES_USER_CONTROLLED_ADDRLEN_DESPITE_THIS_CHECK */ \ +/* if (addrlen > augmented_args.args.addrlen) */ \ +/* addrlen = augmented_args.args.addrlen; */ \ +/* */ \ + probe_read(&augmented_args.addr, addrlen, args->addr_ptr); \ + perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, \ + &augmented_args, \ + sizeof(augmented_args) - sizeof(augmented_args.addr) + addrlen); \ + return 0; \ +} \ +int syscall_exit(syscall)(struct syscall_exit_args *args) \ +{ \ + return 1; /* 0 as soon as we start copying data returned by the kernel, e.g. 'read' */ \ } +struct sockaddr; + +struct syscall_enter_bind_args { + unsigned long long common_tp_fields; + long syscall_nr; + long fd; + struct sockaddr *addr_ptr; + unsigned long addrlen; +}; + +augmented_sockaddr_syscall(bind); + +struct syscall_enter_connect_args { + unsigned long long common_tp_fields; + long syscall_nr; + long fd; + struct sockaddr *addr_ptr; + unsigned long addrlen; +}; + +augmented_sockaddr_syscall(connect); + +struct syscall_enter_sendto_args { + unsigned long long common_tp_fields; + long syscall_nr; + long fd; + void *buff; + long len; + unsigned long flags; + struct sockaddr *addr_ptr; + long addr_len; +}; + +augmented_sockaddr_syscall(sendto); + license(GPL); diff --git a/tools/perf/examples/bpf/etcsnoop.c b/tools/perf/examples/bpf/etcsnoop.c new file mode 100644 index 000000000000..b59e8812ee8c --- /dev/null +++ b/tools/perf/examples/bpf/etcsnoop.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Augment the filename syscalls with the contents of the filename pointer argument + * filtering only those that do not start with /etc/. + * + * Test it with: + * + * perf trace -e tools/perf/examples/bpf/augmented_syscalls.c cat /etc/passwd > /dev/null + * + * It'll catch some openat syscalls related to the dynamic linked and + * the last one should be the one for '/etc/passwd'. + * + * This matches what is marshalled into the raw_syscall:sys_enter payload + * expected by the 'perf trace' beautifiers, and can be used by them unmodified, + * which will be done as that feature is implemented in the next csets, for now + * it will appear in a dump done by the default tracepoint handler in 'perf trace', + * that uses bpf_output__fprintf() to just dump those contents, as done with + * the bpf-output event associated with the __bpf_output__ map declared in + * tools/perf/include/bpf/stdio.h. + */ + +#include + +struct bpf_map SEC("maps") __augmented_syscalls__ = { + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(u32), + .max_entries = __NR_CPUS__, +}; + +struct augmented_filename { + int size; + int reserved; + char value[64]; +}; + +#define augmented_filename_syscall_enter(syscall) \ +struct augmented_enter_##syscall##_args { \ + struct syscall_enter_##syscall##_args args; \ + struct augmented_filename filename; \ +}; \ +int syscall_enter(syscall)(struct syscall_enter_##syscall##_args *args) \ +{ \ + char etc[6] = "/etc/"; \ + struct augmented_enter_##syscall##_args augmented_args = { .filename.reserved = 0, }; \ + probe_read(&augmented_args.args, sizeof(augmented_args.args), args); \ + augmented_args.filename.size = probe_read_str(&augmented_args.filename.value, \ + sizeof(augmented_args.filename.value), \ + args->filename_ptr); \ + if (__builtin_memcmp(augmented_args.filename.value, etc, 4) != 0) \ + return 0; \ + perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, \ + &augmented_args, \ + (sizeof(augmented_args) - sizeof(augmented_args.filename.value) + \ + augmented_args.filename.size)); \ + return 0; \ +} + +struct syscall_enter_openat_args { + unsigned long long common_tp_fields; + long syscall_nr; + long dfd; + char *filename_ptr; + long flags; + long mode; +}; + +augmented_filename_syscall_enter(openat); + +struct syscall_enter_open_args { + unsigned long long common_tp_fields; + long syscall_nr; + char *filename_ptr; + long flags; + long mode; +}; + +augmented_filename_syscall_enter(open); + +license(GPL); diff --git a/tools/perf/include/bpf/bpf.h b/tools/perf/include/bpf/bpf.h index 47897d65e799..52b6d87fe822 100644 --- a/tools/perf/include/bpf/bpf.h +++ b/tools/perf/include/bpf/bpf.h @@ -26,6 +26,9 @@ struct bpf_map { #define syscall_enter(name) \ SEC("syscalls:sys_enter_" #name) syscall_enter_ ## name +#define syscall_exit(name) \ + SEC("syscalls:sys_exit_" #name) syscall_exit_ ## name + #define license(name) \ char _license[] SEC("license") = #name; \ int _version SEC("version") = LINUX_VERSION_CODE; diff --git a/tools/perf/include/bpf/linux/socket.h b/tools/perf/include/bpf/linux/socket.h new file mode 100644 index 000000000000..7f844568dab8 --- /dev/null +++ b/tools/perf/include/bpf/linux/socket.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_SOCKET_H +#define _UAPI_LINUX_SOCKET_H + +/* + * Desired design of maximum size and alignment (see RFC2553) + */ +#define _K_SS_MAXSIZE 128 /* Implementation specific max size */ +#define _K_SS_ALIGNSIZE (__alignof__ (struct sockaddr *)) + /* Implementation specific desired alignment */ + +typedef unsigned short __kernel_sa_family_t; + +struct __kernel_sockaddr_storage { + __kernel_sa_family_t ss_family; /* address family */ + /* Following field(s) are implementation specific */ + char __data[_K_SS_MAXSIZE - sizeof(unsigned short)]; + /* space to achieve desired size, */ + /* _SS_MAXSIZE value minus size of ss_family */ +} __attribute__ ((aligned(_K_SS_ALIGNSIZE))); /* force desired alignment */ + +#define sockaddr_storage __kernel_sockaddr_storage + +#endif /* _UAPI_LINUX_SOCKET_H */ diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c index ac1bcdc17dae..f7eb63cbbc65 100644 --- a/tools/perf/jvmti/jvmti_agent.c +++ b/tools/perf/jvmti/jvmti_agent.c @@ -125,7 +125,7 @@ perf_get_timestamp(void) } static int -debug_cache_init(void) +create_jit_cache_dir(void) { char str[32]; char *base, *p; @@ -144,8 +144,13 @@ debug_cache_init(void) strftime(str, sizeof(str), JIT_LANG"-jit-%Y%m%d", &tm); - snprintf(jit_path, PATH_MAX - 1, "%s/.debug/", base); - + ret = snprintf(jit_path, PATH_MAX, "%s/.debug/", base); + if (ret >= PATH_MAX) { + warnx("jvmti: cannot generate jit cache dir because %s/.debug/" + " is too long, please check the cwd, JITDUMPDIR, and" + " HOME variables", base); + return -1; + } ret = mkdir(jit_path, 0755); if (ret == -1) { if (errno != EEXIST) { @@ -154,20 +159,32 @@ debug_cache_init(void) } } - snprintf(jit_path, PATH_MAX - 1, "%s/.debug/jit", base); + ret = snprintf(jit_path, PATH_MAX, "%s/.debug/jit", base); + if (ret >= PATH_MAX) { + warnx("jvmti: cannot generate jit cache dir because" + " %s/.debug/jit is too long, please check the cwd," + " JITDUMPDIR, and HOME variables", base); + return -1; + } ret = mkdir(jit_path, 0755); if (ret == -1) { if (errno != EEXIST) { - warn("cannot create jit cache dir %s", jit_path); + warn("jvmti: cannot create jit cache dir %s", jit_path); return -1; } } - snprintf(jit_path, PATH_MAX - 1, "%s/.debug/jit/%s.XXXXXXXX", base, str); - + ret = snprintf(jit_path, PATH_MAX, "%s/.debug/jit/%s.XXXXXXXX", base, str); + if (ret >= PATH_MAX) { + warnx("jvmti: cannot generate jit cache dir because" + " %s/.debug/jit/%s.XXXXXXXX is too long, please check" + " the cwd, JITDUMPDIR, and HOME variables", + base, str); + return -1; + } p = mkdtemp(jit_path); if (p != jit_path) { - warn("cannot create jit cache dir %s", jit_path); + warn("jvmti: cannot create jit cache dir %s", jit_path); return -1; } @@ -228,7 +245,7 @@ void *jvmti_open(void) { char dump_path[PATH_MAX]; struct jitheader header; - int fd; + int fd, ret; FILE *fp; init_arch_timestamp(); @@ -245,12 +262,22 @@ void *jvmti_open(void) memset(&header, 0, sizeof(header)); - debug_cache_init(); + /* + * jitdump file dir + */ + if (create_jit_cache_dir() < 0) + return NULL; /* * jitdump file name */ - scnprintf(dump_path, PATH_MAX, "%s/jit-%i.dump", jit_path, getpid()); + ret = snprintf(dump_path, PATH_MAX, "%s/jit-%i.dump", jit_path, getpid()); + if (ret >= PATH_MAX) { + warnx("jvmti: cannot generate jitdump file full path because" + " %s/jit-%i.dump is too long, please check the cwd," + " JITDUMPDIR, and HOME variables", jit_path, getpid()); + return NULL; + } fd = open(dump_path, O_CREAT|O_TRUNC|O_RDWR, 0666); if (fd == -1) diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 21bf7f5a3cf5..0ed4a34c74c4 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -81,6 +81,7 @@ struct record_opts { unsigned initial_delay; bool use_clockid; clockid_t clockid; + u64 clockid_res_ns; unsigned int proc_map_timeout; }; diff --git a/tools/perf/pmu-events/arch/arm64/ampere/emag/branch.json b/tools/perf/pmu-events/arch/arm64/ampere/emag/branch.json new file mode 100644 index 000000000000..abc98b018446 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/emag/branch.json @@ -0,0 +1,23 @@ +[ + { + "ArchStdEvent": "BR_IMMED_SPEC", + }, + { + "ArchStdEvent": "BR_RETURN_SPEC", + }, + { + "ArchStdEvent": "BR_INDIRECT_SPEC", + }, + { + "PublicDescription": "Mispredicted or not predicted branch speculatively executed", + "EventCode": "0x10", + "EventName": "BR_MIS_PRED", + "BriefDescription": "Branch mispredicted" + }, + { + "PublicDescription": "Predictable branch speculatively executed", + "EventCode": "0x12", + "EventName": "BR_PRED", + "BriefDescription": "Predictable branch" + }, +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/emag/bus.json b/tools/perf/pmu-events/arch/arm64/ampere/emag/bus.json new file mode 100644 index 000000000000..687b2629e1d1 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/emag/bus.json @@ -0,0 +1,26 @@ +[ + { + "ArchStdEvent": "BUS_ACCESS_RD", + }, + { + "ArchStdEvent": "BUS_ACCESS_WR", + }, + { + "ArchStdEvent": "BUS_ACCESS_SHARED", + }, + { + "ArchStdEvent": "BUS_ACCESS_NOT_SHARED", + }, + { + "ArchStdEvent": "BUS_ACCESS_NORMAL", + }, + { + "ArchStdEvent": "BUS_ACCESS_PERIPH", + }, + { + "PublicDescription": "Bus access", + "EventCode": "0x19", + "EventName": "BUS_ACCESS", + "BriefDescription": "Bus access" + }, +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/emag/cache.json b/tools/perf/pmu-events/arch/arm64/ampere/emag/cache.json new file mode 100644 index 000000000000..df9201434cb6 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/emag/cache.json @@ -0,0 +1,191 @@ +[ + { + "ArchStdEvent": "L1D_CACHE_RD", + }, + { + "ArchStdEvent": "L1D_CACHE_WR", + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL_RD", + }, + { + "ArchStdEvent": "L1D_CACHE_INVAL", + }, + { + "ArchStdEvent": "L1D_TLB_REFILL_RD", + }, + { + "ArchStdEvent": "L1D_TLB_REFILL_WR", + }, + { + "ArchStdEvent": "L2D_CACHE_RD", + }, + { + "ArchStdEvent": "L2D_CACHE_WR", + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL_RD", + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL_WR", + }, + { + "ArchStdEvent": "L2D_CACHE_WB_VICTIM", + }, + { + "ArchStdEvent": "L2D_CACHE_WB_CLEAN", + }, + { + "ArchStdEvent": "L2D_CACHE_INVAL", + }, + { + "PublicDescription": "Level 1 instruction cache refill", + "EventCode": "0x01", + "EventName": "L1I_CACHE_REFILL", + "BriefDescription": "L1I cache refill" + }, + { + "PublicDescription": "Level 1 instruction TLB refill", + "EventCode": "0x02", + "EventName": "L1I_TLB_REFILL", + "BriefDescription": "L1I TLB refill" + }, + { + "PublicDescription": "Level 1 data cache refill", + "EventCode": "0x03", + "EventName": "L1D_CACHE_REFILL", + "BriefDescription": "L1D cache refill" + }, + { + "PublicDescription": "Level 1 data cache access", + "EventCode": "0x04", + "EventName": "L1D_CACHE_ACCESS", + "BriefDescription": "L1D cache access" + }, + { + "PublicDescription": "Level 1 data TLB refill", + "EventCode": "0x05", + "EventName": "L1D_TLB_REFILL", + "BriefDescription": "L1D TLB refill" + }, + { + "PublicDescription": "Level 1 instruction cache access", + "EventCode": "0x14", + "EventName": "L1I_CACHE_ACCESS", + "BriefDescription": "L1I cache access" + }, + { + "PublicDescription": "Level 2 data cache access", + "EventCode": "0x16", + "EventName": "L2D_CACHE_ACCESS", + "BriefDescription": "L2D cache access" + }, + { + "PublicDescription": "Level 2 data refill", + "EventCode": "0x17", + "EventName": "L2D_CACHE_REFILL", + "BriefDescription": "L2D cache refill" + }, + { + "PublicDescription": "Level 2 data cache, Write-Back", + "EventCode": "0x18", + "EventName": "L2D_CACHE_WB", + "BriefDescription": "L2D cache Write-Back" + }, + { + "PublicDescription": "Level 1 data TLB access. This event counts any load or store operation which accesses the data L1 TLB", + "EventCode": "0x25", + "EventName": "L1D_TLB_ACCESS", + "BriefDescription": "L1D TLB access" + }, + { + "PublicDescription": "Level 1 instruction TLB access. This event counts any instruction fetch which accesses the instruction L1 TLB", + "EventCode": "0x26", + "EventName": "L1I_TLB_ACCESS", + "BriefDescription": "L1I TLB access" + }, + { + "PublicDescription": "Level 2 access to data TLB that caused a page table walk. This event counts on any data access which causes L2D_TLB_REFILL to count", + "EventCode": "0x34", + "EventName": "L2D_TLB_ACCESS", + "BriefDescription": "L2D TLB access" + }, + { + "PublicDescription": "Level 2 access to instruciton TLB that caused a page table walk. This event counts on any instruciton access which causes L2I_TLB_REFILL to count", + "EventCode": "0x35", + "EventName": "L2I_TLB_ACCESS", + "BriefDescription": "L2D TLB access" + }, + { + "PublicDescription": "Branch target buffer misprediction", + "EventCode": "0x102", + "EventName": "BTB_MIS_PRED", + "BriefDescription": "BTB misprediction" + }, + { + "PublicDescription": "ITB miss", + "EventCode": "0x103", + "EventName": "ITB_MISS", + "BriefDescription": "ITB miss" + }, + { + "PublicDescription": "DTB miss", + "EventCode": "0x104", + "EventName": "DTB_MISS", + "BriefDescription": "DTB miss" + }, + { + "PublicDescription": "Level 1 data cache late miss", + "EventCode": "0x105", + "EventName": "L1D_CACHE_LATE_MISS", + "BriefDescription": "L1D cache late miss" + }, + { + "PublicDescription": "Level 1 data cache prefetch request", + "EventCode": "0x106", + "EventName": "L1D_CACHE_PREFETCH", + "BriefDescription": "L1D cache prefetch" + }, + { + "PublicDescription": "Level 2 data cache prefetch request", + "EventCode": "0x107", + "EventName": "L2D_CACHE_PREFETCH", + "BriefDescription": "L2D cache prefetch" + }, + { + "PublicDescription": "Level 1 stage 2 TLB refill", + "EventCode": "0x111", + "EventName": "L1_STAGE2_TLB_REFILL", + "BriefDescription": "L1 stage 2 TLB refill" + }, + { + "PublicDescription": "Page walk cache level-0 stage-1 hit", + "EventCode": "0x112", + "EventName": "PAGE_WALK_L0_STAGE1_HIT", + "BriefDescription": "Page walk, L0 stage-1 hit" + }, + { + "PublicDescription": "Page walk cache level-1 stage-1 hit", + "EventCode": "0x113", + "EventName": "PAGE_WALK_L1_STAGE1_HIT", + "BriefDescription": "Page walk, L1 stage-1 hit" + }, + { + "PublicDescription": "Page walk cache level-2 stage-1 hit", + "EventCode": "0x114", + "EventName": "PAGE_WALK_L2_STAGE1_HIT", + "BriefDescription": "Page walk, L2 stage-1 hit" + }, + { + "PublicDescription": "Page walk cache level-1 stage-2 hit", + "EventCode": "0x115", + "EventName": "PAGE_WALK_L1_STAGE2_HIT", + "BriefDescription": "Page walk, L1 stage-2 hit" + }, + { + "PublicDescription": "Page walk cache level-2 stage-2 hit", + "EventCode": "0x116", + "EventName": "PAGE_WALK_L2_STAGE2_HIT", + "BriefDescription": "Page walk, L2 stage-2 hit" + }, +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/emag/clock.json b/tools/perf/pmu-events/arch/arm64/ampere/emag/clock.json new file mode 100644 index 000000000000..38cd1f1a70dc --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/emag/clock.json @@ -0,0 +1,20 @@ +[ + { + "PublicDescription": "The number of core clock cycles", + "EventCode": "0x11", + "EventName": "CPU_CYCLES", + "BriefDescription": "Clock cycles" + }, + { + "PublicDescription": "FSU clocking gated off cycle", + "EventCode": "0x101", + "EventName": "FSU_CLOCK_OFF_CYCLES", + "BriefDescription": "FSU clocking gated off cycle" + }, + { + "PublicDescription": "Wait state cycle", + "EventCode": "0x110", + "EventName": "Wait_CYCLES", + "BriefDescription": "Wait state cycle" + }, +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/emag/core-imp-def.json b/tools/perf/pmu-events/arch/arm64/ampere/emag/core-imp-def.json deleted file mode 100644 index bc03c06c3918..000000000000 --- a/tools/perf/pmu-events/arch/arm64/ampere/emag/core-imp-def.json +++ /dev/null @@ -1,32 +0,0 @@ -[ - { - "ArchStdEvent": "L1D_CACHE_RD", - }, - { - "ArchStdEvent": "L1D_CACHE_WR", - }, - { - "ArchStdEvent": "L1D_CACHE_REFILL_RD", - }, - { - "ArchStdEvent": "L1D_CACHE_REFILL_WR", - }, - { - "ArchStdEvent": "L1D_TLB_REFILL_RD", - }, - { - "ArchStdEvent": "L1D_TLB_REFILL_WR", - }, - { - "ArchStdEvent": "L1D_TLB_RD", - }, - { - "ArchStdEvent": "L1D_TLB_WR", - }, - { - "ArchStdEvent": "BUS_ACCESS_RD", - }, - { - "ArchStdEvent": "BUS_ACCESS_WR", - } -] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/emag/exception.json b/tools/perf/pmu-events/arch/arm64/ampere/emag/exception.json new file mode 100644 index 000000000000..3720dc28a15f --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/emag/exception.json @@ -0,0 +1,50 @@ +[ + { + "ArchStdEvent": "EXC_UNDEF", + }, + { + "ArchStdEvent": "EXC_SVC", + }, + { + "ArchStdEvent": "EXC_PABORT", + }, + { + "ArchStdEvent": "EXC_DABORT", + }, + { + "ArchStdEvent": "EXC_IRQ", + }, + { + "ArchStdEvent": "EXC_FIQ", + }, + { + "ArchStdEvent": "EXC_HVC", + }, + { + "ArchStdEvent": "EXC_TRAP_PABORT", + }, + { + "ArchStdEvent": "EXC_TRAP_DABORT", + }, + { + "ArchStdEvent": "EXC_TRAP_OTHER", + }, + { + "ArchStdEvent": "EXC_TRAP_IRQ", + }, + { + "ArchStdEvent": "EXC_TRAP_FIQ", + }, + { + "PublicDescription": "Exception taken", + "EventCode": "0x09", + "EventName": "EXC_TAKEN", + "BriefDescription": "Exception taken" + }, + { + "PublicDescription": "Instruction architecturally executed, condition check pass, exception return", + "EventCode": "0x0a", + "EventName": "EXC_RETURN", + "BriefDescription": "Exception return" + }, +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/emag/instruction.json b/tools/perf/pmu-events/arch/arm64/ampere/emag/instruction.json new file mode 100644 index 000000000000..82cf753e6472 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/emag/instruction.json @@ -0,0 +1,89 @@ +[ + { + "ArchStdEvent": "LD_SPEC", + }, + { + "ArchStdEvent": "ST_SPEC", + }, + { + "ArchStdEvent": "LDST_SPEC", + }, + { + "ArchStdEvent": "DP_SPEC", + }, + { + "ArchStdEvent": "ASE_SPEC", + }, + { + "ArchStdEvent": "VFP_SPEC", + }, + { + "ArchStdEvent": "PC_WRITE_SPEC", + }, + { + "ArchStdEvent": "CRYPTO_SPEC", + }, + { + "ArchStdEvent": "ISB_SPEC", + }, + { + "ArchStdEvent": "DSB_SPEC", + }, + { + "ArchStdEvent": "DMB_SPEC", + }, + { + "ArchStdEvent": "RC_LD_SPEC", + }, + { + "ArchStdEvent": "RC_ST_SPEC", + }, + { + "PublicDescription": "Instruction architecturally executed, software increment", + "EventCode": "0x00", + "EventName": "SW_INCR", + "BriefDescription": "Software increment" + }, + { + "PublicDescription": "Instruction architecturally executed", + "EventCode": "0x08", + "EventName": "INST_RETIRED", + "BriefDescription": "Instruction retired" + }, + { + "PublicDescription": "Instruction architecturally executed, condition code check pass, write to CONTEXTIDR", + "EventCode": "0x0b", + "EventName": "CID_WRITE_RETIRED", + "BriefDescription": "Write to CONTEXTIDR" + }, + { + "PublicDescription": "Operation speculatively executed", + "EventCode": "0x1b", + "EventName": "INST_SPEC", + "BriefDescription": "Speculatively executed" + }, + { + "PublicDescription": "Instruction architecturally executed (condition check pass), write to TTBR", + "EventCode": "0x1c", + "EventName": "TTBR_WRITE_RETIRED", + "BriefDescription": "Instruction executed, TTBR write" + }, + { + "PublicDescription": "Instruction architecturally executed, branch. This event counts all branches, taken or not. This excludes exception entries, debug entries and CCFAIL branches", + "EventCode": "0x21", + "EventName": "BR_RETIRED", + "BriefDescription": "Branch retired" + }, + { + "PublicDescription": "Instruction architecturally executed, mispredicted branch. This event counts any branch counted by BR_RETIRED which is not correctly predicted and causes a pipeline flush", + "EventCode": "0x22", + "EventName": "BR_MISPRED_RETIRED", + "BriefDescription": "Mispredicted branch retired" + }, + { + "PublicDescription": "Operation speculatively executed, NOP", + "EventCode": "0x100", + "EventName": "NOP_SPEC", + "BriefDescription": "Speculatively executed, NOP" + }, +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/emag/intrinsic.json b/tools/perf/pmu-events/arch/arm64/ampere/emag/intrinsic.json new file mode 100644 index 000000000000..2aecc5c2347d --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/emag/intrinsic.json @@ -0,0 +1,14 @@ +[ + { + "ArchStdEvent": "LDREX_SPEC", + }, + { + "ArchStdEvent": "STREX_PASS_SPEC", + }, + { + "ArchStdEvent": "STREX_FAIL_SPEC", + }, + { + "ArchStdEvent": "STREX_SPEC", + }, +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/emag/memory.json b/tools/perf/pmu-events/arch/arm64/ampere/emag/memory.json new file mode 100644 index 000000000000..08508697b318 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/emag/memory.json @@ -0,0 +1,29 @@ +[ + { + "ArchStdEvent": "MEM_ACCESS_RD", + }, + { + "ArchStdEvent": "MEM_ACCESS_WR", + }, + { + "ArchStdEvent": "UNALIGNED_LD_SPEC", + }, + { + "ArchStdEvent": "UNALIGNED_ST_SPEC", + }, + { + "ArchStdEvent": "UNALIGNED_LDST_SPEC", + }, + { + "PublicDescription": "Data memory access", + "EventCode": "0x13", + "EventName": "MEM_ACCESS", + "BriefDescription": "Memory access" + }, + { + "PublicDescription": "Local memory error. This event counts any correctable or uncorrectable memory error (ECC or parity) in the protected core RAMs", + "EventCode": "0x1a", + "EventName": "MEM_ERROR", + "BriefDescription": "Memory error" + }, +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/emag/pipeline.json b/tools/perf/pmu-events/arch/arm64/ampere/emag/pipeline.json new file mode 100644 index 000000000000..e2087de586bf --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/emag/pipeline.json @@ -0,0 +1,50 @@ +[ + { + "PublicDescription": "Decode starved for instruction cycle", + "EventCode": "0x108", + "EventName": "DECODE_STALL", + "BriefDescription": "Decode starved" + }, + { + "PublicDescription": "Op dispatch stalled cycle", + "EventCode": "0x109", + "EventName": "DISPATCH_STALL", + "BriefDescription": "Dispatch stalled" + }, + { + "PublicDescription": "IXA Op non-issue", + "EventCode": "0x10a", + "EventName": "IXA_STALL", + "BriefDescription": "IXA stalled" + }, + { + "PublicDescription": "IXB Op non-issue", + "EventCode": "0x10b", + "EventName": "IXB_STALL", + "BriefDescription": "IXB stalled" + }, + { + "PublicDescription": "BX Op non-issue", + "EventCode": "0x10c", + "EventName": "BX_STALL", + "BriefDescription": "BX stalled" + }, + { + "PublicDescription": "LX Op non-issue", + "EventCode": "0x10d", + "EventName": "LX_STALL", + "BriefDescription": "LX stalled" + }, + { + "PublicDescription": "SX Op non-issue", + "EventCode": "0x10e", + "EventName": "SX_STALL", + "BriefDescription": "SX stalled" + }, + { + "PublicDescription": "FX Op non-issue", + "EventCode": "0x10f", + "EventName": "FX_STALL", + "BriefDescription": "FX stalled" + }, +] diff --git a/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json b/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json index d40498f2cb1e..635c09fda1d9 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json @@ -188,7 +188,7 @@ "Counter": "0,1,2,3", "EventCode": "0xb", "EventName": "UNC_P_FREQ_GE_1200MHZ_CYCLES", - "Filter": "filter_band0=1200", + "Filter": "filter_band0=12", "MetricExpr": "(UNC_P_FREQ_GE_1200MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_1200mhz_cycles %", "PerPkg": "1", @@ -199,7 +199,7 @@ "Counter": "0,1,2,3", "EventCode": "0xc", "EventName": "UNC_P_FREQ_GE_2000MHZ_CYCLES", - "Filter": "filter_band1=2000", + "Filter": "filter_band1=20", "MetricExpr": "(UNC_P_FREQ_GE_2000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_2000mhz_cycles %", "PerPkg": "1", @@ -210,7 +210,7 @@ "Counter": "0,1,2,3", "EventCode": "0xd", "EventName": "UNC_P_FREQ_GE_3000MHZ_CYCLES", - "Filter": "filter_band2=3000", + "Filter": "filter_band2=30", "MetricExpr": "(UNC_P_FREQ_GE_3000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_3000mhz_cycles %", "PerPkg": "1", @@ -221,7 +221,7 @@ "Counter": "0,1,2,3", "EventCode": "0xe", "EventName": "UNC_P_FREQ_GE_4000MHZ_CYCLES", - "Filter": "filter_band3=4000", + "Filter": "filter_band3=40", "MetricExpr": "(UNC_P_FREQ_GE_4000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_4000mhz_cycles %", "PerPkg": "1", @@ -232,7 +232,7 @@ "Counter": "0,1,2,3", "EventCode": "0xb", "EventName": "UNC_P_FREQ_GE_1200MHZ_TRANSITIONS", - "Filter": "edge=1,filter_band0=1200", + "Filter": "edge=1,filter_band0=12", "MetricExpr": "(UNC_P_FREQ_GE_1200MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_1200mhz_cycles %", "PerPkg": "1", @@ -243,7 +243,7 @@ "Counter": "0,1,2,3", "EventCode": "0xc", "EventName": "UNC_P_FREQ_GE_2000MHZ_TRANSITIONS", - "Filter": "edge=1,filter_band1=2000", + "Filter": "edge=1,filter_band1=20", "MetricExpr": "(UNC_P_FREQ_GE_2000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_2000mhz_cycles %", "PerPkg": "1", @@ -254,7 +254,7 @@ "Counter": "0,1,2,3", "EventCode": "0xd", "EventName": "UNC_P_FREQ_GE_3000MHZ_TRANSITIONS", - "Filter": "edge=1,filter_band2=4000", + "Filter": "edge=1,filter_band2=30", "MetricExpr": "(UNC_P_FREQ_GE_3000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_3000mhz_cycles %", "PerPkg": "1", @@ -265,7 +265,7 @@ "Counter": "0,1,2,3", "EventCode": "0xe", "EventName": "UNC_P_FREQ_GE_4000MHZ_TRANSITIONS", - "Filter": "edge=1,filter_band3=4000", + "Filter": "edge=1,filter_band3=40", "MetricExpr": "(UNC_P_FREQ_GE_4000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_4000mhz_cycles %", "PerPkg": "1", diff --git a/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json b/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json index 16034bfd06dd..8755693d86c6 100644 --- a/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json +++ b/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json @@ -187,7 +187,7 @@ "Counter": "0,1,2,3", "EventCode": "0xb", "EventName": "UNC_P_FREQ_GE_1200MHZ_CYCLES", - "Filter": "filter_band0=1200", + "Filter": "filter_band0=12", "MetricExpr": "(UNC_P_FREQ_GE_1200MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_1200mhz_cycles %", "PerPkg": "1", @@ -198,7 +198,7 @@ "Counter": "0,1,2,3", "EventCode": "0xc", "EventName": "UNC_P_FREQ_GE_2000MHZ_CYCLES", - "Filter": "filter_band1=2000", + "Filter": "filter_band1=20", "MetricExpr": "(UNC_P_FREQ_GE_2000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_2000mhz_cycles %", "PerPkg": "1", @@ -209,7 +209,7 @@ "Counter": "0,1,2,3", "EventCode": "0xd", "EventName": "UNC_P_FREQ_GE_3000MHZ_CYCLES", - "Filter": "filter_band2=3000", + "Filter": "filter_band2=30", "MetricExpr": "(UNC_P_FREQ_GE_3000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_3000mhz_cycles %", "PerPkg": "1", @@ -220,7 +220,7 @@ "Counter": "0,1,2,3", "EventCode": "0xe", "EventName": "UNC_P_FREQ_GE_4000MHZ_CYCLES", - "Filter": "filter_band3=4000", + "Filter": "filter_band3=40", "MetricExpr": "(UNC_P_FREQ_GE_4000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_4000mhz_cycles %", "PerPkg": "1", @@ -231,7 +231,7 @@ "Counter": "0,1,2,3", "EventCode": "0xb", "EventName": "UNC_P_FREQ_GE_1200MHZ_TRANSITIONS", - "Filter": "edge=1,filter_band0=1200", + "Filter": "edge=1,filter_band0=12", "MetricExpr": "(UNC_P_FREQ_GE_1200MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_1200mhz_cycles %", "PerPkg": "1", @@ -242,7 +242,7 @@ "Counter": "0,1,2,3", "EventCode": "0xc", "EventName": "UNC_P_FREQ_GE_2000MHZ_TRANSITIONS", - "Filter": "edge=1,filter_band1=2000", + "Filter": "edge=1,filter_band1=20", "MetricExpr": "(UNC_P_FREQ_GE_2000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_2000mhz_cycles %", "PerPkg": "1", @@ -253,7 +253,7 @@ "Counter": "0,1,2,3", "EventCode": "0xd", "EventName": "UNC_P_FREQ_GE_3000MHZ_TRANSITIONS", - "Filter": "edge=1,filter_band2=4000", + "Filter": "edge=1,filter_band2=30", "MetricExpr": "(UNC_P_FREQ_GE_3000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_3000mhz_cycles %", "PerPkg": "1", @@ -264,7 +264,7 @@ "Counter": "0,1,2,3", "EventCode": "0xe", "EventName": "UNC_P_FREQ_GE_4000MHZ_TRANSITIONS", - "Filter": "edge=1,filter_band3=4000", + "Filter": "edge=1,filter_band3=40", "MetricExpr": "(UNC_P_FREQ_GE_4000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.", "MetricName": "freq_ge_4000mhz_cycles %", "PerPkg": "1", diff --git a/tools/perf/scripts/python/call-graph-from-sql.py b/tools/perf/scripts/python/call-graph-from-sql.py deleted file mode 100644 index b494a67a1c67..000000000000 --- a/tools/perf/scripts/python/call-graph-from-sql.py +++ /dev/null @@ -1,339 +0,0 @@ -#!/usr/bin/python2 -# call-graph-from-sql.py: create call-graph from sql database -# Copyright (c) 2014-2017, Intel Corporation. -# -# This program is free software; you can redistribute it and/or modify it -# under the terms and conditions of the GNU General Public License, -# version 2, as published by the Free Software Foundation. -# -# This program is distributed in the hope it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -# more details. - -# To use this script you will need to have exported data using either the -# export-to-sqlite.py or the export-to-postgresql.py script. Refer to those -# scripts for details. -# -# Following on from the example in the export scripts, a -# call-graph can be displayed for the pt_example database like this: -# -# python tools/perf/scripts/python/call-graph-from-sql.py pt_example -# -# Note that for PostgreSQL, this script supports connecting to remote databases -# by setting hostname, port, username, password, and dbname e.g. -# -# python tools/perf/scripts/python/call-graph-from-sql.py "hostname=myhost username=myuser password=mypassword dbname=pt_example" -# -# The result is a GUI window with a tree representing a context-sensitive -# call-graph. Expanding a couple of levels of the tree and adjusting column -# widths to suit will display something like: -# -# Call Graph: pt_example -# Call Path Object Count Time(ns) Time(%) Branch Count Branch Count(%) -# v- ls -# v- 2638:2638 -# v- _start ld-2.19.so 1 10074071 100.0 211135 100.0 -# |- unknown unknown 1 13198 0.1 1 0.0 -# >- _dl_start ld-2.19.so 1 1400980 13.9 19637 9.3 -# >- _d_linit_internal ld-2.19.so 1 448152 4.4 11094 5.3 -# v-__libc_start_main@plt ls 1 8211741 81.5 180397 85.4 -# >- _dl_fixup ld-2.19.so 1 7607 0.1 108 0.1 -# >- __cxa_atexit libc-2.19.so 1 11737 0.1 10 0.0 -# >- __libc_csu_init ls 1 10354 0.1 10 0.0 -# |- _setjmp libc-2.19.so 1 0 0.0 4 0.0 -# v- main ls 1 8182043 99.6 180254 99.9 -# -# Points to note: -# The top level is a command name (comm) -# The next level is a thread (pid:tid) -# Subsequent levels are functions -# 'Count' is the number of calls -# 'Time' is the elapsed time until the function returns -# Percentages are relative to the level above -# 'Branch Count' is the total number of branches for that function and all -# functions that it calls - -import sys -from PySide.QtCore import * -from PySide.QtGui import * -from PySide.QtSql import * -from decimal import * - -class TreeItem(): - - def __init__(self, db, row, parent_item): - self.db = db - self.row = row - self.parent_item = parent_item - self.query_done = False; - self.child_count = 0 - self.child_items = [] - self.data = ["", "", "", "", "", "", ""] - self.comm_id = 0 - self.thread_id = 0 - self.call_path_id = 1 - self.branch_count = 0 - self.time = 0 - if not parent_item: - self.setUpRoot() - - def setUpRoot(self): - self.query_done = True - query = QSqlQuery(self.db) - ret = query.exec_('SELECT id, comm FROM comms') - if not ret: - raise Exception("Query failed: " + query.lastError().text()) - while query.next(): - if not query.value(0): - continue - child_item = TreeItem(self.db, self.child_count, self) - self.child_items.append(child_item) - self.child_count += 1 - child_item.setUpLevel1(query.value(0), query.value(1)) - - def setUpLevel1(self, comm_id, comm): - self.query_done = True; - self.comm_id = comm_id - self.data[0] = comm - self.child_items = [] - self.child_count = 0 - query = QSqlQuery(self.db) - ret = query.exec_('SELECT thread_id, ( SELECT pid FROM threads WHERE id = thread_id ), ( SELECT tid FROM threads WHERE id = thread_id ) FROM comm_threads WHERE comm_id = ' + str(comm_id)) - if not ret: - raise Exception("Query failed: " + query.lastError().text()) - while query.next(): - child_item = TreeItem(self.db, self.child_count, self) - self.child_items.append(child_item) - self.child_count += 1 - child_item.setUpLevel2(comm_id, query.value(0), query.value(1), query.value(2)) - - def setUpLevel2(self, comm_id, thread_id, pid, tid): - self.comm_id = comm_id - self.thread_id = thread_id - self.data[0] = str(pid) + ":" + str(tid) - - def getChildItem(self, row): - return self.child_items[row] - - def getParentItem(self): - return self.parent_item - - def getRow(self): - return self.row - - def timePercent(self, b): - if not self.time: - return "0.0" - x = (b * Decimal(100)) / self.time - return str(x.quantize(Decimal('.1'), rounding=ROUND_HALF_UP)) - - def branchPercent(self, b): - if not self.branch_count: - return "0.0" - x = (b * Decimal(100)) / self.branch_count - return str(x.quantize(Decimal('.1'), rounding=ROUND_HALF_UP)) - - def addChild(self, call_path_id, name, dso, count, time, branch_count): - child_item = TreeItem(self.db, self.child_count, self) - child_item.comm_id = self.comm_id - child_item.thread_id = self.thread_id - child_item.call_path_id = call_path_id - child_item.branch_count = branch_count - child_item.time = time - child_item.data[0] = name - if dso == "[kernel.kallsyms]": - dso = "[kernel]" - child_item.data[1] = dso - child_item.data[2] = str(count) - child_item.data[3] = str(time) - child_item.data[4] = self.timePercent(time) - child_item.data[5] = str(branch_count) - child_item.data[6] = self.branchPercent(branch_count) - self.child_items.append(child_item) - self.child_count += 1 - - def selectCalls(self): - self.query_done = True; - query = QSqlQuery(self.db) - ret = query.exec_('SELECT id, call_path_id, branch_count, call_time, return_time, ' - '( SELECT name FROM symbols WHERE id = ( SELECT symbol_id FROM call_paths WHERE id = call_path_id ) ), ' - '( SELECT short_name FROM dsos WHERE id = ( SELECT dso_id FROM symbols WHERE id = ( SELECT symbol_id FROM call_paths WHERE id = call_path_id ) ) ), ' - '( SELECT ip FROM call_paths where id = call_path_id ) ' - 'FROM calls WHERE parent_call_path_id = ' + str(self.call_path_id) + ' AND comm_id = ' + str(self.comm_id) + ' AND thread_id = ' + str(self.thread_id) + - ' ORDER BY call_path_id') - if not ret: - raise Exception("Query failed: " + query.lastError().text()) - last_call_path_id = 0 - name = "" - dso = "" - count = 0 - branch_count = 0 - total_branch_count = 0 - time = 0 - total_time = 0 - while query.next(): - if query.value(1) == last_call_path_id: - count += 1 - branch_count += query.value(2) - time += query.value(4) - query.value(3) - else: - if count: - self.addChild(last_call_path_id, name, dso, count, time, branch_count) - last_call_path_id = query.value(1) - name = query.value(5) - dso = query.value(6) - count = 1 - total_branch_count += branch_count - total_time += time - branch_count = query.value(2) - time = query.value(4) - query.value(3) - if count: - self.addChild(last_call_path_id, name, dso, count, time, branch_count) - total_branch_count += branch_count - total_time += time - # Top level does not have time or branch count, so fix that here - if total_branch_count > self.branch_count: - self.branch_count = total_branch_count - if self.branch_count: - for child_item in self.child_items: - child_item.data[6] = self.branchPercent(child_item.branch_count) - if total_time > self.time: - self.time = total_time - if self.time: - for child_item in self.child_items: - child_item.data[4] = self.timePercent(child_item.time) - - def childCount(self): - if not self.query_done: - self.selectCalls() - return self.child_count - - def columnCount(self): - return 7 - - def columnHeader(self, column): - headers = ["Call Path", "Object", "Count ", "Time (ns) ", "Time (%) ", "Branch Count ", "Branch Count (%) "] - return headers[column] - - def getData(self, column): - return self.data[column] - -class TreeModel(QAbstractItemModel): - - def __init__(self, db, parent=None): - super(TreeModel, self).__init__(parent) - self.db = db - self.root = TreeItem(db, 0, None) - - def columnCount(self, parent): - return self.root.columnCount() - - def rowCount(self, parent): - if parent.isValid(): - parent_item = parent.internalPointer() - else: - parent_item = self.root - return parent_item.childCount() - - def headerData(self, section, orientation, role): - if role == Qt.TextAlignmentRole: - if section > 1: - return Qt.AlignRight - if role != Qt.DisplayRole: - return None - if orientation != Qt.Horizontal: - return None - return self.root.columnHeader(section) - - def parent(self, child): - child_item = child.internalPointer() - if child_item is self.root: - return QModelIndex() - parent_item = child_item.getParentItem() - return self.createIndex(parent_item.getRow(), 0, parent_item) - - def index(self, row, column, parent): - if parent.isValid(): - parent_item = parent.internalPointer() - else: - parent_item = self.root - child_item = parent_item.getChildItem(row) - return self.createIndex(row, column, child_item) - - def data(self, index, role): - if role == Qt.TextAlignmentRole: - if index.column() > 1: - return Qt.AlignRight - if role != Qt.DisplayRole: - return None - index_item = index.internalPointer() - return index_item.getData(index.column()) - -class MainWindow(QMainWindow): - - def __init__(self, db, dbname, parent=None): - super(MainWindow, self).__init__(parent) - - self.setObjectName("MainWindow") - self.setWindowTitle("Call Graph: " + dbname) - self.move(100, 100) - self.resize(800, 600) - style = self.style() - icon = style.standardIcon(QStyle.SP_MessageBoxInformation) - self.setWindowIcon(icon); - - self.model = TreeModel(db) - - self.view = QTreeView() - self.view.setModel(self.model) - - self.setCentralWidget(self.view) - -if __name__ == '__main__': - if (len(sys.argv) < 2): - print >> sys.stderr, "Usage is: call-graph-from-sql.py " - raise Exception("Too few arguments") - - dbname = sys.argv[1] - - is_sqlite3 = False - try: - f = open(dbname) - if f.read(15) == "SQLite format 3": - is_sqlite3 = True - f.close() - except: - pass - - if is_sqlite3: - db = QSqlDatabase.addDatabase('QSQLITE') - else: - db = QSqlDatabase.addDatabase('QPSQL') - opts = dbname.split() - for opt in opts: - if '=' in opt: - opt = opt.split('=') - if opt[0] == 'hostname': - db.setHostName(opt[1]) - elif opt[0] == 'port': - db.setPort(int(opt[1])) - elif opt[0] == 'username': - db.setUserName(opt[1]) - elif opt[0] == 'password': - db.setPassword(opt[1]) - elif opt[0] == 'dbname': - dbname = opt[1] - else: - dbname = opt - - db.setDatabaseName(dbname) - if not db.open(): - raise Exception("Failed to open database " + dbname + " error: " + db.lastError().text()) - - app = QApplication(sys.argv) - window = MainWindow(db, dbname) - window.show() - err = app.exec_() - db.close() - sys.exit(err) diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py index efcaf6cac2eb..0564dd7377f2 100644 --- a/tools/perf/scripts/python/export-to-postgresql.py +++ b/tools/perf/scripts/python/export-to-postgresql.py @@ -59,7 +59,7 @@ import datetime # pt_example=# \q # # An example of using the database is provided by the script -# call-graph-from-sql.py. Refer to that script for details. +# exported-sql-viewer.py. Refer to that script for details. # # Tables: # @@ -204,14 +204,23 @@ from ctypes import * libpq = CDLL("libpq.so.5") PQconnectdb = libpq.PQconnectdb PQconnectdb.restype = c_void_p +PQconnectdb.argtypes = [ c_char_p ] PQfinish = libpq.PQfinish +PQfinish.argtypes = [ c_void_p ] PQstatus = libpq.PQstatus +PQstatus.restype = c_int +PQstatus.argtypes = [ c_void_p ] PQexec = libpq.PQexec PQexec.restype = c_void_p +PQexec.argtypes = [ c_void_p, c_char_p ] PQresultStatus = libpq.PQresultStatus +PQresultStatus.restype = c_int +PQresultStatus.argtypes = [ c_void_p ] PQputCopyData = libpq.PQputCopyData +PQputCopyData.restype = c_int PQputCopyData.argtypes = [ c_void_p, c_void_p, c_int ] PQputCopyEnd = libpq.PQputCopyEnd +PQputCopyEnd.restype = c_int PQputCopyEnd.argtypes = [ c_void_p, c_void_p ] sys.path.append(os.environ['PERF_EXEC_PATH'] + \ diff --git a/tools/perf/scripts/python/export-to-sqlite.py b/tools/perf/scripts/python/export-to-sqlite.py index f827bf77e9d2..245caf2643ed 100644 --- a/tools/perf/scripts/python/export-to-sqlite.py +++ b/tools/perf/scripts/python/export-to-sqlite.py @@ -40,7 +40,7 @@ import datetime # sqlite> .quit # # An example of using the database is provided by the script -# call-graph-from-sql.py. Refer to that script for details. +# exported-sql-viewer.py. Refer to that script for details. # # The database structure is practically the same as created by the script # export-to-postgresql.py. Refer to that script for details. A notable @@ -440,7 +440,11 @@ def branch_type_table(*x): def sample_table(*x): if branches: - bind_exec(sample_query, 18, x) + for xx in x[0:15]: + sample_query.addBindValue(str(xx)) + for xx in x[19:22]: + sample_query.addBindValue(str(xx)) + do_query_(sample_query) else: bind_exec(sample_query, 22, x) diff --git a/tools/perf/scripts/python/exported-sql-viewer.py b/tools/perf/scripts/python/exported-sql-viewer.py new file mode 100755 index 000000000000..f278ce5ebab7 --- /dev/null +++ b/tools/perf/scripts/python/exported-sql-viewer.py @@ -0,0 +1,2615 @@ +#!/usr/bin/python2 +# SPDX-License-Identifier: GPL-2.0 +# exported-sql-viewer.py: view data from sql database +# Copyright (c) 2014-2018, Intel Corporation. + +# To use this script you will need to have exported data using either the +# export-to-sqlite.py or the export-to-postgresql.py script. Refer to those +# scripts for details. +# +# Following on from the example in the export scripts, a +# call-graph can be displayed for the pt_example database like this: +# +# python tools/perf/scripts/python/exported-sql-viewer.py pt_example +# +# Note that for PostgreSQL, this script supports connecting to remote databases +# by setting hostname, port, username, password, and dbname e.g. +# +# python tools/perf/scripts/python/exported-sql-viewer.py "hostname=myhost username=myuser password=mypassword dbname=pt_example" +# +# The result is a GUI window with a tree representing a context-sensitive +# call-graph. Expanding a couple of levels of the tree and adjusting column +# widths to suit will display something like: +# +# Call Graph: pt_example +# Call Path Object Count Time(ns) Time(%) Branch Count Branch Count(%) +# v- ls +# v- 2638:2638 +# v- _start ld-2.19.so 1 10074071 100.0 211135 100.0 +# |- unknown unknown 1 13198 0.1 1 0.0 +# >- _dl_start ld-2.19.so 1 1400980 13.9 19637 9.3 +# >- _d_linit_internal ld-2.19.so 1 448152 4.4 11094 5.3 +# v-__libc_start_main@plt ls 1 8211741 81.5 180397 85.4 +# >- _dl_fixup ld-2.19.so 1 7607 0.1 108 0.1 +# >- __cxa_atexit libc-2.19.so 1 11737 0.1 10 0.0 +# >- __libc_csu_init ls 1 10354 0.1 10 0.0 +# |- _setjmp libc-2.19.so 1 0 0.0 4 0.0 +# v- main ls 1 8182043 99.6 180254 99.9 +# +# Points to note: +# The top level is a command name (comm) +# The next level is a thread (pid:tid) +# Subsequent levels are functions +# 'Count' is the number of calls +# 'Time' is the elapsed time until the function returns +# Percentages are relative to the level above +# 'Branch Count' is the total number of branches for that function and all +# functions that it calls + +# There is also a "All branches" report, which displays branches and +# possibly disassembly. However, presently, the only supported disassembler is +# Intel XED, and additionally the object code must be present in perf build ID +# cache. To use Intel XED, libxed.so must be present. To build and install +# libxed.so: +# git clone https://github.com/intelxed/mbuild.git mbuild +# git clone https://github.com/intelxed/xed +# cd xed +# ./mfile.py --share +# sudo ./mfile.py --prefix=/usr/local install +# sudo ldconfig +# +# Example report: +# +# Time CPU Command PID TID Branch Type In Tx Branch +# 8107675239590 2 ls 22011 22011 return from interrupt No ffffffff86a00a67 native_irq_return_iret ([kernel]) -> 7fab593ea260 _start (ld-2.19.so) +# 7fab593ea260 48 89 e7 mov %rsp, %rdi +# 8107675239899 2 ls 22011 22011 hardware interrupt No 7fab593ea260 _start (ld-2.19.so) -> ffffffff86a012e0 page_fault ([kernel]) +# 8107675241900 2 ls 22011 22011 return from interrupt No ffffffff86a00a67 native_irq_return_iret ([kernel]) -> 7fab593ea260 _start (ld-2.19.so) +# 7fab593ea260 48 89 e7 mov %rsp, %rdi +# 7fab593ea263 e8 c8 06 00 00 callq 0x7fab593ea930 +# 8107675241900 2 ls 22011 22011 call No 7fab593ea263 _start+0x3 (ld-2.19.so) -> 7fab593ea930 _dl_start (ld-2.19.so) +# 7fab593ea930 55 pushq %rbp +# 7fab593ea931 48 89 e5 mov %rsp, %rbp +# 7fab593ea934 41 57 pushq %r15 +# 7fab593ea936 41 56 pushq %r14 +# 7fab593ea938 41 55 pushq %r13 +# 7fab593ea93a 41 54 pushq %r12 +# 7fab593ea93c 53 pushq %rbx +# 7fab593ea93d 48 89 fb mov %rdi, %rbx +# 7fab593ea940 48 83 ec 68 sub $0x68, %rsp +# 7fab593ea944 0f 31 rdtsc +# 7fab593ea946 48 c1 e2 20 shl $0x20, %rdx +# 7fab593ea94a 89 c0 mov %eax, %eax +# 7fab593ea94c 48 09 c2 or %rax, %rdx +# 7fab593ea94f 48 8b 05 1a 15 22 00 movq 0x22151a(%rip), %rax +# 8107675242232 2 ls 22011 22011 hardware interrupt No 7fab593ea94f _dl_start+0x1f (ld-2.19.so) -> ffffffff86a012e0 page_fault ([kernel]) +# 8107675242900 2 ls 22011 22011 return from interrupt No ffffffff86a00a67 native_irq_return_iret ([kernel]) -> 7fab593ea94f _dl_start+0x1f (ld-2.19.so) +# 7fab593ea94f 48 8b 05 1a 15 22 00 movq 0x22151a(%rip), %rax +# 7fab593ea956 48 89 15 3b 13 22 00 movq %rdx, 0x22133b(%rip) +# 8107675243232 2 ls 22011 22011 hardware interrupt No 7fab593ea956 _dl_start+0x26 (ld-2.19.so) -> ffffffff86a012e0 page_fault ([kernel]) + +import sys +import weakref +import threading +import string +import cPickle +import re +import os +from PySide.QtCore import * +from PySide.QtGui import * +from PySide.QtSql import * +from decimal import * +from ctypes import * +from multiprocessing import Process, Array, Value, Event + +# Data formatting helpers + +def tohex(ip): + if ip < 0: + ip += 1 << 64 + return "%x" % ip + +def offstr(offset): + if offset: + return "+0x%x" % offset + return "" + +def dsoname(name): + if name == "[kernel.kallsyms]": + return "[kernel]" + return name + +def findnth(s, sub, n, offs=0): + pos = s.find(sub) + if pos < 0: + return pos + if n <= 1: + return offs + pos + return findnth(s[pos + 1:], sub, n - 1, offs + pos + 1) + +# Percent to one decimal place + +def PercentToOneDP(n, d): + if not d: + return "0.0" + x = (n * Decimal(100)) / d + return str(x.quantize(Decimal(".1"), rounding=ROUND_HALF_UP)) + +# Helper for queries that must not fail + +def QueryExec(query, stmt): + ret = query.exec_(stmt) + if not ret: + raise Exception("Query failed: " + query.lastError().text()) + +# Background thread + +class Thread(QThread): + + done = Signal(object) + + def __init__(self, task, param=None, parent=None): + super(Thread, self).__init__(parent) + self.task = task + self.param = param + + def run(self): + while True: + if self.param is None: + done, result = self.task() + else: + done, result = self.task(self.param) + self.done.emit(result) + if done: + break + +# Tree data model + +class TreeModel(QAbstractItemModel): + + def __init__(self, root, parent=None): + super(TreeModel, self).__init__(parent) + self.root = root + self.last_row_read = 0 + + def Item(self, parent): + if parent.isValid(): + return parent.internalPointer() + else: + return self.root + + def rowCount(self, parent): + result = self.Item(parent).childCount() + if result < 0: + result = 0 + self.dataChanged.emit(parent, parent) + return result + + def hasChildren(self, parent): + return self.Item(parent).hasChildren() + + def headerData(self, section, orientation, role): + if role == Qt.TextAlignmentRole: + return self.columnAlignment(section) + if role != Qt.DisplayRole: + return None + if orientation != Qt.Horizontal: + return None + return self.columnHeader(section) + + def parent(self, child): + child_item = child.internalPointer() + if child_item is self.root: + return QModelIndex() + parent_item = child_item.getParentItem() + return self.createIndex(parent_item.getRow(), 0, parent_item) + + def index(self, row, column, parent): + child_item = self.Item(parent).getChildItem(row) + return self.createIndex(row, column, child_item) + + def DisplayData(self, item, index): + return item.getData(index.column()) + + def FetchIfNeeded(self, row): + if row > self.last_row_read: + self.last_row_read = row + if row + 10 >= self.root.child_count: + self.fetcher.Fetch(glb_chunk_sz) + + def columnAlignment(self, column): + return Qt.AlignLeft + + def columnFont(self, column): + return None + + def data(self, index, role): + if role == Qt.TextAlignmentRole: + return self.columnAlignment(index.column()) + if role == Qt.FontRole: + return self.columnFont(index.column()) + if role != Qt.DisplayRole: + return None + item = index.internalPointer() + return self.DisplayData(item, index) + +# Table data model + +class TableModel(QAbstractTableModel): + + def __init__(self, parent=None): + super(TableModel, self).__init__(parent) + self.child_count = 0 + self.child_items = [] + self.last_row_read = 0 + + def Item(self, parent): + if parent.isValid(): + return parent.internalPointer() + else: + return self + + def rowCount(self, parent): + return self.child_count + + def headerData(self, section, orientation, role): + if role == Qt.TextAlignmentRole: + return self.columnAlignment(section) + if role != Qt.DisplayRole: + return None + if orientation != Qt.Horizontal: + return None + return self.columnHeader(section) + + def index(self, row, column, parent): + return self.createIndex(row, column, self.child_items[row]) + + def DisplayData(self, item, index): + return item.getData(index.column()) + + def FetchIfNeeded(self, row): + if row > self.last_row_read: + self.last_row_read = row + if row + 10 >= self.child_count: + self.fetcher.Fetch(glb_chunk_sz) + + def columnAlignment(self, column): + return Qt.AlignLeft + + def columnFont(self, column): + return None + + def data(self, index, role): + if role == Qt.TextAlignmentRole: + return self.columnAlignment(index.column()) + if role == Qt.FontRole: + return self.columnFont(index.column()) + if role != Qt.DisplayRole: + return None + item = index.internalPointer() + return self.DisplayData(item, index) + +# Model cache + +model_cache = weakref.WeakValueDictionary() +model_cache_lock = threading.Lock() + +def LookupCreateModel(model_name, create_fn): + model_cache_lock.acquire() + try: + model = model_cache[model_name] + except: + model = None + if model is None: + model = create_fn() + model_cache[model_name] = model + model_cache_lock.release() + return model + +# Find bar + +class FindBar(): + + def __init__(self, parent, finder, is_reg_expr=False): + self.finder = finder + self.context = [] + self.last_value = None + self.last_pattern = None + + label = QLabel("Find:") + label.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) + + self.textbox = QComboBox() + self.textbox.setEditable(True) + self.textbox.currentIndexChanged.connect(self.ValueChanged) + + self.progress = QProgressBar() + self.progress.setRange(0, 0) + self.progress.hide() + + if is_reg_expr: + self.pattern = QCheckBox("Regular Expression") + else: + self.pattern = QCheckBox("Pattern") + self.pattern.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) + + self.next_button = QToolButton() + self.next_button.setIcon(parent.style().standardIcon(QStyle.SP_ArrowDown)) + self.next_button.released.connect(lambda: self.NextPrev(1)) + + self.prev_button = QToolButton() + self.prev_button.setIcon(parent.style().standardIcon(QStyle.SP_ArrowUp)) + self.prev_button.released.connect(lambda: self.NextPrev(-1)) + + self.close_button = QToolButton() + self.close_button.setIcon(parent.style().standardIcon(QStyle.SP_DockWidgetCloseButton)) + self.close_button.released.connect(self.Deactivate) + + self.hbox = QHBoxLayout() + self.hbox.setContentsMargins(0, 0, 0, 0) + + self.hbox.addWidget(label) + self.hbox.addWidget(self.textbox) + self.hbox.addWidget(self.progress) + self.hbox.addWidget(self.pattern) + self.hbox.addWidget(self.next_button) + self.hbox.addWidget(self.prev_button) + self.hbox.addWidget(self.close_button) + + self.bar = QWidget() + self.bar.setLayout(self.hbox); + self.bar.hide() + + def Widget(self): + return self.bar + + def Activate(self): + self.bar.show() + self.textbox.setFocus() + + def Deactivate(self): + self.bar.hide() + + def Busy(self): + self.textbox.setEnabled(False) + self.pattern.hide() + self.next_button.hide() + self.prev_button.hide() + self.progress.show() + + def Idle(self): + self.textbox.setEnabled(True) + self.progress.hide() + self.pattern.show() + self.next_button.show() + self.prev_button.show() + + def Find(self, direction): + value = self.textbox.currentText() + pattern = self.pattern.isChecked() + self.last_value = value + self.last_pattern = pattern + self.finder.Find(value, direction, pattern, self.context) + + def ValueChanged(self): + value = self.textbox.currentText() + pattern = self.pattern.isChecked() + index = self.textbox.currentIndex() + data = self.textbox.itemData(index) + # Store the pattern in the combo box to keep it with the text value + if data == None: + self.textbox.setItemData(index, pattern) + else: + self.pattern.setChecked(data) + self.Find(0) + + def NextPrev(self, direction): + value = self.textbox.currentText() + pattern = self.pattern.isChecked() + if value != self.last_value: + index = self.textbox.findText(value) + # Allow for a button press before the value has been added to the combo box + if index < 0: + index = self.textbox.count() + self.textbox.addItem(value, pattern) + self.textbox.setCurrentIndex(index) + return + else: + self.textbox.setItemData(index, pattern) + elif pattern != self.last_pattern: + # Keep the pattern recorded in the combo box up to date + index = self.textbox.currentIndex() + self.textbox.setItemData(index, pattern) + self.Find(direction) + + def NotFound(self): + QMessageBox.information(self.bar, "Find", "'" + self.textbox.currentText() + "' not found") + +# Context-sensitive call graph data model item base + +class CallGraphLevelItemBase(object): + + def __init__(self, glb, row, parent_item): + self.glb = glb + self.row = row + self.parent_item = parent_item + self.query_done = False; + self.child_count = 0 + self.child_items = [] + + def getChildItem(self, row): + return self.child_items[row] + + def getParentItem(self): + return self.parent_item + + def getRow(self): + return self.row + + def childCount(self): + if not self.query_done: + self.Select() + if not self.child_count: + return -1 + return self.child_count + + def hasChildren(self): + if not self.query_done: + return True + return self.child_count > 0 + + def getData(self, column): + return self.data[column] + +# Context-sensitive call graph data model level 2+ item base + +class CallGraphLevelTwoPlusItemBase(CallGraphLevelItemBase): + + def __init__(self, glb, row, comm_id, thread_id, call_path_id, time, branch_count, parent_item): + super(CallGraphLevelTwoPlusItemBase, self).__init__(glb, row, parent_item) + self.comm_id = comm_id + self.thread_id = thread_id + self.call_path_id = call_path_id + self.branch_count = branch_count + self.time = time + + def Select(self): + self.query_done = True; + query = QSqlQuery(self.glb.db) + QueryExec(query, "SELECT call_path_id, name, short_name, COUNT(calls.id), SUM(return_time - call_time), SUM(branch_count)" + " FROM calls" + " INNER JOIN call_paths ON calls.call_path_id = call_paths.id" + " INNER JOIN symbols ON call_paths.symbol_id = symbols.id" + " INNER JOIN dsos ON symbols.dso_id = dsos.id" + " WHERE parent_call_path_id = " + str(self.call_path_id) + + " AND comm_id = " + str(self.comm_id) + + " AND thread_id = " + str(self.thread_id) + + " GROUP BY call_path_id, name, short_name" + " ORDER BY call_path_id") + while query.next(): + child_item = CallGraphLevelThreeItem(self.glb, self.child_count, self.comm_id, self.thread_id, query.value(0), query.value(1), query.value(2), query.value(3), int(query.value(4)), int(query.value(5)), self) + self.child_items.append(child_item) + self.child_count += 1 + +# Context-sensitive call graph data model level three item + +class CallGraphLevelThreeItem(CallGraphLevelTwoPlusItemBase): + + def __init__(self, glb, row, comm_id, thread_id, call_path_id, name, dso, count, time, branch_count, parent_item): + super(CallGraphLevelThreeItem, self).__init__(glb, row, comm_id, thread_id, call_path_id, time, branch_count, parent_item) + dso = dsoname(dso) + self.data = [ name, dso, str(count), str(time), PercentToOneDP(time, parent_item.time), str(branch_count), PercentToOneDP(branch_count, parent_item.branch_count) ] + self.dbid = call_path_id + +# Context-sensitive call graph data model level two item + +class CallGraphLevelTwoItem(CallGraphLevelTwoPlusItemBase): + + def __init__(self, glb, row, comm_id, thread_id, pid, tid, parent_item): + super(CallGraphLevelTwoItem, self).__init__(glb, row, comm_id, thread_id, 1, 0, 0, parent_item) + self.data = [str(pid) + ":" + str(tid), "", "", "", "", "", ""] + self.dbid = thread_id + + def Select(self): + super(CallGraphLevelTwoItem, self).Select() + for child_item in self.child_items: + self.time += child_item.time + self.branch_count += child_item.branch_count + for child_item in self.child_items: + child_item.data[4] = PercentToOneDP(child_item.time, self.time) + child_item.data[6] = PercentToOneDP(child_item.branch_count, self.branch_count) + +# Context-sensitive call graph data model level one item + +class CallGraphLevelOneItem(CallGraphLevelItemBase): + + def __init__(self, glb, row, comm_id, comm, parent_item): + super(CallGraphLevelOneItem, self).__init__(glb, row, parent_item) + self.data = [comm, "", "", "", "", "", ""] + self.dbid = comm_id + + def Select(self): + self.query_done = True; + query = QSqlQuery(self.glb.db) + QueryExec(query, "SELECT thread_id, pid, tid" + " FROM comm_threads" + " INNER JOIN threads ON thread_id = threads.id" + " WHERE comm_id = " + str(self.dbid)) + while query.next(): + child_item = CallGraphLevelTwoItem(self.glb, self.child_count, self.dbid, query.value(0), query.value(1), query.value(2), self) + self.child_items.append(child_item) + self.child_count += 1 + +# Context-sensitive call graph data model root item + +class CallGraphRootItem(CallGraphLevelItemBase): + + def __init__(self, glb): + super(CallGraphRootItem, self).__init__(glb, 0, None) + self.dbid = 0 + self.query_done = True; + query = QSqlQuery(glb.db) + QueryExec(query, "SELECT id, comm FROM comms") + while query.next(): + if not query.value(0): + continue + child_item = CallGraphLevelOneItem(glb, self.child_count, query.value(0), query.value(1), self) + self.child_items.append(child_item) + self.child_count += 1 + +# Context-sensitive call graph data model + +class CallGraphModel(TreeModel): + + def __init__(self, glb, parent=None): + super(CallGraphModel, self).__init__(CallGraphRootItem(glb), parent) + self.glb = glb + + def columnCount(self, parent=None): + return 7 + + def columnHeader(self, column): + headers = ["Call Path", "Object", "Count ", "Time (ns) ", "Time (%) ", "Branch Count ", "Branch Count (%) "] + return headers[column] + + def columnAlignment(self, column): + alignment = [ Qt.AlignLeft, Qt.AlignLeft, Qt.AlignRight, Qt.AlignRight, Qt.AlignRight, Qt.AlignRight, Qt.AlignRight ] + return alignment[column] + + def FindSelect(self, value, pattern, query): + if pattern: + # postgresql and sqlite pattern patching differences: + # postgresql LIKE is case sensitive but sqlite LIKE is not + # postgresql LIKE allows % and _ to be escaped with \ but sqlite LIKE does not + # postgresql supports ILIKE which is case insensitive + # sqlite supports GLOB (text only) which uses * and ? and is case sensitive + if not self.glb.dbref.is_sqlite3: + # Escape % and _ + s = value.replace("%", "\%") + s = s.replace("_", "\_") + # Translate * and ? into SQL LIKE pattern characters % and _ + trans = string.maketrans("*?", "%_") + match = " LIKE '" + str(s).translate(trans) + "'" + else: + match = " GLOB '" + str(value) + "'" + else: + match = " = '" + str(value) + "'" + QueryExec(query, "SELECT call_path_id, comm_id, thread_id" + " FROM calls" + " INNER JOIN call_paths ON calls.call_path_id = call_paths.id" + " INNER JOIN symbols ON call_paths.symbol_id = symbols.id" + " WHERE symbols.name" + match + + " GROUP BY comm_id, thread_id, call_path_id" + " ORDER BY comm_id, thread_id, call_path_id") + + def FindPath(self, query): + # Turn the query result into a list of ids that the tree view can walk + # to open the tree at the right place. + ids = [] + parent_id = query.value(0) + while parent_id: + ids.insert(0, parent_id) + q2 = QSqlQuery(self.glb.db) + QueryExec(q2, "SELECT parent_id" + " FROM call_paths" + " WHERE id = " + str(parent_id)) + if not q2.next(): + break + parent_id = q2.value(0) + # The call path root is not used + if ids[0] == 1: + del ids[0] + ids.insert(0, query.value(2)) + ids.insert(0, query.value(1)) + return ids + + def Found(self, query, found): + if found: + return self.FindPath(query) + return [] + + def FindValue(self, value, pattern, query, last_value, last_pattern): + if last_value == value and pattern == last_pattern: + found = query.first() + else: + self.FindSelect(value, pattern, query) + found = query.next() + return self.Found(query, found) + + def FindNext(self, query): + found = query.next() + if not found: + found = query.first() + return self.Found(query, found) + + def FindPrev(self, query): + found = query.previous() + if not found: + found = query.last() + return self.Found(query, found) + + def FindThread(self, c): + if c.direction == 0 or c.value != c.last_value or c.pattern != c.last_pattern: + ids = self.FindValue(c.value, c.pattern, c.query, c.last_value, c.last_pattern) + elif c.direction > 0: + ids = self.FindNext(c.query) + else: + ids = self.FindPrev(c.query) + return (True, ids) + + def Find(self, value, direction, pattern, context, callback): + class Context(): + def __init__(self, *x): + self.value, self.direction, self.pattern, self.query, self.last_value, self.last_pattern = x + def Update(self, *x): + self.value, self.direction, self.pattern, self.last_value, self.last_pattern = x + (self.value, self.pattern) + if len(context): + context[0].Update(value, direction, pattern) + else: + context.append(Context(value, direction, pattern, QSqlQuery(self.glb.db), None, None)) + # Use a thread so the UI is not blocked during the SELECT + thread = Thread(self.FindThread, context[0]) + thread.done.connect(lambda ids, t=thread, c=callback: self.FindDone(t, c, ids), Qt.QueuedConnection) + thread.start() + + def FindDone(self, thread, callback, ids): + callback(ids) + +# Vertical widget layout + +class VBox(): + + def __init__(self, w1, w2, w3=None): + self.vbox = QWidget() + self.vbox.setLayout(QVBoxLayout()); + + self.vbox.layout().setContentsMargins(0, 0, 0, 0) + + self.vbox.layout().addWidget(w1) + self.vbox.layout().addWidget(w2) + if w3: + self.vbox.layout().addWidget(w3) + + def Widget(self): + return self.vbox + +# Context-sensitive call graph window + +class CallGraphWindow(QMdiSubWindow): + + def __init__(self, glb, parent=None): + super(CallGraphWindow, self).__init__(parent) + + self.model = LookupCreateModel("Context-Sensitive Call Graph", lambda x=glb: CallGraphModel(x)) + + self.view = QTreeView() + self.view.setModel(self.model) + + for c, w in ((0, 250), (1, 100), (2, 60), (3, 70), (4, 70), (5, 100)): + self.view.setColumnWidth(c, w) + + self.find_bar = FindBar(self, self) + + self.vbox = VBox(self.view, self.find_bar.Widget()) + + self.setWidget(self.vbox.Widget()) + + AddSubWindow(glb.mainwindow.mdi_area, self, "Context-Sensitive Call Graph") + + def DisplayFound(self, ids): + if not len(ids): + return False + parent = QModelIndex() + for dbid in ids: + found = False + n = self.model.rowCount(parent) + for row in xrange(n): + child = self.model.index(row, 0, parent) + if child.internalPointer().dbid == dbid: + found = True + self.view.setCurrentIndex(child) + parent = child + break + if not found: + break + return found + + def Find(self, value, direction, pattern, context): + self.view.setFocus() + self.find_bar.Busy() + self.model.Find(value, direction, pattern, context, self.FindDone) + + def FindDone(self, ids): + found = True + if not self.DisplayFound(ids): + found = False + self.find_bar.Idle() + if not found: + self.find_bar.NotFound() + +# Child data item finder + +class ChildDataItemFinder(): + + def __init__(self, root): + self.root = root + self.value, self.direction, self.pattern, self.last_value, self.last_pattern = (None,) * 5 + self.rows = [] + self.pos = 0 + + def FindSelect(self): + self.rows = [] + if self.pattern: + pattern = re.compile(self.value) + for child in self.root.child_items: + for column_data in child.data: + if re.search(pattern, str(column_data)) is not None: + self.rows.append(child.row) + break + else: + for child in self.root.child_items: + for column_data in child.data: + if self.value in str(column_data): + self.rows.append(child.row) + break + + def FindValue(self): + self.pos = 0 + if self.last_value != self.value or self.pattern != self.last_pattern: + self.FindSelect() + if not len(self.rows): + return -1 + return self.rows[self.pos] + + def FindThread(self): + if self.direction == 0 or self.value != self.last_value or self.pattern != self.last_pattern: + row = self.FindValue() + elif len(self.rows): + if self.direction > 0: + self.pos += 1 + if self.pos >= len(self.rows): + self.pos = 0 + else: + self.pos -= 1 + if self.pos < 0: + self.pos = len(self.rows) - 1 + row = self.rows[self.pos] + else: + row = -1 + return (True, row) + + def Find(self, value, direction, pattern, context, callback): + self.value, self.direction, self.pattern, self.last_value, self.last_pattern = (value, direction,pattern, self.value, self.pattern) + # Use a thread so the UI is not blocked + thread = Thread(self.FindThread) + thread.done.connect(lambda row, t=thread, c=callback: self.FindDone(t, c, row), Qt.QueuedConnection) + thread.start() + + def FindDone(self, thread, callback, row): + callback(row) + +# Number of database records to fetch in one go + +glb_chunk_sz = 10000 + +# size of pickled integer big enough for record size + +glb_nsz = 8 + +# Background process for SQL data fetcher + +class SQLFetcherProcess(): + + def __init__(self, dbref, sql, buffer, head, tail, fetch_count, fetching_done, process_target, wait_event, fetched_event, prep): + # Need a unique connection name + conn_name = "SQLFetcher" + str(os.getpid()) + self.db, dbname = dbref.Open(conn_name) + self.sql = sql + self.buffer = buffer + self.head = head + self.tail = tail + self.fetch_count = fetch_count + self.fetching_done = fetching_done + self.process_target = process_target + self.wait_event = wait_event + self.fetched_event = fetched_event + self.prep = prep + self.query = QSqlQuery(self.db) + self.query_limit = 0 if "$$last_id$$" in sql else 2 + self.last_id = -1 + self.fetched = 0 + self.more = True + self.local_head = self.head.value + self.local_tail = self.tail.value + + def Select(self): + if self.query_limit: + if self.query_limit == 1: + return + self.query_limit -= 1 + stmt = self.sql.replace("$$last_id$$", str(self.last_id)) + QueryExec(self.query, stmt) + + def Next(self): + if not self.query.next(): + self.Select() + if not self.query.next(): + return None + self.last_id = self.query.value(0) + return self.prep(self.query) + + def WaitForTarget(self): + while True: + self.wait_event.clear() + target = self.process_target.value + if target > self.fetched or target < 0: + break + self.wait_event.wait() + return target + + def HasSpace(self, sz): + if self.local_tail <= self.local_head: + space = len(self.buffer) - self.local_head + if space > sz: + return True + if space >= glb_nsz: + # Use 0 (or space < glb_nsz) to mean there is no more at the top of the buffer + nd = cPickle.dumps(0, cPickle.HIGHEST_PROTOCOL) + self.buffer[self.local_head : self.local_head + len(nd)] = nd + self.local_head = 0 + if self.local_tail - self.local_head > sz: + return True + return False + + def WaitForSpace(self, sz): + if self.HasSpace(sz): + return + while True: + self.wait_event.clear() + self.local_tail = self.tail.value + if self.HasSpace(sz): + return + self.wait_event.wait() + + def AddToBuffer(self, obj): + d = cPickle.dumps(obj, cPickle.HIGHEST_PROTOCOL) + n = len(d) + nd = cPickle.dumps(n, cPickle.HIGHEST_PROTOCOL) + sz = n + glb_nsz + self.WaitForSpace(sz) + pos = self.local_head + self.buffer[pos : pos + len(nd)] = nd + self.buffer[pos + glb_nsz : pos + sz] = d + self.local_head += sz + + def FetchBatch(self, batch_size): + fetched = 0 + while batch_size > fetched: + obj = self.Next() + if obj is None: + self.more = False + break + self.AddToBuffer(obj) + fetched += 1 + if fetched: + self.fetched += fetched + with self.fetch_count.get_lock(): + self.fetch_count.value += fetched + self.head.value = self.local_head + self.fetched_event.set() + + def Run(self): + while self.more: + target = self.WaitForTarget() + if target < 0: + break + batch_size = min(glb_chunk_sz, target - self.fetched) + self.FetchBatch(batch_size) + self.fetching_done.value = True + self.fetched_event.set() + +def SQLFetcherFn(*x): + process = SQLFetcherProcess(*x) + process.Run() + +# SQL data fetcher + +class SQLFetcher(QObject): + + done = Signal(object) + + def __init__(self, glb, sql, prep, process_data, parent=None): + super(SQLFetcher, self).__init__(parent) + self.process_data = process_data + self.more = True + self.target = 0 + self.last_target = 0 + self.fetched = 0 + self.buffer_size = 16 * 1024 * 1024 + self.buffer = Array(c_char, self.buffer_size, lock=False) + self.head = Value(c_longlong) + self.tail = Value(c_longlong) + self.local_tail = 0 + self.fetch_count = Value(c_longlong) + self.fetching_done = Value(c_bool) + self.last_count = 0 + self.process_target = Value(c_longlong) + self.wait_event = Event() + self.fetched_event = Event() + glb.AddInstanceToShutdownOnExit(self) + self.process = Process(target=SQLFetcherFn, args=(glb.dbref, sql, self.buffer, self.head, self.tail, self.fetch_count, self.fetching_done, self.process_target, self.wait_event, self.fetched_event, prep)) + self.process.start() + self.thread = Thread(self.Thread) + self.thread.done.connect(self.ProcessData, Qt.QueuedConnection) + self.thread.start() + + def Shutdown(self): + # Tell the thread and process to exit + self.process_target.value = -1 + self.wait_event.set() + self.more = False + self.fetching_done.value = True + self.fetched_event.set() + + def Thread(self): + if not self.more: + return True, 0 + while True: + self.fetched_event.clear() + fetch_count = self.fetch_count.value + if fetch_count != self.last_count: + break + if self.fetching_done.value: + self.more = False + return True, 0 + self.fetched_event.wait() + count = fetch_count - self.last_count + self.last_count = fetch_count + self.fetched += count + return False, count + + def Fetch(self, nr): + if not self.more: + # -1 inidcates there are no more + return -1 + result = self.fetched + extra = result + nr - self.target + if extra > 0: + self.target += extra + # process_target < 0 indicates shutting down + if self.process_target.value >= 0: + self.process_target.value = self.target + self.wait_event.set() + return result + + def RemoveFromBuffer(self): + pos = self.local_tail + if len(self.buffer) - pos < glb_nsz: + pos = 0 + n = cPickle.loads(self.buffer[pos : pos + glb_nsz]) + if n == 0: + pos = 0 + n = cPickle.loads(self.buffer[0 : glb_nsz]) + pos += glb_nsz + obj = cPickle.loads(self.buffer[pos : pos + n]) + self.local_tail = pos + n + return obj + + def ProcessData(self, count): + for i in xrange(count): + obj = self.RemoveFromBuffer() + self.process_data(obj) + self.tail.value = self.local_tail + self.wait_event.set() + self.done.emit(count) + +# Fetch more records bar + +class FetchMoreRecordsBar(): + + def __init__(self, model, parent): + self.model = model + + self.label = QLabel("Number of records (x " + "{:,}".format(glb_chunk_sz) + ") to fetch:") + self.label.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) + + self.fetch_count = QSpinBox() + self.fetch_count.setRange(1, 1000000) + self.fetch_count.setValue(10) + self.fetch_count.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) + + self.fetch = QPushButton("Go!") + self.fetch.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) + self.fetch.released.connect(self.FetchMoreRecords) + + self.progress = QProgressBar() + self.progress.setRange(0, 100) + self.progress.hide() + + self.done_label = QLabel("All records fetched") + self.done_label.hide() + + self.spacer = QLabel("") + + self.close_button = QToolButton() + self.close_button.setIcon(parent.style().standardIcon(QStyle.SP_DockWidgetCloseButton)) + self.close_button.released.connect(self.Deactivate) + + self.hbox = QHBoxLayout() + self.hbox.setContentsMargins(0, 0, 0, 0) + + self.hbox.addWidget(self.label) + self.hbox.addWidget(self.fetch_count) + self.hbox.addWidget(self.fetch) + self.hbox.addWidget(self.spacer) + self.hbox.addWidget(self.progress) + self.hbox.addWidget(self.done_label) + self.hbox.addWidget(self.close_button) + + self.bar = QWidget() + self.bar.setLayout(self.hbox); + self.bar.show() + + self.in_progress = False + self.model.progress.connect(self.Progress) + + self.done = False + + if not model.HasMoreRecords(): + self.Done() + + def Widget(self): + return self.bar + + def Activate(self): + self.bar.show() + self.fetch.setFocus() + + def Deactivate(self): + self.bar.hide() + + def Enable(self, enable): + self.fetch.setEnabled(enable) + self.fetch_count.setEnabled(enable) + + def Busy(self): + self.Enable(False) + self.fetch.hide() + self.spacer.hide() + self.progress.show() + + def Idle(self): + self.in_progress = False + self.Enable(True) + self.progress.hide() + self.fetch.show() + self.spacer.show() + + def Target(self): + return self.fetch_count.value() * glb_chunk_sz + + def Done(self): + self.done = True + self.Idle() + self.label.hide() + self.fetch_count.hide() + self.fetch.hide() + self.spacer.hide() + self.done_label.show() + + def Progress(self, count): + if self.in_progress: + if count: + percent = ((count - self.start) * 100) / self.Target() + if percent >= 100: + self.Idle() + else: + self.progress.setValue(percent) + if not count: + # Count value of zero means no more records + self.Done() + + def FetchMoreRecords(self): + if self.done: + return + self.progress.setValue(0) + self.Busy() + self.in_progress = True + self.start = self.model.FetchMoreRecords(self.Target()) + +# Brance data model level two item + +class BranchLevelTwoItem(): + + def __init__(self, row, text, parent_item): + self.row = row + self.parent_item = parent_item + self.data = [""] * 8 + self.data[7] = text + self.level = 2 + + def getParentItem(self): + return self.parent_item + + def getRow(self): + return self.row + + def childCount(self): + return 0 + + def hasChildren(self): + return False + + def getData(self, column): + return self.data[column] + +# Brance data model level one item + +class BranchLevelOneItem(): + + def __init__(self, glb, row, data, parent_item): + self.glb = glb + self.row = row + self.parent_item = parent_item + self.child_count = 0 + self.child_items = [] + self.data = data[1:] + self.dbid = data[0] + self.level = 1 + self.query_done = False + + def getChildItem(self, row): + return self.child_items[row] + + def getParentItem(self): + return self.parent_item + + def getRow(self): + return self.row + + def Select(self): + self.query_done = True + + if not self.glb.have_disassembler: + return + + query = QSqlQuery(self.glb.db) + + QueryExec(query, "SELECT cpu, to_dso_id, to_symbol_id, to_sym_offset, short_name, long_name, build_id, sym_start, to_ip" + " FROM samples" + " INNER JOIN dsos ON samples.to_dso_id = dsos.id" + " INNER JOIN symbols ON samples.to_symbol_id = symbols.id" + " WHERE samples.id = " + str(self.dbid)) + if not query.next(): + return + cpu = query.value(0) + dso = query.value(1) + sym = query.value(2) + if dso == 0 or sym == 0: + return + off = query.value(3) + short_name = query.value(4) + long_name = query.value(5) + build_id = query.value(6) + sym_start = query.value(7) + ip = query.value(8) + + QueryExec(query, "SELECT samples.dso_id, symbol_id, sym_offset, sym_start" + " FROM samples" + " INNER JOIN symbols ON samples.symbol_id = symbols.id" + " WHERE samples.id > " + str(self.dbid) + " AND cpu = " + str(cpu) + + " ORDER BY samples.id" + " LIMIT 1") + if not query.next(): + return + if query.value(0) != dso: + # Cannot disassemble from one dso to another + return + bsym = query.value(1) + boff = query.value(2) + bsym_start = query.value(3) + if bsym == 0: + return + tot = bsym_start + boff + 1 - sym_start - off + if tot <= 0 or tot > 16384: + return + + inst = self.glb.disassembler.Instruction() + f = self.glb.FileFromNamesAndBuildId(short_name, long_name, build_id) + if not f: + return + mode = 0 if Is64Bit(f) else 1 + self.glb.disassembler.SetMode(inst, mode) + + buf_sz = tot + 16 + buf = create_string_buffer(tot + 16) + f.seek(sym_start + off) + buf.value = f.read(buf_sz) + buf_ptr = addressof(buf) + i = 0 + while tot > 0: + cnt, text = self.glb.disassembler.DisassembleOne(inst, buf_ptr, buf_sz, ip) + if cnt: + byte_str = tohex(ip).rjust(16) + for k in xrange(cnt): + byte_str += " %02x" % ord(buf[i]) + i += 1 + while k < 15: + byte_str += " " + k += 1 + self.child_items.append(BranchLevelTwoItem(0, byte_str + " " + text, self)) + self.child_count += 1 + else: + return + buf_ptr += cnt + tot -= cnt + buf_sz -= cnt + ip += cnt + + def childCount(self): + if not self.query_done: + self.Select() + if not self.child_count: + return -1 + return self.child_count + + def hasChildren(self): + if not self.query_done: + return True + return self.child_count > 0 + + def getData(self, column): + return self.data[column] + +# Brance data model root item + +class BranchRootItem(): + + def __init__(self): + self.child_count = 0 + self.child_items = [] + self.level = 0 + + def getChildItem(self, row): + return self.child_items[row] + + def getParentItem(self): + return None + + def getRow(self): + return 0 + + def childCount(self): + return self.child_count + + def hasChildren(self): + return self.child_count > 0 + + def getData(self, column): + return "" + +# Branch data preparation + +def BranchDataPrep(query): + data = [] + for i in xrange(0, 8): + data.append(query.value(i)) + data.append(tohex(query.value(8)).rjust(16) + " " + query.value(9) + offstr(query.value(10)) + + " (" + dsoname(query.value(11)) + ")" + " -> " + + tohex(query.value(12)) + " " + query.value(13) + offstr(query.value(14)) + + " (" + dsoname(query.value(15)) + ")") + return data + +# Branch data model + +class BranchModel(TreeModel): + + progress = Signal(object) + + def __init__(self, glb, event_id, where_clause, parent=None): + super(BranchModel, self).__init__(BranchRootItem(), parent) + self.glb = glb + self.event_id = event_id + self.more = True + self.populated = 0 + sql = ("SELECT samples.id, time, cpu, comm, pid, tid, branch_types.name," + " CASE WHEN in_tx = '0' THEN 'No' ELSE 'Yes' END," + " ip, symbols.name, sym_offset, dsos.short_name," + " to_ip, to_symbols.name, to_sym_offset, to_dsos.short_name" + " FROM samples" + " INNER JOIN comms ON comm_id = comms.id" + " INNER JOIN threads ON thread_id = threads.id" + " INNER JOIN branch_types ON branch_type = branch_types.id" + " INNER JOIN symbols ON symbol_id = symbols.id" + " INNER JOIN symbols to_symbols ON to_symbol_id = to_symbols.id" + " INNER JOIN dsos ON samples.dso_id = dsos.id" + " INNER JOIN dsos AS to_dsos ON samples.to_dso_id = to_dsos.id" + " WHERE samples.id > $$last_id$$" + where_clause + + " AND evsel_id = " + str(self.event_id) + + " ORDER BY samples.id" + " LIMIT " + str(glb_chunk_sz)) + self.fetcher = SQLFetcher(glb, sql, BranchDataPrep, self.AddSample) + self.fetcher.done.connect(self.Update) + self.fetcher.Fetch(glb_chunk_sz) + + def columnCount(self, parent=None): + return 8 + + def columnHeader(self, column): + return ("Time", "CPU", "Command", "PID", "TID", "Branch Type", "In Tx", "Branch")[column] + + def columnFont(self, column): + if column != 7: + return None + return QFont("Monospace") + + def DisplayData(self, item, index): + if item.level == 1: + self.FetchIfNeeded(item.row) + return item.getData(index.column()) + + def AddSample(self, data): + child = BranchLevelOneItem(self.glb, self.populated, data, self.root) + self.root.child_items.append(child) + self.populated += 1 + + def Update(self, fetched): + if not fetched: + self.more = False + self.progress.emit(0) + child_count = self.root.child_count + count = self.populated - child_count + if count > 0: + parent = QModelIndex() + self.beginInsertRows(parent, child_count, child_count + count - 1) + self.insertRows(child_count, count, parent) + self.root.child_count += count + self.endInsertRows() + self.progress.emit(self.root.child_count) + + def FetchMoreRecords(self, count): + current = self.root.child_count + if self.more: + self.fetcher.Fetch(count) + else: + self.progress.emit(0) + return current + + def HasMoreRecords(self): + return self.more + +# Branch window + +class BranchWindow(QMdiSubWindow): + + def __init__(self, glb, event_id, name, where_clause, parent=None): + super(BranchWindow, self).__init__(parent) + + model_name = "Branch Events " + str(event_id) + if len(where_clause): + model_name = where_clause + " " + model_name + + self.model = LookupCreateModel(model_name, lambda: BranchModel(glb, event_id, where_clause)) + + self.view = QTreeView() + self.view.setUniformRowHeights(True) + self.view.setModel(self.model) + + self.ResizeColumnsToContents() + + self.find_bar = FindBar(self, self, True) + + self.finder = ChildDataItemFinder(self.model.root) + + self.fetch_bar = FetchMoreRecordsBar(self.model, self) + + self.vbox = VBox(self.view, self.find_bar.Widget(), self.fetch_bar.Widget()) + + self.setWidget(self.vbox.Widget()) + + AddSubWindow(glb.mainwindow.mdi_area, self, name + " Branch Events") + + def ResizeColumnToContents(self, column, n): + # Using the view's resizeColumnToContents() here is extrememly slow + # so implement a crude alternative + mm = "MM" if column else "MMMM" + font = self.view.font() + metrics = QFontMetrics(font) + max = 0 + for row in xrange(n): + val = self.model.root.child_items[row].data[column] + len = metrics.width(str(val) + mm) + max = len if len > max else max + val = self.model.columnHeader(column) + len = metrics.width(str(val) + mm) + max = len if len > max else max + self.view.setColumnWidth(column, max) + + def ResizeColumnsToContents(self): + n = min(self.model.root.child_count, 100) + if n < 1: + # No data yet, so connect a signal to notify when there is + self.model.rowsInserted.connect(self.UpdateColumnWidths) + return + columns = self.model.columnCount() + for i in xrange(columns): + self.ResizeColumnToContents(i, n) + + def UpdateColumnWidths(self, *x): + # This only needs to be done once, so disconnect the signal now + self.model.rowsInserted.disconnect(self.UpdateColumnWidths) + self.ResizeColumnsToContents() + + def Find(self, value, direction, pattern, context): + self.view.setFocus() + self.find_bar.Busy() + self.finder.Find(value, direction, pattern, context, self.FindDone) + + def FindDone(self, row): + self.find_bar.Idle() + if row >= 0: + self.view.setCurrentIndex(self.model.index(row, 0, QModelIndex())) + else: + self.find_bar.NotFound() + +# Dialog data item converted and validated using a SQL table + +class SQLTableDialogDataItem(): + + def __init__(self, glb, label, placeholder_text, table_name, match_column, column_name1, column_name2, parent): + self.glb = glb + self.label = label + self.placeholder_text = placeholder_text + self.table_name = table_name + self.match_column = match_column + self.column_name1 = column_name1 + self.column_name2 = column_name2 + self.parent = parent + + self.value = "" + + self.widget = QLineEdit() + self.widget.editingFinished.connect(self.Validate) + self.widget.textChanged.connect(self.Invalidate) + self.red = False + self.error = "" + self.validated = True + + self.last_id = 0 + self.first_time = 0 + self.last_time = 2 ** 64 + if self.table_name == "": + query = QSqlQuery(self.glb.db) + QueryExec(query, "SELECT id, time FROM samples ORDER BY id DESC LIMIT 1") + if query.next(): + self.last_id = int(query.value(0)) + self.last_time = int(query.value(1)) + QueryExec(query, "SELECT time FROM samples WHERE time != 0 ORDER BY id LIMIT 1") + if query.next(): + self.first_time = int(query.value(0)) + if placeholder_text: + placeholder_text += ", between " + str(self.first_time) + " and " + str(self.last_time) + + if placeholder_text: + self.widget.setPlaceholderText(placeholder_text) + + def ValueToIds(self, value): + ids = [] + query = QSqlQuery(self.glb.db) + stmt = "SELECT id FROM " + self.table_name + " WHERE " + self.match_column + " = '" + value + "'" + ret = query.exec_(stmt) + if ret: + while query.next(): + ids.append(str(query.value(0))) + return ids + + def IdBetween(self, query, lower_id, higher_id, order): + QueryExec(query, "SELECT id FROM samples WHERE id > " + str(lower_id) + " AND id < " + str(higher_id) + " ORDER BY id " + order + " LIMIT 1") + if query.next(): + return True, int(query.value(0)) + else: + return False, 0 + + def BinarySearchTime(self, lower_id, higher_id, target_time, get_floor): + query = QSqlQuery(self.glb.db) + while True: + next_id = int((lower_id + higher_id) / 2) + QueryExec(query, "SELECT time FROM samples WHERE id = " + str(next_id)) + if not query.next(): + ok, dbid = self.IdBetween(query, lower_id, next_id, "DESC") + if not ok: + ok, dbid = self.IdBetween(query, next_id, higher_id, "") + if not ok: + return str(higher_id) + next_id = dbid + QueryExec(query, "SELECT time FROM samples WHERE id = " + str(next_id)) + next_time = int(query.value(0)) + if get_floor: + if target_time > next_time: + lower_id = next_id + else: + higher_id = next_id + if higher_id <= lower_id + 1: + return str(higher_id) + else: + if target_time >= next_time: + lower_id = next_id + else: + higher_id = next_id + if higher_id <= lower_id + 1: + return str(lower_id) + + def ConvertRelativeTime(self, val): + print "val ", val + mult = 1 + suffix = val[-2:] + if suffix == "ms": + mult = 1000000 + elif suffix == "us": + mult = 1000 + elif suffix == "ns": + mult = 1 + else: + return val + val = val[:-2].strip() + if not self.IsNumber(val): + return val + val = int(val) * mult + if val >= 0: + val += self.first_time + else: + val += self.last_time + return str(val) + + def ConvertTimeRange(self, vrange): + print "vrange ", vrange + if vrange[0] == "": + vrange[0] = str(self.first_time) + if vrange[1] == "": + vrange[1] = str(self.last_time) + vrange[0] = self.ConvertRelativeTime(vrange[0]) + vrange[1] = self.ConvertRelativeTime(vrange[1]) + print "vrange2 ", vrange + if not self.IsNumber(vrange[0]) or not self.IsNumber(vrange[1]): + return False + print "ok1" + beg_range = max(int(vrange[0]), self.first_time) + end_range = min(int(vrange[1]), self.last_time) + if beg_range > self.last_time or end_range < self.first_time: + return False + print "ok2" + vrange[0] = self.BinarySearchTime(0, self.last_id, beg_range, True) + vrange[1] = self.BinarySearchTime(1, self.last_id + 1, end_range, False) + print "vrange3 ", vrange + return True + + def AddTimeRange(self, value, ranges): + print "value ", value + n = value.count("-") + if n == 1: + pass + elif n == 2: + if value.split("-")[1].strip() == "": + n = 1 + elif n == 3: + n = 2 + else: + return False + pos = findnth(value, "-", n) + vrange = [value[:pos].strip() ,value[pos+1:].strip()] + if self.ConvertTimeRange(vrange): + ranges.append(vrange) + return True + return False + + def InvalidValue(self, value): + self.value = "" + palette = QPalette() + palette.setColor(QPalette.Text,Qt.red) + self.widget.setPalette(palette) + self.red = True + self.error = self.label + " invalid value '" + value + "'" + self.parent.ShowMessage(self.error) + + def IsNumber(self, value): + try: + x = int(value) + except: + x = 0 + return str(x) == value + + def Invalidate(self): + self.validated = False + + def Validate(self): + input_string = self.widget.text() + self.validated = True + if self.red: + palette = QPalette() + self.widget.setPalette(palette) + self.red = False + if not len(input_string.strip()): + self.error = "" + self.value = "" + return + if self.table_name == "": + ranges = [] + for value in [x.strip() for x in input_string.split(",")]: + if not self.AddTimeRange(value, ranges): + return self.InvalidValue(value) + ranges = [("(" + self.column_name1 + " >= " + r[0] + " AND " + self.column_name1 + " <= " + r[1] + ")") for r in ranges] + self.value = " OR ".join(ranges) + elif self.table_name == "": + singles = [] + ranges = [] + for value in [x.strip() for x in input_string.split(",")]: + if "-" in value: + vrange = value.split("-") + if len(vrange) != 2 or not self.IsNumber(vrange[0]) or not self.IsNumber(vrange[1]): + return self.InvalidValue(value) + ranges.append(vrange) + else: + if not self.IsNumber(value): + return self.InvalidValue(value) + singles.append(value) + ranges = [("(" + self.column_name1 + " >= " + r[0] + " AND " + self.column_name1 + " <= " + r[1] + ")") for r in ranges] + if len(singles): + ranges.append(self.column_name1 + " IN (" + ",".join(singles) + ")") + self.value = " OR ".join(ranges) + elif self.table_name: + all_ids = [] + for value in [x.strip() for x in input_string.split(",")]: + ids = self.ValueToIds(value) + if len(ids): + all_ids.extend(ids) + else: + return self.InvalidValue(value) + self.value = self.column_name1 + " IN (" + ",".join(all_ids) + ")" + if self.column_name2: + self.value = "( " + self.value + " OR " + self.column_name2 + " IN (" + ",".join(all_ids) + ") )" + else: + self.value = input_string.strip() + self.error = "" + self.parent.ClearMessage() + + def IsValid(self): + if not self.validated: + self.Validate() + if len(self.error): + self.parent.ShowMessage(self.error) + return False + return True + +# Selected branch report creation dialog + +class SelectedBranchDialog(QDialog): + + def __init__(self, glb, parent=None): + super(SelectedBranchDialog, self).__init__(parent) + + self.glb = glb + + self.name = "" + self.where_clause = "" + + self.setWindowTitle("Selected Branches") + self.setMinimumWidth(600) + + items = ( + ("Report name:", "Enter a name to appear in the window title bar", "", "", "", ""), + ("Time ranges:", "Enter time ranges", "", "", "samples.id", ""), + ("CPUs:", "Enter CPUs or ranges e.g. 0,5-6", "", "", "cpu", ""), + ("Commands:", "Only branches with these commands will be included", "comms", "comm", "comm_id", ""), + ("PIDs:", "Only branches with these process IDs will be included", "threads", "pid", "thread_id", ""), + ("TIDs:", "Only branches with these thread IDs will be included", "threads", "tid", "thread_id", ""), + ("DSOs:", "Only branches with these DSOs will be included", "dsos", "short_name", "samples.dso_id", "to_dso_id"), + ("Symbols:", "Only branches with these symbols will be included", "symbols", "name", "symbol_id", "to_symbol_id"), + ("Raw SQL clause: ", "Enter a raw SQL WHERE clause", "", "", "", ""), + ) + self.data_items = [SQLTableDialogDataItem(glb, *x, parent=self) for x in items] + + self.grid = QGridLayout() + + for row in xrange(len(self.data_items)): + self.grid.addWidget(QLabel(self.data_items[row].label), row, 0) + self.grid.addWidget(self.data_items[row].widget, row, 1) + + self.status = QLabel() + + self.ok_button = QPushButton("Ok", self) + self.ok_button.setDefault(True) + self.ok_button.released.connect(self.Ok) + self.ok_button.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) + + self.cancel_button = QPushButton("Cancel", self) + self.cancel_button.released.connect(self.reject) + self.cancel_button.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) + + self.hbox = QHBoxLayout() + #self.hbox.addStretch() + self.hbox.addWidget(self.status) + self.hbox.addWidget(self.ok_button) + self.hbox.addWidget(self.cancel_button) + + self.vbox = QVBoxLayout() + self.vbox.addLayout(self.grid) + self.vbox.addLayout(self.hbox) + + self.setLayout(self.vbox); + + def Ok(self): + self.name = self.data_items[0].value + if not self.name: + self.ShowMessage("Report name is required") + return + for d in self.data_items: + if not d.IsValid(): + return + for d in self.data_items[1:]: + if len(d.value): + if len(self.where_clause): + self.where_clause += " AND " + self.where_clause += d.value + if len(self.where_clause): + self.where_clause = " AND ( " + self.where_clause + " ) " + else: + self.ShowMessage("No selection") + return + self.accept() + + def ShowMessage(self, msg): + self.status.setText("" + msg) + + def ClearMessage(self): + self.status.setText("") + +# Event list + +def GetEventList(db): + events = [] + query = QSqlQuery(db) + QueryExec(query, "SELECT name FROM selected_events WHERE id > 0 ORDER BY id") + while query.next(): + events.append(query.value(0)) + return events + +# SQL data preparation + +def SQLTableDataPrep(query, count): + data = [] + for i in xrange(count): + data.append(query.value(i)) + return data + +# SQL table data model item + +class SQLTableItem(): + + def __init__(self, row, data): + self.row = row + self.data = data + + def getData(self, column): + return self.data[column] + +# SQL table data model + +class SQLTableModel(TableModel): + + progress = Signal(object) + + def __init__(self, glb, sql, column_count, parent=None): + super(SQLTableModel, self).__init__(parent) + self.glb = glb + self.more = True + self.populated = 0 + self.fetcher = SQLFetcher(glb, sql, lambda x, y=column_count: SQLTableDataPrep(x, y), self.AddSample) + self.fetcher.done.connect(self.Update) + self.fetcher.Fetch(glb_chunk_sz) + + def DisplayData(self, item, index): + self.FetchIfNeeded(item.row) + return item.getData(index.column()) + + def AddSample(self, data): + child = SQLTableItem(self.populated, data) + self.child_items.append(child) + self.populated += 1 + + def Update(self, fetched): + if not fetched: + self.more = False + self.progress.emit(0) + child_count = self.child_count + count = self.populated - child_count + if count > 0: + parent = QModelIndex() + self.beginInsertRows(parent, child_count, child_count + count - 1) + self.insertRows(child_count, count, parent) + self.child_count += count + self.endInsertRows() + self.progress.emit(self.child_count) + + def FetchMoreRecords(self, count): + current = self.child_count + if self.more: + self.fetcher.Fetch(count) + else: + self.progress.emit(0) + return current + + def HasMoreRecords(self): + return self.more + +# SQL automatic table data model + +class SQLAutoTableModel(SQLTableModel): + + def __init__(self, glb, table_name, parent=None): + sql = "SELECT * FROM " + table_name + " WHERE id > $$last_id$$ ORDER BY id LIMIT " + str(glb_chunk_sz) + if table_name == "comm_threads_view": + # For now, comm_threads_view has no id column + sql = "SELECT * FROM " + table_name + " WHERE comm_id > $$last_id$$ ORDER BY comm_id LIMIT " + str(glb_chunk_sz) + self.column_headers = [] + query = QSqlQuery(glb.db) + if glb.dbref.is_sqlite3: + QueryExec(query, "PRAGMA table_info(" + table_name + ")") + while query.next(): + self.column_headers.append(query.value(1)) + if table_name == "sqlite_master": + sql = "SELECT * FROM " + table_name + else: + if table_name[:19] == "information_schema.": + sql = "SELECT * FROM " + table_name + select_table_name = table_name[19:] + schema = "information_schema" + else: + select_table_name = table_name + schema = "public" + QueryExec(query, "SELECT column_name FROM information_schema.columns WHERE table_schema = '" + schema + "' and table_name = '" + select_table_name + "'") + while query.next(): + self.column_headers.append(query.value(0)) + super(SQLAutoTableModel, self).__init__(glb, sql, len(self.column_headers), parent) + + def columnCount(self, parent=None): + return len(self.column_headers) + + def columnHeader(self, column): + return self.column_headers[column] + +# Base class for custom ResizeColumnsToContents + +class ResizeColumnsToContentsBase(QObject): + + def __init__(self, parent=None): + super(ResizeColumnsToContentsBase, self).__init__(parent) + + def ResizeColumnToContents(self, column, n): + # Using the view's resizeColumnToContents() here is extrememly slow + # so implement a crude alternative + font = self.view.font() + metrics = QFontMetrics(font) + max = 0 + for row in xrange(n): + val = self.data_model.child_items[row].data[column] + len = metrics.width(str(val) + "MM") + max = len if len > max else max + val = self.data_model.columnHeader(column) + len = metrics.width(str(val) + "MM") + max = len if len > max else max + self.view.setColumnWidth(column, max) + + def ResizeColumnsToContents(self): + n = min(self.data_model.child_count, 100) + if n < 1: + # No data yet, so connect a signal to notify when there is + self.data_model.rowsInserted.connect(self.UpdateColumnWidths) + return + columns = self.data_model.columnCount() + for i in xrange(columns): + self.ResizeColumnToContents(i, n) + + def UpdateColumnWidths(self, *x): + # This only needs to be done once, so disconnect the signal now + self.data_model.rowsInserted.disconnect(self.UpdateColumnWidths) + self.ResizeColumnsToContents() + +# Table window + +class TableWindow(QMdiSubWindow, ResizeColumnsToContentsBase): + + def __init__(self, glb, table_name, parent=None): + super(TableWindow, self).__init__(parent) + + self.data_model = LookupCreateModel(table_name + " Table", lambda: SQLAutoTableModel(glb, table_name)) + + self.model = QSortFilterProxyModel() + self.model.setSourceModel(self.data_model) + + self.view = QTableView() + self.view.setModel(self.model) + self.view.setEditTriggers(QAbstractItemView.NoEditTriggers) + self.view.verticalHeader().setVisible(False) + self.view.sortByColumn(-1, Qt.AscendingOrder) + self.view.setSortingEnabled(True) + + self.ResizeColumnsToContents() + + self.find_bar = FindBar(self, self, True) + + self.finder = ChildDataItemFinder(self.data_model) + + self.fetch_bar = FetchMoreRecordsBar(self.data_model, self) + + self.vbox = VBox(self.view, self.find_bar.Widget(), self.fetch_bar.Widget()) + + self.setWidget(self.vbox.Widget()) + + AddSubWindow(glb.mainwindow.mdi_area, self, table_name + " Table") + + def Find(self, value, direction, pattern, context): + self.view.setFocus() + self.find_bar.Busy() + self.finder.Find(value, direction, pattern, context, self.FindDone) + + def FindDone(self, row): + self.find_bar.Idle() + if row >= 0: + self.view.setCurrentIndex(self.model.mapFromSource(self.data_model.index(row, 0, QModelIndex()))) + else: + self.find_bar.NotFound() + +# Table list + +def GetTableList(glb): + tables = [] + query = QSqlQuery(glb.db) + if glb.dbref.is_sqlite3: + QueryExec(query, "SELECT name FROM sqlite_master WHERE type IN ( 'table' , 'view' ) ORDER BY name") + else: + QueryExec(query, "SELECT table_name FROM information_schema.tables WHERE table_schema = 'public' AND table_type IN ( 'BASE TABLE' , 'VIEW' ) ORDER BY table_name") + while query.next(): + tables.append(query.value(0)) + if glb.dbref.is_sqlite3: + tables.append("sqlite_master") + else: + tables.append("information_schema.tables") + tables.append("information_schema.views") + tables.append("information_schema.columns") + return tables + +# Action Definition + +def CreateAction(label, tip, callback, parent=None, shortcut=None): + action = QAction(label, parent) + if shortcut != None: + action.setShortcuts(shortcut) + action.setStatusTip(tip) + action.triggered.connect(callback) + return action + +# Typical application actions + +def CreateExitAction(app, parent=None): + return CreateAction("&Quit", "Exit the application", app.closeAllWindows, parent, QKeySequence.Quit) + +# Typical MDI actions + +def CreateCloseActiveWindowAction(mdi_area): + return CreateAction("Cl&ose", "Close the active window", mdi_area.closeActiveSubWindow, mdi_area) + +def CreateCloseAllWindowsAction(mdi_area): + return CreateAction("Close &All", "Close all the windows", mdi_area.closeAllSubWindows, mdi_area) + +def CreateTileWindowsAction(mdi_area): + return CreateAction("&Tile", "Tile the windows", mdi_area.tileSubWindows, mdi_area) + +def CreateCascadeWindowsAction(mdi_area): + return CreateAction("&Cascade", "Cascade the windows", mdi_area.cascadeSubWindows, mdi_area) + +def CreateNextWindowAction(mdi_area): + return CreateAction("Ne&xt", "Move the focus to the next window", mdi_area.activateNextSubWindow, mdi_area, QKeySequence.NextChild) + +def CreatePreviousWindowAction(mdi_area): + return CreateAction("Pre&vious", "Move the focus to the previous window", mdi_area.activatePreviousSubWindow, mdi_area, QKeySequence.PreviousChild) + +# Typical MDI window menu + +class WindowMenu(): + + def __init__(self, mdi_area, menu): + self.mdi_area = mdi_area + self.window_menu = menu.addMenu("&Windows") + self.close_active_window = CreateCloseActiveWindowAction(mdi_area) + self.close_all_windows = CreateCloseAllWindowsAction(mdi_area) + self.tile_windows = CreateTileWindowsAction(mdi_area) + self.cascade_windows = CreateCascadeWindowsAction(mdi_area) + self.next_window = CreateNextWindowAction(mdi_area) + self.previous_window = CreatePreviousWindowAction(mdi_area) + self.window_menu.aboutToShow.connect(self.Update) + + def Update(self): + self.window_menu.clear() + sub_window_count = len(self.mdi_area.subWindowList()) + have_sub_windows = sub_window_count != 0 + self.close_active_window.setEnabled(have_sub_windows) + self.close_all_windows.setEnabled(have_sub_windows) + self.tile_windows.setEnabled(have_sub_windows) + self.cascade_windows.setEnabled(have_sub_windows) + self.next_window.setEnabled(have_sub_windows) + self.previous_window.setEnabled(have_sub_windows) + self.window_menu.addAction(self.close_active_window) + self.window_menu.addAction(self.close_all_windows) + self.window_menu.addSeparator() + self.window_menu.addAction(self.tile_windows) + self.window_menu.addAction(self.cascade_windows) + self.window_menu.addSeparator() + self.window_menu.addAction(self.next_window) + self.window_menu.addAction(self.previous_window) + if sub_window_count == 0: + return + self.window_menu.addSeparator() + nr = 1 + for sub_window in self.mdi_area.subWindowList(): + label = str(nr) + " " + sub_window.name + if nr < 10: + label = "&" + label + action = self.window_menu.addAction(label) + action.setCheckable(True) + action.setChecked(sub_window == self.mdi_area.activeSubWindow()) + action.triggered.connect(lambda x=nr: self.setActiveSubWindow(x)) + self.window_menu.addAction(action) + nr += 1 + + def setActiveSubWindow(self, nr): + self.mdi_area.setActiveSubWindow(self.mdi_area.subWindowList()[nr - 1]) + +# Help text + +glb_help_text = """ +

    Contents

    + +

    1. Reports

    +

    1.1 Context-Sensitive Call Graph

    +

    1.2 All branches

    +

    1.3 Selected branches

    +

    2. Tables

    +

    1. Reports

    +

    1.1 Context-Sensitive Call Graph

    +The result is a GUI window with a tree representing a context-sensitive +call-graph. Expanding a couple of levels of the tree and adjusting column +widths to suit will display something like: +
    +                                         Call Graph: pt_example
    +Call Path                          Object      Count   Time(ns)  Time(%)  Branch Count   Branch Count(%)
    +v- ls
    +    v- 2638:2638
    +        v- _start                  ld-2.19.so    1     10074071   100.0         211135            100.0
    +          |- unknown               unknown       1        13198     0.1              1              0.0
    +          >- _dl_start             ld-2.19.so    1      1400980    13.9          19637              9.3
    +          >- _d_linit_internal     ld-2.19.so    1       448152     4.4          11094              5.3
    +          v-__libc_start_main@plt  ls            1      8211741    81.5         180397             85.4
    +             >- _dl_fixup          ld-2.19.so    1         7607     0.1            108              0.1
    +             >- __cxa_atexit       libc-2.19.so  1        11737     0.1             10              0.0
    +             >- __libc_csu_init    ls            1        10354     0.1             10              0.0
    +             |- _setjmp            libc-2.19.so  1            0     0.0              4              0.0
    +             v- main               ls            1      8182043    99.6         180254             99.9
    +
    +

    Points to note:

    +
      +
    • The top level is a command name (comm)
    • +
    • The next level is a thread (pid:tid)
    • +
    • Subsequent levels are functions
    • +
    • 'Count' is the number of calls
    • +
    • 'Time' is the elapsed time until the function returns
    • +
    • Percentages are relative to the level above
    • +
    • 'Branch Count' is the total number of branches for that function and all functions that it calls +
    +

    Find

    +Ctrl-F displays a Find bar which finds function names by either an exact match or a pattern match. +The pattern matching symbols are ? for any character and * for zero or more characters. +

    1.2 All branches

    +The All branches report displays all branches in chronological order. +Not all data is fetched immediately. More records can be fetched using the Fetch bar provided. +

    Disassembly

    +Open a branch to display disassembly. This only works if: +
      +
    1. The disassembler is available. Currently, only Intel XED is supported - see Intel XED Setup
    2. +
    3. The object code is available. Currently, only the perf build ID cache is searched for object code. +The default directory ~/.debug can be overridden by setting environment variable PERF_BUILDID_DIR. +One exception is kcore where the DSO long name is used (refer dsos_view on the Tables menu), +or alternatively, set environment variable PERF_KCORE to the kcore file name.
    4. +
    +

    Intel XED Setup

    +To use Intel XED, libxed.so must be present. To build and install libxed.so: +
    +git clone https://github.com/intelxed/mbuild.git mbuild
    +git clone https://github.com/intelxed/xed
    +cd xed
    +./mfile.py --share
    +sudo ./mfile.py --prefix=/usr/local install
    +sudo ldconfig
    +
    +

    Find

    +Ctrl-F displays a Find bar which finds substrings by either an exact match or a regular expression match. +Refer to Python documentation for the regular expression syntax. +All columns are searched, but only currently fetched rows are searched. +

    1.3 Selected branches

    +This is the same as the All branches report but with the data reduced +by various selection criteria. A dialog box displays available criteria which are AND'ed together. +

    1.3.1 Time ranges

    +The time ranges hint text shows the total time range. Relative time ranges can also be entered in +ms, us or ns. Also, negative values are relative to the end of trace. Examples: +
    +	81073085947329-81073085958238	From 81073085947329 to 81073085958238
    +	100us-200us		From 100us to 200us
    +	10ms-			From 10ms to the end
    +	-100ns			The first 100ns
    +	-10ms-			The last 10ms
    +
    +N.B. Due to the granularity of timestamps, there could be no branches in any given time range. +

    2. Tables

    +The Tables menu shows all tables and views in the database. Most tables have an associated view +which displays the information in a more friendly way. Not all data for large tables is fetched +immediately. More records can be fetched using the Fetch bar provided. Columns can be sorted, +but that can be slow for large tables. +

    There are also tables of database meta-information. +For SQLite3 databases, the sqlite_master table is included. +For PostgreSQL databases, information_schema.tables/views/columns are included. +

    Find

    +Ctrl-F displays a Find bar which finds substrings by either an exact match or a regular expression match. +Refer to Python documentation for the regular expression syntax. +All columns are searched, but only currently fetched rows are searched. +

    N.B. Results are found in id order, so if the table is re-ordered, find-next and find-previous +will go to the next/previous result in id order, instead of display order. +""" + +# Help window + +class HelpWindow(QMdiSubWindow): + + def __init__(self, glb, parent=None): + super(HelpWindow, self).__init__(parent) + + self.text = QTextBrowser() + self.text.setHtml(glb_help_text) + self.text.setReadOnly(True) + self.text.setOpenExternalLinks(True) + + self.setWidget(self.text) + + AddSubWindow(glb.mainwindow.mdi_area, self, "Exported SQL Viewer Help") + +# Main window that only displays the help text + +class HelpOnlyWindow(QMainWindow): + + def __init__(self, parent=None): + super(HelpOnlyWindow, self).__init__(parent) + + self.setMinimumSize(200, 100) + self.resize(800, 600) + self.setWindowTitle("Exported SQL Viewer Help") + self.setWindowIcon(self.style().standardIcon(QStyle.SP_MessageBoxInformation)) + + self.text = QTextBrowser() + self.text.setHtml(glb_help_text) + self.text.setReadOnly(True) + self.text.setOpenExternalLinks(True) + + self.setCentralWidget(self.text) + +# Font resize + +def ResizeFont(widget, diff): + font = widget.font() + sz = font.pointSize() + font.setPointSize(sz + diff) + widget.setFont(font) + +def ShrinkFont(widget): + ResizeFont(widget, -1) + +def EnlargeFont(widget): + ResizeFont(widget, 1) + +# Unique name for sub-windows + +def NumberedWindowName(name, nr): + if nr > 1: + name += " <" + str(nr) + ">" + return name + +def UniqueSubWindowName(mdi_area, name): + nr = 1 + while True: + unique_name = NumberedWindowName(name, nr) + ok = True + for sub_window in mdi_area.subWindowList(): + if sub_window.name == unique_name: + ok = False + break + if ok: + return unique_name + nr += 1 + +# Add a sub-window + +def AddSubWindow(mdi_area, sub_window, name): + unique_name = UniqueSubWindowName(mdi_area, name) + sub_window.setMinimumSize(200, 100) + sub_window.resize(800, 600) + sub_window.setWindowTitle(unique_name) + sub_window.setAttribute(Qt.WA_DeleteOnClose) + sub_window.setWindowIcon(sub_window.style().standardIcon(QStyle.SP_FileIcon)) + sub_window.name = unique_name + mdi_area.addSubWindow(sub_window) + sub_window.show() + +# Main window + +class MainWindow(QMainWindow): + + def __init__(self, glb, parent=None): + super(MainWindow, self).__init__(parent) + + self.glb = glb + + self.setWindowTitle("Exported SQL Viewer: " + glb.dbname) + self.setWindowIcon(self.style().standardIcon(QStyle.SP_ComputerIcon)) + self.setMinimumSize(200, 100) + + self.mdi_area = QMdiArea() + self.mdi_area.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) + self.mdi_area.setVerticalScrollBarPolicy(Qt.ScrollBarAsNeeded) + + self.setCentralWidget(self.mdi_area) + + menu = self.menuBar() + + file_menu = menu.addMenu("&File") + file_menu.addAction(CreateExitAction(glb.app, self)) + + edit_menu = menu.addMenu("&Edit") + edit_menu.addAction(CreateAction("&Find...", "Find items", self.Find, self, QKeySequence.Find)) + edit_menu.addAction(CreateAction("Fetch &more records...", "Fetch more records", self.FetchMoreRecords, self, [QKeySequence(Qt.Key_F8)])) + edit_menu.addAction(CreateAction("&Shrink Font", "Make text smaller", self.ShrinkFont, self, [QKeySequence("Ctrl+-")])) + edit_menu.addAction(CreateAction("&Enlarge Font", "Make text bigger", self.EnlargeFont, self, [QKeySequence("Ctrl++")])) + + reports_menu = menu.addMenu("&Reports") + reports_menu.addAction(CreateAction("Context-Sensitive Call &Graph", "Create a new window containing a context-sensitive call graph", self.NewCallGraph, self)) + + self.EventMenu(GetEventList(glb.db), reports_menu) + + self.TableMenu(GetTableList(glb), menu) + + self.window_menu = WindowMenu(self.mdi_area, menu) + + help_menu = menu.addMenu("&Help") + help_menu.addAction(CreateAction("&Exported SQL Viewer Help", "Helpful information", self.Help, self, QKeySequence.HelpContents)) + + def Find(self): + win = self.mdi_area.activeSubWindow() + if win: + try: + win.find_bar.Activate() + except: + pass + + def FetchMoreRecords(self): + win = self.mdi_area.activeSubWindow() + if win: + try: + win.fetch_bar.Activate() + except: + pass + + def ShrinkFont(self): + win = self.mdi_area.activeSubWindow() + ShrinkFont(win.view) + + def EnlargeFont(self): + win = self.mdi_area.activeSubWindow() + EnlargeFont(win.view) + + def EventMenu(self, events, reports_menu): + branches_events = 0 + for event in events: + event = event.split(":")[0] + if event == "branches": + branches_events += 1 + dbid = 0 + for event in events: + dbid += 1 + event = event.split(":")[0] + if event == "branches": + label = "All branches" if branches_events == 1 else "All branches " + "(id=" + dbid + ")" + reports_menu.addAction(CreateAction(label, "Create a new window displaying branch events", lambda x=dbid: self.NewBranchView(x), self)) + label = "Selected branches" if branches_events == 1 else "Selected branches " + "(id=" + dbid + ")" + reports_menu.addAction(CreateAction(label, "Create a new window displaying branch events", lambda x=dbid: self.NewSelectedBranchView(x), self)) + + def TableMenu(self, tables, menu): + table_menu = menu.addMenu("&Tables") + for table in tables: + table_menu.addAction(CreateAction(table, "Create a new window containing a table view", lambda t=table: self.NewTableView(t), self)) + + def NewCallGraph(self): + CallGraphWindow(self.glb, self) + + def NewBranchView(self, event_id): + BranchWindow(self.glb, event_id, "", "", self) + + def NewSelectedBranchView(self, event_id): + dialog = SelectedBranchDialog(self.glb, self) + ret = dialog.exec_() + if ret: + BranchWindow(self.glb, event_id, dialog.name, dialog.where_clause, self) + + def NewTableView(self, table_name): + TableWindow(self.glb, table_name, self) + + def Help(self): + HelpWindow(self.glb, self) + +# XED Disassembler + +class xed_state_t(Structure): + + _fields_ = [ + ("mode", c_int), + ("width", c_int) + ] + +class XEDInstruction(): + + def __init__(self, libxed): + # Current xed_decoded_inst_t structure is 192 bytes. Use 512 to allow for future expansion + xedd_t = c_byte * 512 + self.xedd = xedd_t() + self.xedp = addressof(self.xedd) + libxed.xed_decoded_inst_zero(self.xedp) + self.state = xed_state_t() + self.statep = addressof(self.state) + # Buffer for disassembled instruction text + self.buffer = create_string_buffer(256) + self.bufferp = addressof(self.buffer) + +class LibXED(): + + def __init__(self): + try: + self.libxed = CDLL("libxed.so") + except: + self.libxed = None + if not self.libxed: + self.libxed = CDLL("/usr/local/lib/libxed.so") + + self.xed_tables_init = self.libxed.xed_tables_init + self.xed_tables_init.restype = None + self.xed_tables_init.argtypes = [] + + self.xed_decoded_inst_zero = self.libxed.xed_decoded_inst_zero + self.xed_decoded_inst_zero.restype = None + self.xed_decoded_inst_zero.argtypes = [ c_void_p ] + + self.xed_operand_values_set_mode = self.libxed.xed_operand_values_set_mode + self.xed_operand_values_set_mode.restype = None + self.xed_operand_values_set_mode.argtypes = [ c_void_p, c_void_p ] + + self.xed_decoded_inst_zero_keep_mode = self.libxed.xed_decoded_inst_zero_keep_mode + self.xed_decoded_inst_zero_keep_mode.restype = None + self.xed_decoded_inst_zero_keep_mode.argtypes = [ c_void_p ] + + self.xed_decode = self.libxed.xed_decode + self.xed_decode.restype = c_int + self.xed_decode.argtypes = [ c_void_p, c_void_p, c_uint ] + + self.xed_format_context = self.libxed.xed_format_context + self.xed_format_context.restype = c_uint + self.xed_format_context.argtypes = [ c_int, c_void_p, c_void_p, c_int, c_ulonglong, c_void_p, c_void_p ] + + self.xed_tables_init() + + def Instruction(self): + return XEDInstruction(self) + + def SetMode(self, inst, mode): + if mode: + inst.state.mode = 4 # 32-bit + inst.state.width = 4 # 4 bytes + else: + inst.state.mode = 1 # 64-bit + inst.state.width = 8 # 8 bytes + self.xed_operand_values_set_mode(inst.xedp, inst.statep) + + def DisassembleOne(self, inst, bytes_ptr, bytes_cnt, ip): + self.xed_decoded_inst_zero_keep_mode(inst.xedp) + err = self.xed_decode(inst.xedp, bytes_ptr, bytes_cnt) + if err: + return 0, "" + # Use AT&T mode (2), alternative is Intel (3) + ok = self.xed_format_context(2, inst.xedp, inst.bufferp, sizeof(inst.buffer), ip, 0, 0) + if not ok: + return 0, "" + # Return instruction length and the disassembled instruction text + # For now, assume the length is in byte 166 + return inst.xedd[166], inst.buffer.value + +def TryOpen(file_name): + try: + return open(file_name, "rb") + except: + return None + +def Is64Bit(f): + result = sizeof(c_void_p) + # ELF support only + pos = f.tell() + f.seek(0) + header = f.read(7) + f.seek(pos) + magic = header[0:4] + eclass = ord(header[4]) + encoding = ord(header[5]) + version = ord(header[6]) + if magic == chr(127) + "ELF" and eclass > 0 and eclass < 3 and encoding > 0 and encoding < 3 and version == 1: + result = True if eclass == 2 else False + return result + +# Global data + +class Glb(): + + def __init__(self, dbref, db, dbname): + self.dbref = dbref + self.db = db + self.dbname = dbname + self.home_dir = os.path.expanduser("~") + self.buildid_dir = os.getenv("PERF_BUILDID_DIR") + if self.buildid_dir: + self.buildid_dir += "/.build-id/" + else: + self.buildid_dir = self.home_dir + "/.debug/.build-id/" + self.app = None + self.mainwindow = None + self.instances_to_shutdown_on_exit = weakref.WeakSet() + try: + self.disassembler = LibXED() + self.have_disassembler = True + except: + self.have_disassembler = False + + def FileFromBuildId(self, build_id): + file_name = self.buildid_dir + build_id[0:2] + "/" + build_id[2:] + "/elf" + return TryOpen(file_name) + + def FileFromNamesAndBuildId(self, short_name, long_name, build_id): + # Assume current machine i.e. no support for virtualization + if short_name[0:7] == "[kernel" and os.path.basename(long_name) == "kcore": + file_name = os.getenv("PERF_KCORE") + f = TryOpen(file_name) if file_name else None + if f: + return f + # For now, no special handling if long_name is /proc/kcore + f = TryOpen(long_name) + if f: + return f + f = self.FileFromBuildId(build_id) + if f: + return f + return None + + def AddInstanceToShutdownOnExit(self, instance): + self.instances_to_shutdown_on_exit.add(instance) + + # Shutdown any background processes or threads + def ShutdownInstances(self): + for x in self.instances_to_shutdown_on_exit: + try: + x.Shutdown() + except: + pass + +# Database reference + +class DBRef(): + + def __init__(self, is_sqlite3, dbname): + self.is_sqlite3 = is_sqlite3 + self.dbname = dbname + + def Open(self, connection_name): + dbname = self.dbname + if self.is_sqlite3: + db = QSqlDatabase.addDatabase("QSQLITE", connection_name) + else: + db = QSqlDatabase.addDatabase("QPSQL", connection_name) + opts = dbname.split() + for opt in opts: + if "=" in opt: + opt = opt.split("=") + if opt[0] == "hostname": + db.setHostName(opt[1]) + elif opt[0] == "port": + db.setPort(int(opt[1])) + elif opt[0] == "username": + db.setUserName(opt[1]) + elif opt[0] == "password": + db.setPassword(opt[1]) + elif opt[0] == "dbname": + dbname = opt[1] + else: + dbname = opt + + db.setDatabaseName(dbname) + if not db.open(): + raise Exception("Failed to open database " + dbname + " error: " + db.lastError().text()) + return db, dbname + +# Main + +def Main(): + if (len(sys.argv) < 2): + print >> sys.stderr, "Usage is: exported-sql-viewer.py { | --help-only}" + raise Exception("Too few arguments") + + dbname = sys.argv[1] + if dbname == "--help-only": + app = QApplication(sys.argv) + mainwindow = HelpOnlyWindow() + mainwindow.show() + err = app.exec_() + sys.exit(err) + + is_sqlite3 = False + try: + f = open(dbname) + if f.read(15) == "SQLite format 3": + is_sqlite3 = True + f.close() + except: + pass + + dbref = DBRef(is_sqlite3, dbname) + db, dbname = dbref.Open("main") + glb = Glb(dbref, db, dbname) + app = QApplication(sys.argv) + glb.app = app + mainwindow = MainWindow(glb) + glb.mainwindow = mainwindow + mainwindow.show() + err = app.exec_() + glb.ShutdownInstances() + db.close() + sys.exit(err) + +if __name__ == "__main__": + Main() diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build index 6c108fa79ae3..0b2b8305c965 100644 --- a/tools/perf/tests/Build +++ b/tools/perf/tests/Build @@ -21,6 +21,7 @@ perf-y += python-use.o perf-y += bp_signal.o perf-y += bp_signal_overflow.o perf-y += bp_account.o +perf-y += wp.o perf-y += task-exit.o perf-y += sw-clock.o perf-y += mmap-thread-lookup.o diff --git a/tools/perf/tests/attr/test-record-group-sampling b/tools/perf/tests/attr/test-record-group-sampling index 8a33ca4f9e1f..f0729c454f16 100644 --- a/tools/perf/tests/attr/test-record-group-sampling +++ b/tools/perf/tests/attr/test-record-group-sampling @@ -37,4 +37,3 @@ sample_freq=0 sample_period=0 freq=0 write_backward=0 -sample_id_all=0 diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index d7a5e1b9aa6f..12c09e0ece71 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -120,6 +120,16 @@ static struct test generic_tests[] = { .func = test__bp_accounting, .is_supported = test__bp_signal_is_supported, }, + { + .desc = "Watchpoint", + .func = test__wp, + .is_supported = test__wp_is_supported, + .subtest = { + .skip_if_fail = false, + .get_nr = test__wp_subtest_get_nr, + .get_desc = test__wp_subtest_get_desc, + }, + }, { .desc = "Number of exit events of a simple workload", .func = test__task_exit, diff --git a/tools/perf/tests/evsel-tp-sched.c b/tools/perf/tests/evsel-tp-sched.c index 699561fa512c..5f8501c68da4 100644 --- a/tools/perf/tests/evsel-tp-sched.c +++ b/tools/perf/tests/evsel-tp-sched.c @@ -8,7 +8,7 @@ static int perf_evsel__test_field(struct perf_evsel *evsel, const char *name, int size, bool should_be_signed) { - struct format_field *field = perf_evsel__field(evsel, name); + struct tep_format_field *field = perf_evsel__field(evsel, name); int is_signed; int ret = 0; @@ -17,7 +17,7 @@ static int perf_evsel__test_field(struct perf_evsel *evsel, const char *name, return -1; } - is_signed = !!(field->flags | FIELD_IS_SIGNED); + is_signed = !!(field->flags | TEP_FIELD_IS_SIGNED); if (should_be_signed && !is_signed) { pr_debug("%s: \"%s\" signedness(%d) is wrong, should be %d\n", evsel->name, name, is_signed, should_be_signed); diff --git a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh index 3013ac8f83d0..cab7b0aea6ea 100755 --- a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh +++ b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh @@ -48,7 +48,7 @@ trace_libc_inet_pton_backtrace() { *) eventattr='max-stack=3' echo "getaddrinfo\+0x[[:xdigit:]]+[[:space:]]\($libc\)$" >> $expected - echo ".*\+0x[[:xdigit:]]+[[:space:]]\(.*/bin/ping.*\)$" >> $expected + echo ".*(\+0x[[:xdigit:]]+|\[unknown\])[[:space:]]\(.*/bin/ping.*\)$" >> $expected ;; esac diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index a9760e790563..b82f55fcc294 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -59,6 +59,9 @@ int test__python_use(struct test *test, int subtest); int test__bp_signal(struct test *test, int subtest); int test__bp_signal_overflow(struct test *test, int subtest); int test__bp_accounting(struct test *test, int subtest); +int test__wp(struct test *test, int subtest); +const char *test__wp_subtest_get_desc(int subtest); +int test__wp_subtest_get_nr(void); int test__task_exit(struct test *test, int subtest); int test__mem(struct test *test, int subtest); int test__sw_clock_freq(struct test *test, int subtest); @@ -106,6 +109,7 @@ int test__unit_number__scnprint(struct test *test, int subtest); int test__mem2node(struct test *t, int subtest); bool test__bp_signal_is_supported(void); +bool test__wp_is_supported(void); #if defined(__arm__) || defined(__aarch64__) #ifdef HAVE_DWARF_UNWIND_SUPPORT diff --git a/tools/perf/tests/wp.c b/tools/perf/tests/wp.c new file mode 100644 index 000000000000..f89e6806557b --- /dev/null +++ b/tools/perf/tests/wp.c @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "tests.h" +#include "debug.h" +#include "cloexec.h" + +#define WP_TEST_ASSERT_VAL(fd, text, val) \ +do { \ + long long count; \ + wp_read(fd, &count, sizeof(long long)); \ + TEST_ASSERT_VAL(text, count == val); \ +} while (0) + +volatile u64 data1; +volatile u8 data2[3]; + +static int wp_read(int fd, long long *count, int size) +{ + int ret = read(fd, count, size); + + if (ret != size) { + pr_debug("failed to read: %d\n", ret); + return -1; + } + return 0; +} + +static void get__perf_event_attr(struct perf_event_attr *attr, int wp_type, + void *wp_addr, unsigned long wp_len) +{ + memset(attr, 0, sizeof(struct perf_event_attr)); + attr->type = PERF_TYPE_BREAKPOINT; + attr->size = sizeof(struct perf_event_attr); + attr->config = 0; + attr->bp_type = wp_type; + attr->bp_addr = (unsigned long)wp_addr; + attr->bp_len = wp_len; + attr->sample_period = 1; + attr->sample_type = PERF_SAMPLE_IP; + attr->exclude_kernel = 1; + attr->exclude_hv = 1; +} + +static int __event(int wp_type, void *wp_addr, unsigned long wp_len) +{ + int fd; + struct perf_event_attr attr; + + get__perf_event_attr(&attr, wp_type, wp_addr, wp_len); + fd = sys_perf_event_open(&attr, 0, -1, -1, + perf_event_open_cloexec_flag()); + if (fd < 0) + pr_debug("failed opening event %x\n", attr.bp_type); + + return fd; +} + +static int wp_ro_test(void) +{ + int fd; + unsigned long tmp, tmp1 = rand(); + + fd = __event(HW_BREAKPOINT_R, (void *)&data1, sizeof(data1)); + if (fd < 0) + return -1; + + tmp = data1; + WP_TEST_ASSERT_VAL(fd, "RO watchpoint", 1); + + data1 = tmp1 + tmp; + WP_TEST_ASSERT_VAL(fd, "RO watchpoint", 1); + + close(fd); + return 0; +} + +static int wp_wo_test(void) +{ + int fd; + unsigned long tmp, tmp1 = rand(); + + fd = __event(HW_BREAKPOINT_W, (void *)&data1, sizeof(data1)); + if (fd < 0) + return -1; + + tmp = data1; + WP_TEST_ASSERT_VAL(fd, "WO watchpoint", 0); + + data1 = tmp1 + tmp; + WP_TEST_ASSERT_VAL(fd, "WO watchpoint", 1); + + close(fd); + return 0; +} + +static int wp_rw_test(void) +{ + int fd; + unsigned long tmp, tmp1 = rand(); + + fd = __event(HW_BREAKPOINT_R | HW_BREAKPOINT_W, (void *)&data1, + sizeof(data1)); + if (fd < 0) + return -1; + + tmp = data1; + WP_TEST_ASSERT_VAL(fd, "RW watchpoint", 1); + + data1 = tmp1 + tmp; + WP_TEST_ASSERT_VAL(fd, "RW watchpoint", 2); + + close(fd); + return 0; +} + +static int wp_modify_test(void) +{ + int fd, ret; + unsigned long tmp = rand(); + struct perf_event_attr new_attr; + + fd = __event(HW_BREAKPOINT_W, (void *)&data1, sizeof(data1)); + if (fd < 0) + return -1; + + data1 = tmp; + WP_TEST_ASSERT_VAL(fd, "Modify watchpoint", 1); + + /* Modify watchpoint with disabled = 1 */ + get__perf_event_attr(&new_attr, HW_BREAKPOINT_W, (void *)&data2[0], + sizeof(u8) * 2); + new_attr.disabled = 1; + ret = ioctl(fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &new_attr); + if (ret < 0) { + pr_debug("ioctl(PERF_EVENT_IOC_MODIFY_ATTRIBUTES) failed\n"); + close(fd); + return ret; + } + + data2[1] = tmp; /* Not Counted */ + WP_TEST_ASSERT_VAL(fd, "Modify watchpoint", 1); + + /* Enable the event */ + ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); + if (ret < 0) { + pr_debug("Failed to enable event\n"); + close(fd); + return ret; + } + + data2[1] = tmp; /* Counted */ + WP_TEST_ASSERT_VAL(fd, "Modify watchpoint", 2); + + data2[2] = tmp; /* Not Counted */ + WP_TEST_ASSERT_VAL(fd, "Modify watchpoint", 2); + + close(fd); + return 0; +} + +static bool wp_ro_supported(void) +{ +#if defined (__x86_64__) || defined (__i386__) + return false; +#else + return true; +#endif +} + +static void wp_ro_skip_msg(void) +{ +#if defined (__x86_64__) || defined (__i386__) + pr_debug("Hardware does not support read only watchpoints.\n"); +#endif +} + +static struct { + const char *desc; + int (*target_func)(void); + bool (*is_supported)(void); + void (*skip_msg)(void); +} wp_testcase_table[] = { + { + .desc = "Read Only Watchpoint", + .target_func = &wp_ro_test, + .is_supported = &wp_ro_supported, + .skip_msg = &wp_ro_skip_msg, + }, + { + .desc = "Write Only Watchpoint", + .target_func = &wp_wo_test, + }, + { + .desc = "Read / Write Watchpoint", + .target_func = &wp_rw_test, + }, + { + .desc = "Modify Watchpoint", + .target_func = &wp_modify_test, + }, +}; + +int test__wp_subtest_get_nr(void) +{ + return (int)ARRAY_SIZE(wp_testcase_table); +} + +const char *test__wp_subtest_get_desc(int i) +{ + if (i < 0 || i >= (int)ARRAY_SIZE(wp_testcase_table)) + return NULL; + return wp_testcase_table[i].desc; +} + +int test__wp(struct test *test __maybe_unused, int i) +{ + if (i < 0 || i >= (int)ARRAY_SIZE(wp_testcase_table)) + return TEST_FAIL; + + if (wp_testcase_table[i].is_supported && + !wp_testcase_table[i].is_supported()) { + wp_testcase_table[i].skip_msg(); + return TEST_SKIP; + } + + return !wp_testcase_table[i].target_func() ? TEST_OK : TEST_FAIL; +} + +/* The s390 so far does not have support for + * instruction breakpoint using the perf_event_open() system call. + */ +bool test__wp_is_supported(void) +{ +#if defined(__s390x__) + return false; +#else + return true; +#endif +} diff --git a/tools/perf/trace/beauty/Build b/tools/perf/trace/beauty/Build index f528ba35e140..304313073242 100644 --- a/tools/perf/trace/beauty/Build +++ b/tools/perf/trace/beauty/Build @@ -5,7 +5,9 @@ ifeq ($(SRCARCH),$(filter $(SRCARCH),x86)) libperf-y += ioctl.o endif libperf-y += kcmp.o +libperf-y += mount_flags.o libperf-y += pkey_alloc.o libperf-y += prctl.o +libperf-y += sockaddr.o libperf-y += socket.o libperf-y += statx.o diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index 9615af5d412b..039c29039b2c 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -24,15 +24,43 @@ struct strarray { } size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val); +size_t strarray__scnprintf_flags(struct strarray *sa, char *bf, size_t size, unsigned long flags); struct trace; struct thread; size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size); +extern struct strarray strarray__socket_families; + +/** + * augmented_arg: extra payload for syscall pointer arguments + + * If perf_sample->raw_size is more than what a syscall sys_enter_FOO puts, + * then its the arguments contents, so that we can show more than just a + * pointer. This will be done initially with eBPF, the start of that is at the + * tools/perf/examples/bpf/augmented_syscalls.c example for the openat, but + * will eventually be done automagically caching the running kernel tracefs + * events data into an eBPF C script, that then gets compiled and its .o file + * cached for subsequent use. For char pointers like the ones for 'open' like + * syscalls its easy, for the rest we should use DWARF or better, BTF, much + * more compact. + * + * @size: 8 if all we need is an integer, otherwise all of the augmented arg. + * @int_arg: will be used for integer like pointer contents, like 'accept's 'upeer_addrlen' + * @value: u64 aligned, for structs, pathnames + */ +struct augmented_arg { + int size; + int int_arg; + u64 value[]; +}; + /** * @val: value of syscall argument being formatted * @args: All the args, use syscall_args__val(arg, nth) to access one + * @augmented_args: Extra data that can be collected, for instance, with eBPF for expanding the pathname for open, etc + * @augmented_args_size: augmented_args total payload size * @thread: tid state (maps, pid, tid, etc) * @trace: 'perf trace' internals: all threads, etc * @parm: private area, may be an strarray, for instance @@ -43,6 +71,10 @@ size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_ struct syscall_arg { unsigned long val; unsigned char *args; + struct { + struct augmented_arg *args; + int size; + } augmented; struct thread *thread; struct trace *trace; void *parm; @@ -91,6 +123,12 @@ size_t syscall_arg__scnprintf_kcmp_type(char *bf, size_t size, struct syscall_ar size_t syscall_arg__scnprintf_kcmp_idx(char *bf, size_t size, struct syscall_arg *arg); #define SCA_KCMP_IDX syscall_arg__scnprintf_kcmp_idx +unsigned long syscall_arg__mask_val_mount_flags(struct syscall_arg *arg, unsigned long flags); +#define SCAMV_MOUNT_FLAGS syscall_arg__mask_val_mount_flags + +size_t syscall_arg__scnprintf_mount_flags(char *bf, size_t size, struct syscall_arg *arg); +#define SCA_MOUNT_FLAGS syscall_arg__scnprintf_mount_flags + size_t syscall_arg__scnprintf_pkey_alloc_access_rights(char *bf, size_t size, struct syscall_arg *arg); #define SCA_PKEY_ALLOC_ACCESS_RIGHTS syscall_arg__scnprintf_pkey_alloc_access_rights @@ -106,6 +144,9 @@ size_t syscall_arg__scnprintf_prctl_arg2(char *bf, size_t size, struct syscall_a size_t syscall_arg__scnprintf_prctl_arg3(char *bf, size_t size, struct syscall_arg *arg); #define SCA_PRCTL_ARG3 syscall_arg__scnprintf_prctl_arg3 +size_t syscall_arg__scnprintf_sockaddr(char *bf, size_t size, struct syscall_arg *arg); +#define SCA_SOCKADDR syscall_arg__scnprintf_sockaddr + size_t syscall_arg__scnprintf_socket_protocol(char *bf, size_t size, struct syscall_arg *arg); #define SCA_SK_PROTO syscall_arg__scnprintf_socket_protocol diff --git a/tools/perf/trace/beauty/clone.c b/tools/perf/trace/beauty/clone.c index d64d049ab991..010406500c30 100644 --- a/tools/perf/trace/beauty/clone.c +++ b/tools/perf/trace/beauty/clone.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/cone.c * * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo - * - * Released under the GPL v2. (and only v2, not any later version) */ #include "trace/beauty/beauty.h" diff --git a/tools/perf/trace/beauty/drm_ioctl.sh b/tools/perf/trace/beauty/drm_ioctl.sh index 9d3816815e60..9aa94fd523a9 100755 --- a/tools/perf/trace/beauty/drm_ioctl.sh +++ b/tools/perf/trace/beauty/drm_ioctl.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/drm/ diff --git a/tools/perf/trace/beauty/eventfd.c b/tools/perf/trace/beauty/eventfd.c index 5d6a477a6400..db5b9b492113 100644 --- a/tools/perf/trace/beauty/eventfd.c +++ b/tools/perf/trace/beauty/eventfd.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #ifndef EFD_SEMAPHORE #define EFD_SEMAPHORE 1 #endif diff --git a/tools/perf/trace/beauty/fcntl.c b/tools/perf/trace/beauty/fcntl.c index 9e8900c13cb1..e6de31674e24 100644 --- a/tools/perf/trace/beauty/fcntl.c +++ b/tools/perf/trace/beauty/fcntl.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/fcntl.c * * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo - * - * Released under the GPL v2. (and only v2, not any later version) */ #include "trace/beauty/beauty.h" diff --git a/tools/perf/trace/beauty/flock.c b/tools/perf/trace/beauty/flock.c index c4ff6ad30b06..cf02ae5f0ba6 100644 --- a/tools/perf/trace/beauty/flock.c +++ b/tools/perf/trace/beauty/flock.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include "trace/beauty/beauty.h" #include diff --git a/tools/perf/trace/beauty/futex_op.c b/tools/perf/trace/beauty/futex_op.c index 61850fbc85ff..1136bde56406 100644 --- a/tools/perf/trace/beauty/futex_op.c +++ b/tools/perf/trace/beauty/futex_op.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include #ifndef FUTEX_WAIT_BITSET diff --git a/tools/perf/trace/beauty/futex_val3.c b/tools/perf/trace/beauty/futex_val3.c index 26f6b3253511..138b7d588a70 100644 --- a/tools/perf/trace/beauty/futex_val3.c +++ b/tools/perf/trace/beauty/futex_val3.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include #ifndef FUTEX_BITSET_MATCH_ANY diff --git a/tools/perf/trace/beauty/ioctl.c b/tools/perf/trace/beauty/ioctl.c index 1be3b4cf0827..5d2a7fd8d407 100644 --- a/tools/perf/trace/beauty/ioctl.c +++ b/tools/perf/trace/beauty/ioctl.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/ioctl.c * * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo - * - * Released under the GPL v2. (and only v2, not any later version) */ #include "trace/beauty/beauty.h" diff --git a/tools/perf/trace/beauty/kcmp.c b/tools/perf/trace/beauty/kcmp.c index f62040eb9d5c..b276a274f203 100644 --- a/tools/perf/trace/beauty/kcmp.c +++ b/tools/perf/trace/beauty/kcmp.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/kcmp.c * * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo - * - * Released under the GPL v2. (and only v2, not any later version) */ #include "trace/beauty/beauty.h" diff --git a/tools/perf/trace/beauty/kcmp_type.sh b/tools/perf/trace/beauty/kcmp_type.sh index a3c304caa336..df8b17486d57 100755 --- a/tools/perf/trace/beauty/kcmp_type.sh +++ b/tools/perf/trace/beauty/kcmp_type.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ diff --git a/tools/perf/trace/beauty/kvm_ioctl.sh b/tools/perf/trace/beauty/kvm_ioctl.sh index c4699fd46bb6..4ce54f5bf756 100755 --- a/tools/perf/trace/beauty/kvm_ioctl.sh +++ b/tools/perf/trace/beauty/kvm_ioctl.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ diff --git a/tools/perf/trace/beauty/madvise_behavior.sh b/tools/perf/trace/beauty/madvise_behavior.sh index 431639eb4d29..4527d290cdfc 100755 --- a/tools/perf/trace/beauty/madvise_behavior.sh +++ b/tools/perf/trace/beauty/madvise_behavior.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/asm-generic/ diff --git a/tools/perf/trace/beauty/mmap.c b/tools/perf/trace/beauty/mmap.c index 9f68077b241b..c534bd96ef5c 100644 --- a/tools/perf/trace/beauty/mmap.c +++ b/tools/perf/trace/beauty/mmap.c @@ -1,5 +1,6 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include +#include static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size, struct syscall_arg *arg) @@ -30,50 +31,23 @@ static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size, #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot +static size_t mmap__scnprintf_flags(unsigned long flags, char *bf, size_t size) +{ +#include "trace/beauty/generated/mmap_flags_array.c" + static DEFINE_STRARRAY(mmap_flags); + + return strarray__scnprintf_flags(&strarray__mmap_flags, bf, size, flags); +} + static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size, struct syscall_arg *arg) { - int printed = 0, flags = arg->val; + unsigned long flags = arg->val; if (flags & MAP_ANONYMOUS) arg->mask |= (1 << 4) | (1 << 5); /* Mask 4th ('fd') and 5th ('offset') args, ignored */ -#define P_MMAP_FLAG(n) \ - if (flags & MAP_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~MAP_##n; \ - } - - P_MMAP_FLAG(SHARED); - P_MMAP_FLAG(PRIVATE); -#ifdef MAP_32BIT - P_MMAP_FLAG(32BIT); -#endif - P_MMAP_FLAG(ANONYMOUS); - P_MMAP_FLAG(DENYWRITE); - P_MMAP_FLAG(EXECUTABLE); - P_MMAP_FLAG(FILE); - P_MMAP_FLAG(FIXED); -#ifdef MAP_FIXED_NOREPLACE - P_MMAP_FLAG(FIXED_NOREPLACE); -#endif - P_MMAP_FLAG(GROWSDOWN); - P_MMAP_FLAG(HUGETLB); - P_MMAP_FLAG(LOCKED); - P_MMAP_FLAG(NONBLOCK); - P_MMAP_FLAG(NORESERVE); - P_MMAP_FLAG(POPULATE); - P_MMAP_FLAG(STACK); - P_MMAP_FLAG(UNINITIALIZED); -#ifdef MAP_SYNC - P_MMAP_FLAG(SYNC); -#endif -#undef P_MMAP_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; + return mmap__scnprintf_flags(flags, bf, size); } #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags diff --git a/tools/perf/trace/beauty/mmap_flags.sh b/tools/perf/trace/beauty/mmap_flags.sh new file mode 100755 index 000000000000..22c3fdca8975 --- /dev/null +++ b/tools/perf/trace/beauty/mmap_flags.sh @@ -0,0 +1,32 @@ +#!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 + +if [ $# -ne 2 ] ; then + [ $# -eq 1 ] && hostarch=$1 || hostarch=`uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/` + header_dir=tools/include/uapi/asm-generic + arch_header_dir=tools/arch/${hostarch}/include/uapi/asm +else + header_dir=$1 + arch_header_dir=$2 +fi + +arch_mman=${arch_header_dir}/mman.h + +# those in egrep -vw are flags, we want just the bits + +printf "static const char *mmap_flags[] = {\n" +regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MAP_([[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' +egrep -q $regex ${arch_mman} && \ +(egrep $regex ${arch_mman} | \ + sed -r "s/$regex/\2 \1/g" | \ + xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n") +egrep -q '#[[:space:]]*include[[:space:]]+.*' ${arch_mman} && +(egrep $regex ${header_dir}/mman.h | \ + sed -r "s/$regex/\2 \1/g" | \ + xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n") +printf "};\n" diff --git a/tools/perf/trace/beauty/mode_t.c b/tools/perf/trace/beauty/mode_t.c index d929ad7dd97b..6879d36d3004 100644 --- a/tools/perf/trace/beauty/mode_t.c +++ b/tools/perf/trace/beauty/mode_t.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include #include #include diff --git a/tools/perf/trace/beauty/mount_flags.c b/tools/perf/trace/beauty/mount_flags.c new file mode 100644 index 000000000000..712935c6620a --- /dev/null +++ b/tools/perf/trace/beauty/mount_flags.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: LGPL-2.1 +/* + * trace/beauty/mount_flags.c + * + * Copyright (C) 2018, Red Hat Inc, Arnaldo Carvalho de Melo + */ + +#include "trace/beauty/beauty.h" +#include +#include +#include +#include + +static size_t mount__scnprintf_flags(unsigned long flags, char *bf, size_t size) +{ +#include "trace/beauty/generated/mount_flags_array.c" + static DEFINE_STRARRAY(mount_flags); + + return strarray__scnprintf_flags(&strarray__mount_flags, bf, size, flags); +} + +unsigned long syscall_arg__mask_val_mount_flags(struct syscall_arg *arg __maybe_unused, unsigned long flags) +{ + // do_mount in fs/namespace.c: + /* + * Pre-0.97 versions of mount() didn't have a flags word. When the + * flags word was introduced its top half was required to have the + * magic value 0xC0ED, and this remained so until 2.4.0-test9. + * Therefore, if this magic number is present, it carries no + * information and must be discarded. + */ + if ((flags & MS_MGC_MSK) == MS_MGC_VAL) + flags &= ~MS_MGC_MSK; + + return flags; +} + +size_t syscall_arg__scnprintf_mount_flags(char *bf, size_t size, struct syscall_arg *arg) +{ + unsigned long flags = arg->val; + + return mount__scnprintf_flags(flags, bf, size); +} diff --git a/tools/perf/trace/beauty/mount_flags.sh b/tools/perf/trace/beauty/mount_flags.sh new file mode 100755 index 000000000000..45547573a1db --- /dev/null +++ b/tools/perf/trace/beauty/mount_flags.sh @@ -0,0 +1,15 @@ +#!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 + +[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ + +printf "static const char *mount_flags[] = {\n" +regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MS_([[:alnum:]_]+)[[:space:]]+([[:digit:]]+)[[:space:]]*.*' +egrep $regex ${header_dir}/fs.h | egrep -v '(MSK|VERBOSE|MGC_VAL)\>' | \ + sed -r "s/$regex/\2 \2 \1/g" | sort -n | \ + xargs printf "\t[%s ? (ilog2(%s) + 1) : 0] = \"%s\",\n" +regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MS_([[:alnum:]_]+)[[:space:]]+\(1<<([[:digit:]]+)\)[[:space:]]*.*' +egrep $regex ${header_dir}/fs.h | \ + sed -r "s/$regex/\2 \1/g" | \ + xargs printf "\t[%s + 1] = \"%s\",\n" +printf "};\n" diff --git a/tools/perf/trace/beauty/msg_flags.c b/tools/perf/trace/beauty/msg_flags.c index c064d6aae659..1b9d6306d274 100644 --- a/tools/perf/trace/beauty/msg_flags.c +++ b/tools/perf/trace/beauty/msg_flags.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include #include diff --git a/tools/perf/trace/beauty/open_flags.c b/tools/perf/trace/beauty/open_flags.c index 6aec6178a99d..cc673fec9184 100644 --- a/tools/perf/trace/beauty/open_flags.c +++ b/tools/perf/trace/beauty/open_flags.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include #include #include diff --git a/tools/perf/trace/beauty/perf_event_open.c b/tools/perf/trace/beauty/perf_event_open.c index 2bafd7c995ff..981185c1974b 100644 --- a/tools/perf/trace/beauty/perf_event_open.c +++ b/tools/perf/trace/beauty/perf_event_open.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #ifndef PERF_FLAG_FD_NO_GROUP # define PERF_FLAG_FD_NO_GROUP (1UL << 0) #endif diff --git a/tools/perf/trace/beauty/perf_ioctl.sh b/tools/perf/trace/beauty/perf_ioctl.sh index 6492c74df928..9aabd9743ef6 100755 --- a/tools/perf/trace/beauty/perf_ioctl.sh +++ b/tools/perf/trace/beauty/perf_ioctl.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ diff --git a/tools/perf/trace/beauty/pid.c b/tools/perf/trace/beauty/pid.c index 0313df342830..1a6acc46807b 100644 --- a/tools/perf/trace/beauty/pid.c +++ b/tools/perf/trace/beauty/pid.c @@ -1,4 +1,5 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 + size_t syscall_arg__scnprintf_pid(char *bf, size_t size, struct syscall_arg *arg) { int pid = arg->val; diff --git a/tools/perf/trace/beauty/pkey_alloc.c b/tools/perf/trace/beauty/pkey_alloc.c index 2ba784a3734a..1b8ed4cac815 100644 --- a/tools/perf/trace/beauty/pkey_alloc.c +++ b/tools/perf/trace/beauty/pkey_alloc.c @@ -1,40 +1,36 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/pkey_alloc.c * * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo - * - * Released under the GPL v2. (and only v2, not any later version) */ #include "trace/beauty/beauty.h" #include #include -static size_t pkey_alloc__scnprintf_access_rights(int access_rights, char *bf, size_t size) +size_t strarray__scnprintf_flags(struct strarray *sa, char *bf, size_t size, unsigned long flags) { int i, printed = 0; -#include "trace/beauty/generated/pkey_alloc_access_rights_array.c" - static DEFINE_STRARRAY(pkey_alloc_access_rights); - - if (access_rights == 0) { - const char *s = strarray__pkey_alloc_access_rights.entries[0]; + if (flags == 0) { + const char *s = sa->entries[0]; if (s) return scnprintf(bf, size, "%s", s); return scnprintf(bf, size, "%d", 0); } - for (i = 1; i < strarray__pkey_alloc_access_rights.nr_entries; ++i) { - int bit = 1 << (i - 1); + for (i = 1; i < sa->nr_entries; ++i) { + unsigned long bit = 1UL << (i - 1); - if (!(access_rights & bit)) + if (!(flags & bit)) continue; if (printed != 0) printed += scnprintf(bf + printed, size - printed, "|"); - if (strarray__pkey_alloc_access_rights.entries[i] != NULL) - printed += scnprintf(bf + printed, size - printed, "%s", strarray__pkey_alloc_access_rights.entries[i]); + if (sa->entries[i] != NULL) + printed += scnprintf(bf + printed, size - printed, "%s", sa->entries[i]); else printed += scnprintf(bf + printed, size - printed, "0x%#", bit); } @@ -42,6 +38,14 @@ static size_t pkey_alloc__scnprintf_access_rights(int access_rights, char *bf, s return printed; } +static size_t pkey_alloc__scnprintf_access_rights(int access_rights, char *bf, size_t size) +{ +#include "trace/beauty/generated/pkey_alloc_access_rights_array.c" + static DEFINE_STRARRAY(pkey_alloc_access_rights); + + return strarray__scnprintf_flags(&strarray__pkey_alloc_access_rights, bf, size, access_rights); +} + size_t syscall_arg__scnprintf_pkey_alloc_access_rights(char *bf, size_t size, struct syscall_arg *arg) { unsigned long cmd = arg->val; diff --git a/tools/perf/trace/beauty/pkey_alloc_access_rights.sh b/tools/perf/trace/beauty/pkey_alloc_access_rights.sh index e0a51aeb20b2..f8f1b560cf8a 100755 --- a/tools/perf/trace/beauty/pkey_alloc_access_rights.sh +++ b/tools/perf/trace/beauty/pkey_alloc_access_rights.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/asm-generic/ diff --git a/tools/perf/trace/beauty/prctl.c b/tools/perf/trace/beauty/prctl.c index 246130dad6c4..be7a5d395975 100644 --- a/tools/perf/trace/beauty/prctl.c +++ b/tools/perf/trace/beauty/prctl.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/prctl.c * * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo - * - * Released under the GPL v2. (and only v2, not any later version) */ #include "trace/beauty/beauty.h" diff --git a/tools/perf/trace/beauty/prctl_option.sh b/tools/perf/trace/beauty/prctl_option.sh index f24722146ebe..d32f8f1124af 100755 --- a/tools/perf/trace/beauty/prctl_option.sh +++ b/tools/perf/trace/beauty/prctl_option.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ diff --git a/tools/perf/trace/beauty/sched_policy.c b/tools/perf/trace/beauty/sched_policy.c index ba5096ae76b6..48f2b5c9aa3e 100644 --- a/tools/perf/trace/beauty/sched_policy.c +++ b/tools/perf/trace/beauty/sched_policy.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include /* diff --git a/tools/perf/trace/beauty/seccomp.c b/tools/perf/trace/beauty/seccomp.c index b7097fd5fed9..e36156b19c70 100644 --- a/tools/perf/trace/beauty/seccomp.c +++ b/tools/perf/trace/beauty/seccomp.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #ifndef SECCOMP_SET_MODE_STRICT #define SECCOMP_SET_MODE_STRICT 0 #endif diff --git a/tools/perf/trace/beauty/signum.c b/tools/perf/trace/beauty/signum.c index bde18a53f090..587fec545b8a 100644 --- a/tools/perf/trace/beauty/signum.c +++ b/tools/perf/trace/beauty/signum.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg) diff --git a/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh b/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh index eb511bb5fbd3..e0803b957593 100755 --- a/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh +++ b/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/sound/ diff --git a/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh b/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh index 6818392968b2..7a464a7bf913 100755 --- a/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh +++ b/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/sound/ diff --git a/tools/perf/trace/beauty/sockaddr.c b/tools/perf/trace/beauty/sockaddr.c new file mode 100644 index 000000000000..9410ad230f10 --- /dev/null +++ b/tools/perf/trace/beauty/sockaddr.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: LGPL-2.1 +// Copyright (C) 2018, Red Hat Inc, Arnaldo Carvalho de Melo + +#include "trace/beauty/beauty.h" +#include +#include +#include +#include + +static const char *socket_families[] = { + "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM", + "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI", + "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC", + "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC", + "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF", + "ALG", "NFC", "VSOCK", +}; +DEFINE_STRARRAY(socket_families); + +static size_t af_inet__scnprintf(struct sockaddr *sa, char *bf, size_t size) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)sa; + char tmp[16]; + return scnprintf(bf, size, ", port: %d, addr: %s", ntohs(sin->sin_port), + inet_ntop(sin->sin_family, &sin->sin_addr, tmp, sizeof(tmp))); +} + +static size_t af_inet6__scnprintf(struct sockaddr *sa, char *bf, size_t size) +{ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; + u32 flowinfo = ntohl(sin6->sin6_flowinfo); + char tmp[512]; + size_t printed = scnprintf(bf, size, ", port: %d, addr: %s", ntohs(sin6->sin6_port), + inet_ntop(sin6->sin6_family, &sin6->sin6_addr, tmp, sizeof(tmp))); + if (flowinfo != 0) + printed += scnprintf(bf + printed, size - printed, ", flowinfo: %lu", flowinfo); + if (sin6->sin6_scope_id != 0) + printed += scnprintf(bf + printed, size - printed, ", scope_id: %lu", sin6->sin6_scope_id); + + return printed; +} + +static size_t af_local__scnprintf(struct sockaddr *sa, char *bf, size_t size) +{ + struct sockaddr_un *sun = (struct sockaddr_un *)sa; + return scnprintf(bf, size, ", path: %s", sun->sun_path); +} + +static size_t (*af_scnprintfs[])(struct sockaddr *sa, char *bf, size_t size) = { + [AF_LOCAL] = af_local__scnprintf, + [AF_INET] = af_inet__scnprintf, + [AF_INET6] = af_inet6__scnprintf, +}; + +static size_t syscall_arg__scnprintf_augmented_sockaddr(struct syscall_arg *arg, char *bf, size_t size) +{ + struct sockaddr *sa = (struct sockaddr *)arg->augmented.args; + char family[32]; + size_t printed; + + strarray__scnprintf(&strarray__socket_families, family, sizeof(family), "%d", sa->sa_family); + printed = scnprintf(bf, size, "{ .family: %s", family); + + if (sa->sa_family < ARRAY_SIZE(af_scnprintfs) && af_scnprintfs[sa->sa_family]) + printed += af_scnprintfs[sa->sa_family](sa, bf + printed, size - printed); + + return printed + scnprintf(bf + printed, size - printed, " }"); +} + +size_t syscall_arg__scnprintf_sockaddr(char *bf, size_t size, struct syscall_arg *arg) +{ + if (arg->augmented.args) + return syscall_arg__scnprintf_augmented_sockaddr(arg, bf, size); + + return scnprintf(bf, size, "%#x", arg->val); +} diff --git a/tools/perf/trace/beauty/socket.c b/tools/perf/trace/beauty/socket.c index 65227269384b..d971a2596417 100644 --- a/tools/perf/trace/beauty/socket.c +++ b/tools/perf/trace/beauty/socket.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/socket.c * diff --git a/tools/perf/trace/beauty/socket_ipproto.sh b/tools/perf/trace/beauty/socket_ipproto.sh index a3cc24633bec..de0f2f29017f 100755 --- a/tools/perf/trace/beauty/socket_ipproto.sh +++ b/tools/perf/trace/beauty/socket_ipproto.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ diff --git a/tools/perf/trace/beauty/socket_type.c b/tools/perf/trace/beauty/socket_type.c index bca26aef4a77..a63a9a332aa0 100644 --- a/tools/perf/trace/beauty/socket_type.c +++ b/tools/perf/trace/beauty/socket_type.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include #include diff --git a/tools/perf/trace/beauty/statx.c b/tools/perf/trace/beauty/statx.c index 5643b692af4c..630f2760dd66 100644 --- a/tools/perf/trace/beauty/statx.c +++ b/tools/perf/trace/beauty/statx.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * trace/beauty/statx.c * * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo - * - * Released under the GPL v2. (and only v2, not any later version) */ #include "trace/beauty/beauty.h" diff --git a/tools/perf/trace/beauty/vhost_virtio_ioctl.sh b/tools/perf/trace/beauty/vhost_virtio_ioctl.sh index 0f6a5197d0be..439773daaf77 100755 --- a/tools/perf/trace/beauty/vhost_virtio_ioctl.sh +++ b/tools/perf/trace/beauty/vhost_virtio_ioctl.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ diff --git a/tools/perf/trace/beauty/waitid_options.c b/tools/perf/trace/beauty/waitid_options.c index 8465281a093d..42ff58ad613b 100644 --- a/tools/perf/trace/beauty/waitid_options.c +++ b/tools/perf/trace/beauty/waitid_options.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: LGPL-2.1 #include #include diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 7efe15b9618d..ecd9f9ceda77 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -73,6 +73,7 @@ libperf-y += vdso.o libperf-y += counts.o libperf-y += stat.o libperf-y += stat-shadow.o +libperf-y += stat-display.o libperf-y += record.o libperf-y += srcline.o libperf-y += data.o diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 28cd6a17491b..6936daf89ddd 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -139,6 +139,7 @@ static int arch__associate_ins_ops(struct arch* arch, const char *name, struct i #include "arch/x86/annotate/instructions.c" #include "arch/powerpc/annotate/instructions.c" #include "arch/s390/annotate/instructions.c" +#include "arch/sparc/annotate/instructions.c" static struct arch architectures[] = { { @@ -170,6 +171,13 @@ static struct arch architectures[] = { .comment_char = '#', }, }, + { + .name = "sparc", + .init = sparc__annotate_init, + .objdump = { + .comment_char = '#', + }, + }, }; static void ins__delete(struct ins_operands *ops) diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index db1511359c5e..72d5ba2479bf 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -906,9 +906,8 @@ out_free: return err; } -int perf_event__process_auxtrace_info(struct perf_tool *tool __maybe_unused, - union perf_event *event, - struct perf_session *session) +int perf_event__process_auxtrace_info(struct perf_session *session, + union perf_event *event) { enum auxtrace_type type = event->auxtrace_info.type; @@ -932,9 +931,8 @@ int perf_event__process_auxtrace_info(struct perf_tool *tool __maybe_unused, } } -s64 perf_event__process_auxtrace(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session) +s64 perf_event__process_auxtrace(struct perf_session *session, + union perf_event *event) { s64 err; @@ -950,7 +948,7 @@ s64 perf_event__process_auxtrace(struct perf_tool *tool, if (!session->auxtrace || event->header.type != PERF_RECORD_AUXTRACE) return -EINVAL; - err = session->auxtrace->process_auxtrace_event(session, event, tool); + err = session->auxtrace->process_auxtrace_event(session, event, session->tool); if (err < 0) return err; @@ -964,16 +962,23 @@ s64 perf_event__process_auxtrace(struct perf_tool *tool, #define PERF_ITRACE_DEFAULT_LAST_BRANCH_SZ 64 #define PERF_ITRACE_MAX_LAST_BRANCH_SZ 1024 -void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts) +void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts, + bool no_sample) { - synth_opts->instructions = true; synth_opts->branches = true; synth_opts->transactions = true; synth_opts->ptwrites = true; synth_opts->pwr_events = true; synth_opts->errors = true; - synth_opts->period_type = PERF_ITRACE_DEFAULT_PERIOD_TYPE; - synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD; + if (no_sample) { + synth_opts->period_type = PERF_ITRACE_PERIOD_INSTRUCTIONS; + synth_opts->period = 1; + synth_opts->calls = true; + } else { + synth_opts->instructions = true; + synth_opts->period_type = PERF_ITRACE_DEFAULT_PERIOD_TYPE; + synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD; + } synth_opts->callchain_sz = PERF_ITRACE_DEFAULT_CALLCHAIN_SZ; synth_opts->last_branch_sz = PERF_ITRACE_DEFAULT_LAST_BRANCH_SZ; synth_opts->initial_skip = 0; @@ -1001,7 +1006,7 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str, } if (!str) { - itrace_synth_opts__set_default(synth_opts); + itrace_synth_opts__set_default(synth_opts, false); return 0; } @@ -1185,9 +1190,8 @@ void events_stats__auxtrace_error_warn(const struct events_stats *stats) } } -int perf_event__process_auxtrace_error(struct perf_tool *tool __maybe_unused, - union perf_event *event, - struct perf_session *session) +int perf_event__process_auxtrace_error(struct perf_session *session, + union perf_event *event) { if (auxtrace__dont_decode(session)) return 0; @@ -1196,11 +1200,12 @@ int perf_event__process_auxtrace_error(struct perf_tool *tool __maybe_unused, return 0; } -static int __auxtrace_mmap__read(struct auxtrace_mmap *mm, +static int __auxtrace_mmap__read(struct perf_mmap *map, struct auxtrace_record *itr, struct perf_tool *tool, process_auxtrace_t fn, bool snapshot, size_t snapshot_size) { + struct auxtrace_mmap *mm = &map->auxtrace_mmap; u64 head, old = mm->prev, offset, ref; unsigned char *data = mm->base; size_t size, head_off, old_off, len1, len2, padding; @@ -1287,7 +1292,7 @@ static int __auxtrace_mmap__read(struct auxtrace_mmap *mm, ev.auxtrace.tid = mm->tid; ev.auxtrace.cpu = mm->cpu; - if (fn(tool, &ev, data1, len1, data2, len2)) + if (fn(tool, map, &ev, data1, len1, data2, len2)) return -1; mm->prev = head; @@ -1306,18 +1311,18 @@ static int __auxtrace_mmap__read(struct auxtrace_mmap *mm, return 1; } -int auxtrace_mmap__read(struct auxtrace_mmap *mm, struct auxtrace_record *itr, +int auxtrace_mmap__read(struct perf_mmap *map, struct auxtrace_record *itr, struct perf_tool *tool, process_auxtrace_t fn) { - return __auxtrace_mmap__read(mm, itr, tool, fn, false, 0); + return __auxtrace_mmap__read(map, itr, tool, fn, false, 0); } -int auxtrace_mmap__read_snapshot(struct auxtrace_mmap *mm, +int auxtrace_mmap__read_snapshot(struct perf_mmap *map, struct auxtrace_record *itr, struct perf_tool *tool, process_auxtrace_t fn, size_t snapshot_size) { - return __auxtrace_mmap__read(mm, itr, tool, fn, true, snapshot_size); + return __auxtrace_mmap__read(map, itr, tool, fn, true, snapshot_size); } /** diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 71fc3bd74299..8e50f96d4b23 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -23,6 +23,7 @@ #include #include #include +#include #include "../perf.h" #include "event.h" @@ -33,6 +34,7 @@ union perf_event; struct perf_session; struct perf_evlist; struct perf_tool; +struct perf_mmap; struct option; struct record_opts; struct auxtrace_info_event; @@ -56,6 +58,7 @@ enum itrace_period_type { /** * struct itrace_synth_opts - AUX area tracing synthesis options. * @set: indicates whether or not options have been set + * @default_no_sample: Default to no sampling. * @inject: indicates the event (not just the sample) must be fully synthesized * because 'perf inject' will write it out * @instructions: whether to synthesize 'instructions' events @@ -80,6 +83,7 @@ enum itrace_period_type { */ struct itrace_synth_opts { bool set; + bool default_no_sample; bool inject; bool instructions; bool branches; @@ -434,13 +438,14 @@ void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp, bool per_cpu); typedef int (*process_auxtrace_t)(struct perf_tool *tool, + struct perf_mmap *map, union perf_event *event, void *data1, size_t len1, void *data2, size_t len2); -int auxtrace_mmap__read(struct auxtrace_mmap *mm, struct auxtrace_record *itr, +int auxtrace_mmap__read(struct perf_mmap *map, struct auxtrace_record *itr, struct perf_tool *tool, process_auxtrace_t fn); -int auxtrace_mmap__read_snapshot(struct auxtrace_mmap *mm, +int auxtrace_mmap__read_snapshot(struct perf_mmap *map, struct auxtrace_record *itr, struct perf_tool *tool, process_auxtrace_t fn, size_t snapshot_size); @@ -517,18 +522,16 @@ int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, struct perf_tool *tool, struct perf_session *session, perf_event__handler_t process); -int perf_event__process_auxtrace_info(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session); -s64 perf_event__process_auxtrace(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session); -int perf_event__process_auxtrace_error(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session); +int perf_event__process_auxtrace_info(struct perf_session *session, + union perf_event *event); +s64 perf_event__process_auxtrace(struct perf_session *session, + union perf_event *event); +int perf_event__process_auxtrace_error(struct perf_session *session, + union perf_event *event); int itrace_parse_synth_opts(const struct option *opt, const char *str, int unset); -void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts); +void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts, + bool no_sample); size_t perf_event__fprintf_auxtrace_error(union perf_event *event, FILE *fp); void perf_session__auxtrace_error_inc(struct perf_session *session, @@ -577,6 +580,23 @@ static inline void auxtrace__free(struct perf_session *session) return session->auxtrace->free(session); } +#define ITRACE_HELP \ +" i: synthesize instructions events\n" \ +" b: synthesize branches events\n" \ +" c: synthesize branches events (calls only)\n" \ +" r: synthesize branches events (returns only)\n" \ +" x: synthesize transactions events\n" \ +" w: synthesize ptwrite events\n" \ +" p: synthesize power events\n" \ +" e: synthesize error events\n" \ +" d: create a debug log\n" \ +" g[len]: synthesize a call chain (use with i or x)\n" \ +" l[len]: synthesize last branch entries (use with i or x)\n" \ +" sNUMBER: skip initial number of events\n" \ +" PERIOD[ns|us|ms|i|t]: specify period to sample stream\n" \ +" concatenate multiple options. Default is ibxwpe or cewp\n" + + #else static inline struct auxtrace_record * @@ -717,6 +737,8 @@ void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp, struct perf_evlist *evlist, int idx, bool per_cpu); +#define ITRACE_HELP "" + #endif #endif diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 47aac41349a2..f9ae1a993806 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -1615,7 +1615,7 @@ struct perf_evsel *bpf__setup_output_event(struct perf_evlist *evlist, const cha int bpf__setup_stdout(struct perf_evlist *evlist) { struct perf_evsel *evsel = bpf__setup_output_event(evlist, "__bpf_stdout__"); - return IS_ERR(evsel) ? PTR_ERR(evsel) : 0; + return PTR_ERR_OR_ZERO(evsel); } #define ERRNO_OFFSET(e) ((e) - __BPF_LOADER_ERRNO__START) diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 2ae640257fdb..73430b73570d 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -244,6 +244,27 @@ static void cs_etm__free(struct perf_session *session) zfree(&aux); } +static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address) +{ + struct machine *machine; + + machine = etmq->etm->machine; + + if (address >= etmq->etm->kernel_start) { + if (machine__is_host(machine)) + return PERF_RECORD_MISC_KERNEL; + else + return PERF_RECORD_MISC_GUEST_KERNEL; + } else { + if (machine__is_host(machine)) + return PERF_RECORD_MISC_USER; + else if (perf_guest) + return PERF_RECORD_MISC_GUEST_USER; + else + return PERF_RECORD_MISC_HYPERVISOR; + } +} + static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u64 address, size_t size, u8 *buffer) { @@ -258,10 +279,7 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u64 address, return -1; machine = etmq->etm->machine; - if (address >= etmq->etm->kernel_start) - cpumode = PERF_RECORD_MISC_KERNEL; - else - cpumode = PERF_RECORD_MISC_USER; + cpumode = cs_etm__cpu_mode(etmq, address); thread = etmq->thread; if (!thread) { @@ -653,7 +671,7 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq, struct perf_sample sample = {.ip = 0,}; event->sample.header.type = PERF_RECORD_SAMPLE; - event->sample.header.misc = PERF_RECORD_MISC_USER; + event->sample.header.misc = cs_etm__cpu_mode(etmq, addr); event->sample.header.size = sizeof(struct perf_event_header); sample.ip = addr; @@ -665,7 +683,7 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq, sample.cpu = etmq->packet->cpu; sample.flags = 0; sample.insn_len = 1; - sample.cpumode = event->header.misc; + sample.cpumode = event->sample.header.misc; if (etm->synth_opts.last_branch) { cs_etm__copy_last_branch_rb(etmq); @@ -706,12 +724,15 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq) u64 nr; struct branch_entry entries; } dummy_bs; + u64 ip; + + ip = cs_etm__last_executed_instr(etmq->prev_packet); event->sample.header.type = PERF_RECORD_SAMPLE; - event->sample.header.misc = PERF_RECORD_MISC_USER; + event->sample.header.misc = cs_etm__cpu_mode(etmq, ip); event->sample.header.size = sizeof(struct perf_event_header); - sample.ip = cs_etm__last_executed_instr(etmq->prev_packet); + sample.ip = ip; sample.pid = etmq->pid; sample.tid = etmq->tid; sample.addr = cs_etm__first_executed_instr(etmq->packet); @@ -720,7 +741,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq) sample.period = 1; sample.cpu = etmq->packet->cpu; sample.flags = 0; - sample.cpumode = PERF_RECORD_MISC_USER; + sample.cpumode = event->sample.header.misc; /* * perf report cannot handle events without a branch stack @@ -1432,7 +1453,8 @@ int cs_etm__process_auxtrace_info(union perf_event *event, if (session->itrace_synth_opts && session->itrace_synth_opts->set) { etm->synth_opts = *session->itrace_synth_opts; } else { - itrace_synth_opts__set_default(&etm->synth_opts); + itrace_synth_opts__set_default(&etm->synth_opts, + session->itrace_synth_opts->default_no_sample); etm->synth_opts.callchain = false; } diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index abd38abf1d91..2a36fab76994 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -182,20 +182,20 @@ err_put_field: } static struct bt_ctf_field_type* -get_tracepoint_field_type(struct ctf_writer *cw, struct format_field *field) +get_tracepoint_field_type(struct ctf_writer *cw, struct tep_format_field *field) { unsigned long flags = field->flags; - if (flags & FIELD_IS_STRING) + if (flags & TEP_FIELD_IS_STRING) return cw->data.string; - if (!(flags & FIELD_IS_SIGNED)) { + if (!(flags & TEP_FIELD_IS_SIGNED)) { /* unsigned long are mostly pointers */ - if (flags & FIELD_IS_LONG || flags & FIELD_IS_POINTER) + if (flags & TEP_FIELD_IS_LONG || flags & TEP_FIELD_IS_POINTER) return cw->data.u64_hex; } - if (flags & FIELD_IS_SIGNED) { + if (flags & TEP_FIELD_IS_SIGNED) { if (field->size == 8) return cw->data.s64; else @@ -287,7 +287,7 @@ static int add_tracepoint_field_value(struct ctf_writer *cw, struct bt_ctf_event_class *event_class, struct bt_ctf_event *event, struct perf_sample *sample, - struct format_field *fmtf) + struct tep_format_field *fmtf) { struct bt_ctf_field_type *type; struct bt_ctf_field *array_field; @@ -304,10 +304,10 @@ static int add_tracepoint_field_value(struct ctf_writer *cw, name = fmtf->alias; offset = fmtf->offset; len = fmtf->size; - if (flags & FIELD_IS_STRING) - flags &= ~FIELD_IS_ARRAY; + if (flags & TEP_FIELD_IS_STRING) + flags &= ~TEP_FIELD_IS_ARRAY; - if (flags & FIELD_IS_DYNAMIC) { + if (flags & TEP_FIELD_IS_DYNAMIC) { unsigned long long tmp_val; tmp_val = tep_read_number(fmtf->event->pevent, @@ -317,7 +317,7 @@ static int add_tracepoint_field_value(struct ctf_writer *cw, offset &= 0xffff; } - if (flags & FIELD_IS_ARRAY) { + if (flags & TEP_FIELD_IS_ARRAY) { type = bt_ctf_event_class_get_field_by_name( event_class, name); @@ -338,7 +338,7 @@ static int add_tracepoint_field_value(struct ctf_writer *cw, type = get_tracepoint_field_type(cw, fmtf); for (i = 0; i < n_items; i++) { - if (flags & FIELD_IS_ARRAY) + if (flags & TEP_FIELD_IS_ARRAY) field = bt_ctf_field_array_get_field(array_field, i); else field = bt_ctf_field_create(type); @@ -348,7 +348,7 @@ static int add_tracepoint_field_value(struct ctf_writer *cw, return -1; } - if (flags & FIELD_IS_STRING) + if (flags & TEP_FIELD_IS_STRING) ret = string_set_value(field, data + offset + i * len); else { unsigned long long value_int; @@ -357,7 +357,7 @@ static int add_tracepoint_field_value(struct ctf_writer *cw, fmtf->event->pevent, data + offset + i * len, len); - if (!(flags & FIELD_IS_SIGNED)) + if (!(flags & TEP_FIELD_IS_SIGNED)) ret = bt_ctf_field_unsigned_integer_set_value( field, value_int); else @@ -369,7 +369,7 @@ static int add_tracepoint_field_value(struct ctf_writer *cw, pr_err("failed to set file value %s\n", name); goto err_put_field; } - if (!(flags & FIELD_IS_ARRAY)) { + if (!(flags & TEP_FIELD_IS_ARRAY)) { ret = bt_ctf_event_set_payload(event, name, field); if (ret) { pr_err("failed to set payload %s\n", name); @@ -378,7 +378,7 @@ static int add_tracepoint_field_value(struct ctf_writer *cw, } bt_ctf_field_put(field); } - if (flags & FIELD_IS_ARRAY) { + if (flags & TEP_FIELD_IS_ARRAY) { ret = bt_ctf_event_set_payload(event, name, array_field); if (ret) { pr_err("Failed add payload array %s\n", name); @@ -396,10 +396,10 @@ err_put_field: static int add_tracepoint_fields_values(struct ctf_writer *cw, struct bt_ctf_event_class *event_class, struct bt_ctf_event *event, - struct format_field *fields, + struct tep_format_field *fields, struct perf_sample *sample) { - struct format_field *field; + struct tep_format_field *field; int ret; for (field = fields; field; field = field->next) { @@ -417,8 +417,8 @@ static int add_tracepoint_values(struct ctf_writer *cw, struct perf_evsel *evsel, struct perf_sample *sample) { - struct format_field *common_fields = evsel->tp_format->format.common_fields; - struct format_field *fields = evsel->tp_format->format.fields; + struct tep_format_field *common_fields = evsel->tp_format->format.common_fields; + struct tep_format_field *fields = evsel->tp_format->format.fields; int ret; ret = add_tracepoint_fields_values(cw, event_class, event, @@ -970,7 +970,7 @@ out: static int event_class_add_field(struct bt_ctf_event_class *event_class, struct bt_ctf_field_type *type, - struct format_field *field) + struct tep_format_field *field) { struct bt_ctf_field_type *t = NULL; char *name; @@ -1009,10 +1009,10 @@ static int event_class_add_field(struct bt_ctf_event_class *event_class, } static int add_tracepoint_fields_types(struct ctf_writer *cw, - struct format_field *fields, + struct tep_format_field *fields, struct bt_ctf_event_class *event_class) { - struct format_field *field; + struct tep_format_field *field; int ret; for (field = fields; field; field = field->next) { @@ -1030,15 +1030,15 @@ static int add_tracepoint_fields_types(struct ctf_writer *cw, * type and don't care that it is an array. What we don't * support is an array of strings. */ - if (flags & FIELD_IS_STRING) - flags &= ~FIELD_IS_ARRAY; + if (flags & TEP_FIELD_IS_STRING) + flags &= ~TEP_FIELD_IS_ARRAY; - if (flags & FIELD_IS_ARRAY) + if (flags & TEP_FIELD_IS_ARRAY) type = bt_ctf_field_type_array_create(type, field->arraylen); ret = event_class_add_field(event_class, type, field); - if (flags & FIELD_IS_ARRAY) + if (flags & TEP_FIELD_IS_ARRAY) bt_ctf_field_type_put(type); if (ret) { @@ -1055,8 +1055,8 @@ static int add_tracepoint_types(struct ctf_writer *cw, struct perf_evsel *evsel, struct bt_ctf_event_class *class) { - struct format_field *common_fields = evsel->tp_format->format.common_fields; - struct format_field *fields = evsel->tp_format->format.fields; + struct tep_format_field *common_fields = evsel->tp_format->format.common_fields; + struct tep_format_field *fields = evsel->tp_format->format.fields; int ret; ret = add_tracepoint_fields_types(cw, common_fields, class); @@ -1578,7 +1578,7 @@ int bt_convert__perf2ctf(const char *input, const char *path, { struct perf_session *session; struct perf_data data = { - .file.path = input, + .file = { .path = input, .fd = -1 }, .mode = PERF_DATA_MODE_READ, .force = opts->force, }; diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 7123746edcf4..69fbb0a72d0c 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -463,6 +463,28 @@ int db_export__branch_types(struct db_export *dbe) if (err) break; } + + /* Add trace begin / end variants */ + for (i = 0; branch_types[i].name ; i++) { + const char *name = branch_types[i].name; + u32 type = branch_types[i].branch_type; + char buf[64]; + + if (type == PERF_IP_FLAG_BRANCH || + (type & (PERF_IP_FLAG_TRACE_BEGIN | PERF_IP_FLAG_TRACE_END))) + continue; + + snprintf(buf, sizeof(buf), "trace begin / %s", name); + err = db_export__branch_type(dbe, type | PERF_IP_FLAG_TRACE_BEGIN, buf); + if (err) + break; + + snprintf(buf, sizeof(buf), "%s / trace end", name); + err = db_export__branch_type(dbe, type | PERF_IP_FLAG_TRACE_END, buf); + if (err) + break; + } + return err; } diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 1f3ccc368530..d01b8355f4ca 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -63,6 +63,7 @@ struct perf_env { struct numa_node *numa_nodes; struct memory_node *memory_nodes; unsigned long long memory_bsize; + u64 clockid_res_ns; }; extern struct perf_env perf_env; diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 0cd42150f712..e9c108a6b1c3 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -308,6 +308,7 @@ static int perf_event__synthesize_fork(struct perf_tool *tool, event->fork.pid = tgid; event->fork.tid = pid; event->fork.header.type = PERF_RECORD_FORK; + event->fork.header.misc = PERF_RECORD_MISC_FORK_EXEC; event->fork.header.size = (sizeof(event->fork) + machine->id_hdr_size); @@ -1081,6 +1082,7 @@ void *cpu_map_data__alloc(struct cpu_map *map, size_t *size, u16 *type, int *max } *size += sizeof(struct cpu_map_data); + *size = PERF_ALIGN(*size, sizeof(u64)); return zalloc(*size); } @@ -1560,26 +1562,9 @@ struct map *thread__find_map(struct thread *thread, u8 cpumode, u64 addr, return NULL; } -try_again: + al->map = map_groups__find(mg, al->addr); - if (al->map == NULL) { - /* - * If this is outside of all known maps, and is a negative - * address, try to look it up in the kernel dso, as it might be - * a vsyscall or vdso (which executes in user-mode). - * - * XXX This is nasty, we should have a symbol list in the - * "[vdso]" dso, but for now lets use the old trick of looking - * in the whole kernel symbol list. - */ - if (cpumode == PERF_RECORD_MISC_USER && machine && - mg != &machine->kmaps && - machine__kernel_ip(machine, al->addr)) { - mg = &machine->kmaps; - load_map = true; - goto try_again; - } - } else { + if (al->map != NULL) { /* * Kernel maps might be changed when loading symbols so loading * must be done prior to using kernel maps. diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index be440df29615..668d2a9ef0f4 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -358,7 +358,7 @@ void perf_evlist__disable(struct perf_evlist *evlist) struct perf_evsel *pos; evlist__for_each_entry(evlist, pos) { - if (!perf_evsel__is_group_leader(pos) || !pos->fd) + if (pos->disabled || !perf_evsel__is_group_leader(pos) || !pos->fd) continue; perf_evsel__disable(pos); } @@ -1810,3 +1810,30 @@ void perf_evlist__force_leader(struct perf_evlist *evlist) leader->forced_leader = true; } } + +struct perf_evsel *perf_evlist__reset_weak_group(struct perf_evlist *evsel_list, + struct perf_evsel *evsel) +{ + struct perf_evsel *c2, *leader; + bool is_open = true; + + leader = evsel->leader; + pr_debug("Weak group for %s/%d failed\n", + leader->name, leader->nr_members); + + /* + * for_each_group_member doesn't work here because it doesn't + * include the first entry. + */ + evlist__for_each_entry(evsel_list, c2) { + if (c2 == evsel) + is_open = false; + if (c2->leader == leader) { + if (is_open) + perf_evsel__close(c2); + c2->leader = c2; + c2->nr_members = 0; + } + } + return leader; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index dc66436add98..9919eed6d15b 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -312,4 +312,7 @@ bool perf_evlist__exclude_kernel(struct perf_evlist *evlist); void perf_evlist__force_leader(struct perf_evlist *evlist); +struct perf_evsel *perf_evlist__reset_weak_group(struct perf_evlist *evlist, + struct perf_evsel *evsel); + #endif /* __PERF_EVLIST_H */ diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 1a61628a1c12..d37bb1566cd9 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -232,6 +232,7 @@ void perf_evsel__init(struct perf_evsel *evsel, evsel->leader = evsel; evsel->unit = ""; evsel->scale = 1.0; + evsel->max_events = ULONG_MAX; evsel->evlist = NULL; evsel->bpf_fd = -1; INIT_LIST_HEAD(&evsel->node); @@ -793,6 +794,9 @@ static void apply_config_terms(struct perf_evsel *evsel, case PERF_EVSEL__CONFIG_TERM_MAX_STACK: max_stack = term->val.max_stack; break; + case PERF_EVSEL__CONFIG_TERM_MAX_EVENTS: + evsel->max_events = term->val.max_events; + break; case PERF_EVSEL__CONFIG_TERM_INHERIT: /* * attr->inherit should has already been set by @@ -952,7 +956,6 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts, attr->sample_freq = 0; attr->sample_period = 0; attr->write_backward = 0; - attr->sample_id_all = 0; } if (opts->no_samples) @@ -1089,6 +1092,9 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts, attr->exclude_user = 1; } + if (evsel->own_cpus) + evsel->attr.read_format |= PERF_FORMAT_ID; + /* * Apply event specific term settings, * it overloads any global configuration. @@ -1200,16 +1206,27 @@ int perf_evsel__append_addr_filter(struct perf_evsel *evsel, const char *filter) int perf_evsel__enable(struct perf_evsel *evsel) { - return perf_evsel__run_ioctl(evsel, - PERF_EVENT_IOC_ENABLE, - 0); + int err = perf_evsel__run_ioctl(evsel, PERF_EVENT_IOC_ENABLE, 0); + + if (!err) + evsel->disabled = false; + + return err; } int perf_evsel__disable(struct perf_evsel *evsel) { - return perf_evsel__run_ioctl(evsel, - PERF_EVENT_IOC_DISABLE, - 0); + int err = perf_evsel__run_ioctl(evsel, PERF_EVENT_IOC_DISABLE, 0); + /* + * We mark it disabled here so that tools that disable a event can + * ignore events after they disable it. I.e. the ring buffer may have + * already a few more events queued up before the kernel got the stop + * request. + */ + if (!err) + evsel->disabled = true; + + return err; } int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads) @@ -2682,7 +2699,7 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, return 0; } -struct format_field *perf_evsel__field(struct perf_evsel *evsel, const char *name) +struct tep_format_field *perf_evsel__field(struct perf_evsel *evsel, const char *name) { return tep_find_field(evsel->tp_format, name); } @@ -2690,7 +2707,7 @@ struct format_field *perf_evsel__field(struct perf_evsel *evsel, const char *nam void *perf_evsel__rawptr(struct perf_evsel *evsel, struct perf_sample *sample, const char *name) { - struct format_field *field = perf_evsel__field(evsel, name); + struct tep_format_field *field = perf_evsel__field(evsel, name); int offset; if (!field) @@ -2698,7 +2715,7 @@ void *perf_evsel__rawptr(struct perf_evsel *evsel, struct perf_sample *sample, offset = field->offset; - if (field->flags & FIELD_IS_DYNAMIC) { + if (field->flags & TEP_FIELD_IS_DYNAMIC) { offset = *(int *)(sample->raw_data + field->offset); offset &= 0xffff; } @@ -2706,7 +2723,7 @@ void *perf_evsel__rawptr(struct perf_evsel *evsel, struct perf_sample *sample, return sample->raw_data + offset; } -u64 format_field__intval(struct format_field *field, struct perf_sample *sample, +u64 format_field__intval(struct tep_format_field *field, struct perf_sample *sample, bool needs_swap) { u64 value; @@ -2748,7 +2765,7 @@ u64 format_field__intval(struct format_field *field, struct perf_sample *sample, u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample, const char *name) { - struct format_field *field = perf_evsel__field(evsel, name); + struct tep_format_field *field = perf_evsel__field(evsel, name); if (!field) return 0; @@ -2940,3 +2957,32 @@ struct perf_env *perf_evsel__env(struct perf_evsel *evsel) return evsel->evlist->env; return NULL; } + +static int store_evsel_ids(struct perf_evsel *evsel, struct perf_evlist *evlist) +{ + int cpu, thread; + + for (cpu = 0; cpu < xyarray__max_x(evsel->fd); cpu++) { + for (thread = 0; thread < xyarray__max_y(evsel->fd); + thread++) { + int fd = FD(evsel, cpu, thread); + + if (perf_evlist__id_add_fd(evlist, evsel, + cpu, thread, fd) < 0) + return -1; + } + } + + return 0; +} + +int perf_evsel__store_ids(struct perf_evsel *evsel, struct perf_evlist *evlist) +{ + struct cpu_map *cpus = evsel->cpus; + struct thread_map *threads = evsel->threads; + + if (perf_evsel__alloc_id(evsel, cpus->nr, threads->nr)) + return -ENOMEM; + + return store_evsel_ids(evsel, evlist); +} diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 163c960614d3..3147ca76c6fc 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -46,6 +46,7 @@ enum term_type { PERF_EVSEL__CONFIG_TERM_STACK_USER, PERF_EVSEL__CONFIG_TERM_INHERIT, PERF_EVSEL__CONFIG_TERM_MAX_STACK, + PERF_EVSEL__CONFIG_TERM_MAX_EVENTS, PERF_EVSEL__CONFIG_TERM_OVERWRITE, PERF_EVSEL__CONFIG_TERM_DRV_CFG, PERF_EVSEL__CONFIG_TERM_BRANCH, @@ -65,6 +66,7 @@ struct perf_evsel_config_term { bool inherit; bool overwrite; char *branch; + unsigned long max_events; } val; bool weak; }; @@ -99,10 +101,12 @@ struct perf_evsel { struct perf_counts *prev_raw_counts; int idx; u32 ids; + unsigned long max_events; + unsigned long nr_events_printed; char *name; double scale; const char *unit; - struct event_format *tp_format; + struct tep_event_format *tp_format; off_t id_offset; struct perf_stat_evsel *stats; void *priv; @@ -119,6 +123,7 @@ struct perf_evsel { bool snapshot; bool supported; bool needs_swap; + bool disabled; bool no_aux_samples; bool immediate; bool system_wide; @@ -211,7 +216,7 @@ static inline struct perf_evsel *perf_evsel__newtp(const char *sys, const char * struct perf_evsel *perf_evsel__new_cycles(bool precise); -struct event_format *event_format__new(const char *sys, const char *name); +struct tep_event_format *event_format__new(const char *sys, const char *name); void perf_evsel__init(struct perf_evsel *evsel, struct perf_event_attr *attr, int idx); @@ -296,11 +301,11 @@ static inline char *perf_evsel__strval(struct perf_evsel *evsel, return perf_evsel__rawptr(evsel, sample, name); } -struct format_field; +struct tep_format_field; -u64 format_field__intval(struct format_field *field, struct perf_sample *sample, bool needs_swap); +u64 format_field__intval(struct tep_format_field *field, struct perf_sample *sample, bool needs_swap); -struct format_field *perf_evsel__field(struct perf_evsel *evsel, const char *name); +struct tep_format_field *perf_evsel__field(struct perf_evsel *evsel, const char *name); #define perf_evsel__match(evsel, t, c) \ (evsel->attr.type == PERF_TYPE_##t && \ @@ -481,4 +486,5 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, struct perf_env *perf_evsel__env(struct perf_evsel *evsel); +int perf_evsel__store_ids(struct perf_evsel *evsel, struct perf_evlist *evlist); #endif /* __PERF_EVSEL_H */ diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c index 06dfb027879d..0d0a4c6f368b 100644 --- a/tools/perf/util/evsel_fprintf.c +++ b/tools/perf/util/evsel_fprintf.c @@ -73,7 +73,7 @@ int perf_evsel__fprintf(struct perf_evsel *evsel, } if (details->trace_fields) { - struct format_field *field; + struct tep_format_field *field; if (evsel->attr.type != PERF_TYPE_TRACEPOINT) { printed += comma_fprintf(fp, &first, " (not a tracepoint)"); diff --git a/tools/perf/util/genelf.h b/tools/perf/util/genelf.h index de322d51c7fe..b72440bf9a79 100644 --- a/tools/perf/util/genelf.h +++ b/tools/perf/util/genelf.h @@ -29,6 +29,12 @@ int jit_add_debug_info(Elf *e, uint64_t code_addr, void *debug, int nr_debug_ent #elif defined(__powerpc__) #define GEN_ELF_ARCH EM_PPC #define GEN_ELF_CLASS ELFCLASS32 +#elif defined(__sparc__) && defined(__arch64__) +#define GEN_ELF_ARCH EM_SPARCV9 +#define GEN_ELF_CLASS ELFCLASS64 +#elif defined(__sparc__) +#define GEN_ELF_ARCH EM_SPARC +#define GEN_ELF_CLASS ELFCLASS32 #else #error "unsupported architecture" #endif diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 3cadc252dd89..4fd45be95a43 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1034,6 +1034,13 @@ static int write_auxtrace(struct feat_fd *ff, return err; } +static int write_clockid(struct feat_fd *ff, + struct perf_evlist *evlist __maybe_unused) +{ + return do_write(ff, &ff->ph->env.clockid_res_ns, + sizeof(ff->ph->env.clockid_res_ns)); +} + static int cpu_cache_level__sort(const void *a, const void *b) { struct cpu_cache_level *cache_a = (struct cpu_cache_level *)a; @@ -1508,6 +1515,12 @@ static void print_cpu_topology(struct feat_fd *ff, FILE *fp) fprintf(fp, "# Core ID and Socket ID information is not available\n"); } +static void print_clockid(struct feat_fd *ff, FILE *fp) +{ + fprintf(fp, "# clockid frequency: %"PRIu64" MHz\n", + ff->ph->env.clockid_res_ns * 1000); +} + static void free_event_desc(struct perf_evsel *events) { struct perf_evsel *evsel; @@ -2531,6 +2544,15 @@ out: return ret; } +static int process_clockid(struct feat_fd *ff, + void *data __maybe_unused) +{ + if (do_read_u64(ff, &ff->ph->env.clockid_res_ns)) + return -1; + + return 0; +} + struct feature_ops { int (*write)(struct feat_fd *ff, struct perf_evlist *evlist); void (*print)(struct feat_fd *ff, FILE *fp); @@ -2590,6 +2612,7 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPN(CACHE, cache, true), FEAT_OPR(SAMPLE_TIME, sample_time, false), FEAT_OPR(MEM_TOPOLOGY, mem_topology, true), + FEAT_OPR(CLOCKID, clockid, false) }; struct header_print_data { @@ -3206,7 +3229,7 @@ static int read_attr(int fd, struct perf_header *ph, static int perf_evsel__prepare_tracepoint_event(struct perf_evsel *evsel, struct tep_handle *pevent) { - struct event_format *event; + struct tep_event_format *event; char bf[128]; /* already prepared */ @@ -3448,10 +3471,10 @@ int perf_event__synthesize_features(struct perf_tool *tool, return ret; } -int perf_event__process_feature(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session __maybe_unused) +int perf_event__process_feature(struct perf_session *session, + union perf_event *event) { + struct perf_tool *tool = session->tool; struct feat_fd ff = { .fd = 0 }; struct feature_event *fe = (struct feature_event *)event; int type = fe->header.type; @@ -3637,13 +3660,13 @@ size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp) } int perf_event__synthesize_attrs(struct perf_tool *tool, - struct perf_session *session, - perf_event__handler_t process) + struct perf_evlist *evlist, + perf_event__handler_t process) { struct perf_evsel *evsel; int err = 0; - evlist__for_each_entry(session->evlist, evsel) { + evlist__for_each_entry(evlist, evsel) { err = perf_event__synthesize_attr(tool, &evsel->attr, evsel->ids, evsel->id, process); if (err) { @@ -3856,9 +3879,8 @@ int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, return aligned_size; } -int perf_event__process_tracing_data(struct perf_tool *tool __maybe_unused, - union perf_event *event, - struct perf_session *session) +int perf_event__process_tracing_data(struct perf_session *session, + union perf_event *event) { ssize_t size_read, padding, size = event->tracing_data.size; int fd = perf_data__fd(session->data); @@ -3924,9 +3946,8 @@ int perf_event__synthesize_build_id(struct perf_tool *tool, return err; } -int perf_event__process_build_id(struct perf_tool *tool __maybe_unused, - union perf_event *event, - struct perf_session *session) +int perf_event__process_build_id(struct perf_session *session, + union perf_event *event) { __event_process_build_id(&event->build_id, event->build_id.filename, diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 6d7fe44aadc0..0d553ddca0a3 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -38,6 +38,7 @@ enum { HEADER_CACHE, HEADER_SAMPLE_TIME, HEADER_MEM_TOPOLOGY, + HEADER_CLOCKID, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; @@ -116,15 +117,14 @@ int perf_event__synthesize_extra_attr(struct perf_tool *tool, perf_event__handler_t process, bool is_pipe); -int perf_event__process_feature(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session); +int perf_event__process_feature(struct perf_session *session, + union perf_event *event); int perf_event__synthesize_attr(struct perf_tool *tool, struct perf_event_attr *attr, u32 ids, u64 *id, perf_event__handler_t process); int perf_event__synthesize_attrs(struct perf_tool *tool, - struct perf_session *session, + struct perf_evlist *evlist, perf_event__handler_t process); int perf_event__synthesize_event_update_unit(struct perf_tool *tool, struct perf_evsel *evsel, @@ -148,17 +148,15 @@ size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp); int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct perf_evlist *evlist, perf_event__handler_t process); -int perf_event__process_tracing_data(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session); +int perf_event__process_tracing_data(struct perf_session *session, + union perf_event *event); int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16 misc, perf_event__handler_t process, struct machine *machine); -int perf_event__process_build_id(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session); +int perf_event__process_build_id(struct perf_session *session, + union perf_event *event); bool is_perf_magic(u64 magic); #define NAME_ALIGN 64 diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index 7f0c83b6332b..7b27d77306c2 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -269,6 +269,13 @@ static int intel_bts_do_fix_overlap(struct auxtrace_queue *queue, return 0; } +static inline u8 intel_bts_cpumode(struct intel_bts *bts, uint64_t ip) +{ + return machine__kernel_ip(bts->machine, ip) ? + PERF_RECORD_MISC_KERNEL : + PERF_RECORD_MISC_USER; +} + static int intel_bts_synth_branch_sample(struct intel_bts_queue *btsq, struct branch *branch) { @@ -281,12 +288,8 @@ static int intel_bts_synth_branch_sample(struct intel_bts_queue *btsq, bts->num_events++ <= bts->synth_opts.initial_skip) return 0; - event.sample.header.type = PERF_RECORD_SAMPLE; - event.sample.header.misc = PERF_RECORD_MISC_USER; - event.sample.header.size = sizeof(struct perf_event_header); - - sample.cpumode = PERF_RECORD_MISC_USER; sample.ip = le64_to_cpu(branch->from); + sample.cpumode = intel_bts_cpumode(bts, sample.ip); sample.pid = btsq->pid; sample.tid = btsq->tid; sample.addr = le64_to_cpu(branch->to); @@ -298,6 +301,10 @@ static int intel_bts_synth_branch_sample(struct intel_bts_queue *btsq, sample.insn_len = btsq->intel_pt_insn.length; memcpy(sample.insn, btsq->intel_pt_insn.buf, INTEL_PT_INSN_BUF_SZ); + event.sample.header.type = PERF_RECORD_SAMPLE; + event.sample.header.misc = sample.cpumode; + event.sample.header.size = sizeof(struct perf_event_header); + if (bts->synth_opts.inject) { event.sample.header.size = bts->branches_event_size; ret = perf_event__synthesize_sample(&event, @@ -910,7 +917,8 @@ int intel_bts_process_auxtrace_info(union perf_event *event, if (session->itrace_synth_opts && session->itrace_synth_opts->set) { bts->synth_opts = *session->itrace_synth_opts; } else { - itrace_synth_opts__set_default(&bts->synth_opts); + itrace_synth_opts__set_default(&bts->synth_opts, + session->itrace_synth_opts->default_no_sample); if (session->itrace_synth_opts) bts->synth_opts.thread_stack = session->itrace_synth_opts->thread_stack; diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index d404bed7003a..4503f3ca45ab 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -1165,7 +1165,7 @@ static int intel_pt_walk_tip(struct intel_pt_decoder *decoder) decoder->pge = false; decoder->continuous_period = false; decoder->pkt_state = INTEL_PT_STATE_IN_SYNC; - decoder->state.to_ip = 0; + decoder->state.type |= INTEL_PT_TRACE_END; return 0; } if (err == INTEL_PT_RETURN) @@ -1179,9 +1179,13 @@ static int intel_pt_walk_tip(struct intel_pt_decoder *decoder) decoder->continuous_period = false; decoder->pkt_state = INTEL_PT_STATE_IN_SYNC; decoder->state.from_ip = decoder->ip; - decoder->state.to_ip = 0; - if (decoder->packet.count != 0) + if (decoder->packet.count == 0) { + decoder->state.to_ip = 0; + } else { + decoder->state.to_ip = decoder->last_ip; decoder->ip = decoder->last_ip; + } + decoder->state.type |= INTEL_PT_TRACE_END; } else { decoder->pkt_state = INTEL_PT_STATE_IN_SYNC; decoder->state.from_ip = decoder->ip; @@ -1208,7 +1212,8 @@ static int intel_pt_walk_tip(struct intel_pt_decoder *decoder) decoder->pkt_state = INTEL_PT_STATE_IN_SYNC; decoder->ip = to_ip; decoder->state.from_ip = decoder->ip; - decoder->state.to_ip = 0; + decoder->state.to_ip = to_ip; + decoder->state.type |= INTEL_PT_TRACE_END; return 0; } intel_pt_log_at("ERROR: Conditional branch when expecting indirect branch", @@ -1469,6 +1474,8 @@ static void intel_pt_calc_mtc_timestamp(struct intel_pt_decoder *decoder) decoder->have_calc_cyc_to_tsc = false; intel_pt_calc_cyc_to_tsc(decoder, true); } + + intel_pt_log_to("Setting timestamp", decoder->timestamp); } static void intel_pt_calc_cbr(struct intel_pt_decoder *decoder) @@ -1509,6 +1516,8 @@ static void intel_pt_calc_cyc_timestamp(struct intel_pt_decoder *decoder) decoder->timestamp = timestamp; decoder->timestamp_insn_cnt = 0; + + intel_pt_log_to("Setting timestamp", decoder->timestamp); } /* Walk PSB+ packets when already in sync. */ @@ -1640,14 +1649,15 @@ static int intel_pt_walk_fup_tip(struct intel_pt_decoder *decoder) case INTEL_PT_TIP_PGD: decoder->state.from_ip = decoder->ip; - decoder->state.to_ip = 0; - if (decoder->packet.count != 0) { + if (decoder->packet.count == 0) { + decoder->state.to_ip = 0; + } else { intel_pt_set_ip(decoder); - intel_pt_log("Omitting PGD ip " x64_fmt "\n", - decoder->ip); + decoder->state.to_ip = decoder->ip; } decoder->pge = false; decoder->continuous_period = false; + decoder->state.type |= INTEL_PT_TRACE_END; return 0; case INTEL_PT_TIP_PGE: @@ -1661,6 +1671,7 @@ static int intel_pt_walk_fup_tip(struct intel_pt_decoder *decoder) intel_pt_set_ip(decoder); decoder->state.to_ip = decoder->ip; } + decoder->state.type |= INTEL_PT_TRACE_BEGIN; return 0; case INTEL_PT_TIP: @@ -1739,6 +1750,7 @@ next: intel_pt_set_ip(decoder); decoder->state.from_ip = 0; decoder->state.to_ip = decoder->ip; + decoder->state.type |= INTEL_PT_TRACE_BEGIN; return 0; } @@ -2077,9 +2089,13 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder) decoder->pge = decoder->packet.type != INTEL_PT_TIP_PGD; if (intel_pt_have_ip(decoder)) intel_pt_set_ip(decoder); - if (decoder->ip) - return 0; - break; + if (!decoder->ip) + break; + if (decoder->packet.type == INTEL_PT_TIP_PGE) + decoder->state.type |= INTEL_PT_TRACE_BEGIN; + if (decoder->packet.type == INTEL_PT_TIP_PGD) + decoder->state.type |= INTEL_PT_TRACE_END; + return 0; case INTEL_PT_FUP: if (intel_pt_have_ip(decoder)) diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h index 51c18d67f4ca..ed088d4726ba 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h @@ -37,6 +37,8 @@ enum intel_pt_sample_type { INTEL_PT_EX_STOP = 1 << 6, INTEL_PT_PWR_EXIT = 1 << 7, INTEL_PT_CBR_CHG = 1 << 8, + INTEL_PT_TRACE_BEGIN = 1 << 9, + INTEL_PT_TRACE_END = 1 << 10, }; enum intel_pt_period_type { diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-log.c b/tools/perf/util/intel-pt-decoder/intel-pt-log.c index e02bc7b166a0..5e64da270f97 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-log.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-log.c @@ -31,6 +31,11 @@ static FILE *f; static char log_name[MAX_LOG_NAME]; bool intel_pt_enable_logging; +void *intel_pt_log_fp(void) +{ + return f; +} + void intel_pt_log_enable(void) { intel_pt_enable_logging = true; diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-log.h b/tools/perf/util/intel-pt-decoder/intel-pt-log.h index 45b64f93f358..cc084937f701 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-log.h +++ b/tools/perf/util/intel-pt-decoder/intel-pt-log.h @@ -22,6 +22,7 @@ struct intel_pt_pkt; +void *intel_pt_log_fp(void); void intel_pt_log_enable(void); void intel_pt_log_disable(void); void intel_pt_log_set_name(const char *name); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index aec68908d604..149ff361ca78 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -206,6 +206,16 @@ static void intel_pt_dump_event(struct intel_pt *pt, unsigned char *buf, intel_pt_dump(pt, buf, len); } +static void intel_pt_log_event(union perf_event *event) +{ + FILE *f = intel_pt_log_fp(); + + if (!intel_pt_enable_logging || !f) + return; + + perf_event__fprintf(event, f); +} + static int intel_pt_do_fix_overlap(struct intel_pt *pt, struct auxtrace_buffer *a, struct auxtrace_buffer *b) { @@ -407,6 +417,13 @@ intel_pt_cache_lookup(struct dso *dso, struct machine *machine, u64 offset) return auxtrace_cache__lookup(dso->auxtrace_cache, offset); } +static inline u8 intel_pt_cpumode(struct intel_pt *pt, uint64_t ip) +{ + return ip >= pt->kernel_start ? + PERF_RECORD_MISC_KERNEL : + PERF_RECORD_MISC_USER; +} + static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, uint64_t *insn_cnt_ptr, uint64_t *ip, uint64_t to_ip, uint64_t max_insn_cnt, @@ -429,10 +446,7 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, if (to_ip && *ip == to_ip) goto out_no_cache; - if (*ip >= ptq->pt->kernel_start) - cpumode = PERF_RECORD_MISC_KERNEL; - else - cpumode = PERF_RECORD_MISC_USER; + cpumode = intel_pt_cpumode(ptq->pt, *ip); thread = ptq->thread; if (!thread) { @@ -759,7 +773,8 @@ static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt, if (pt->synth_opts.callchain) { size_t sz = sizeof(struct ip_callchain); - sz += pt->synth_opts.callchain_sz * sizeof(u64); + /* Add 1 to callchain_sz for callchain context */ + sz += (pt->synth_opts.callchain_sz + 1) * sizeof(u64); ptq->chain = zalloc(sz); if (!ptq->chain) goto out_free; @@ -908,6 +923,11 @@ static void intel_pt_sample_flags(struct intel_pt_queue *ptq) ptq->insn_len = ptq->state->insn_len; memcpy(ptq->insn, ptq->state->insn, INTEL_PT_INSN_BUF_SZ); } + + if (ptq->state->type & INTEL_PT_TRACE_BEGIN) + ptq->flags |= PERF_IP_FLAG_TRACE_BEGIN; + if (ptq->state->type & INTEL_PT_TRACE_END) + ptq->flags |= PERF_IP_FLAG_TRACE_END; } static int intel_pt_setup_queue(struct intel_pt *pt, @@ -1053,15 +1073,11 @@ static void intel_pt_prep_b_sample(struct intel_pt *pt, union perf_event *event, struct perf_sample *sample) { - event->sample.header.type = PERF_RECORD_SAMPLE; - event->sample.header.misc = PERF_RECORD_MISC_USER; - event->sample.header.size = sizeof(struct perf_event_header); - if (!pt->timeless_decoding) sample->time = tsc_to_perf_time(ptq->timestamp, &pt->tc); - sample->cpumode = PERF_RECORD_MISC_USER; sample->ip = ptq->state->from_ip; + sample->cpumode = intel_pt_cpumode(pt, sample->ip); sample->pid = ptq->pid; sample->tid = ptq->tid; sample->addr = ptq->state->to_ip; @@ -1070,6 +1086,10 @@ static void intel_pt_prep_b_sample(struct intel_pt *pt, sample->flags = ptq->flags; sample->insn_len = ptq->insn_len; memcpy(sample->insn, ptq->insn, INTEL_PT_INSN_BUF_SZ); + + event->sample.header.type = PERF_RECORD_SAMPLE; + event->sample.header.misc = sample->cpumode; + event->sample.header.size = sizeof(struct perf_event_header); } static int intel_pt_inject_event(union perf_event *event, @@ -1155,7 +1175,8 @@ static void intel_pt_prep_sample(struct intel_pt *pt, if (pt->synth_opts.callchain) { thread_stack__sample(ptq->thread, ptq->chain, - pt->synth_opts.callchain_sz, sample->ip); + pt->synth_opts.callchain_sz + 1, + sample->ip, pt->kernel_start); sample->callchain = ptq->chain; } @@ -1999,9 +2020,9 @@ static int intel_pt_process_event(struct perf_session *session, event->header.type == PERF_RECORD_SWITCH_CPU_WIDE) err = intel_pt_context_switch(pt, event, sample); - intel_pt_log("event %s (%u): cpu %d time %"PRIu64" tsc %#"PRIx64"\n", - perf_event__name(event->header.type), event->header.type, - sample->cpu, sample->time, timestamp); + intel_pt_log("event %u: cpu %d time %"PRIu64" tsc %#"PRIx64" ", + event->header.type, sample->cpu, sample->time, timestamp); + intel_pt_log_event(event); return err; } @@ -2554,7 +2575,8 @@ int intel_pt_process_auxtrace_info(union perf_event *event, if (session->itrace_synth_opts && session->itrace_synth_opts->set) { pt->synth_opts = *session->itrace_synth_opts; } else { - itrace_synth_opts__set_default(&pt->synth_opts); + itrace_synth_opts__set_default(&pt->synth_opts, + session->itrace_synth_opts->default_no_sample); if (use_browser != -1) { pt->synth_opts.branches = false; pt->synth_opts.callchain = true; diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c index 19262f98cd4e..5b0b60f00275 100644 --- a/tools/perf/util/llvm-utils.c +++ b/tools/perf/util/llvm-utils.c @@ -19,7 +19,7 @@ #define CLANG_BPF_CMD_DEFAULT_TEMPLATE \ "$CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS "\ "-DLINUX_VERSION_CODE=$LINUX_VERSION_CODE " \ - "$CLANG_OPTIONS $KERNEL_INC_OPTIONS $PERF_BPF_INC_OPTIONS " \ + "$CLANG_OPTIONS $PERF_BPF_INC_OPTIONS $KERNEL_INC_OPTIONS " \ "-Wno-unused-value -Wno-pointer-sign " \ "-working-directory $WORKING_DIR " \ "-c \"$CLANG_SOURCE\" -target bpf $CLANG_EMIT_LLVM -O2 -o - $LLVM_OPTIONS_PIPE" diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index c4acd2001db0..8f36ce813bc5 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1708,6 +1708,7 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event struct thread *parent = machine__findnew_thread(machine, event->fork.ppid, event->fork.ptid); + bool do_maps_clone = true; int err = 0; if (dump_trace) @@ -1736,9 +1737,25 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event thread = machine__findnew_thread(machine, event->fork.pid, event->fork.tid); + /* + * When synthesizing FORK events, we are trying to create thread + * objects for the already running tasks on the machine. + * + * Normally, for a kernel FORK event, we want to clone the parent's + * maps because that is what the kernel just did. + * + * But when synthesizing, this should not be done. If we do, we end up + * with overlapping maps as we process the sythesized MMAP2 events that + * get delivered shortly thereafter. + * + * Use the FORK event misc flags in an internal way to signal this + * situation, so we can elide the map clone when appropriate. + */ + if (event->fork.header.misc & PERF_RECORD_MISC_FORK_EXEC) + do_maps_clone = false; if (thread == NULL || parent == NULL || - thread__fork(thread, parent, sample->time) < 0) { + thread__fork(thread, parent, sample->time, do_maps_clone) < 0) { dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n"); err = -1; } @@ -2140,6 +2157,27 @@ static int resolve_lbr_callchain_sample(struct thread *thread, return 0; } +static int find_prev_cpumode(struct ip_callchain *chain, struct thread *thread, + struct callchain_cursor *cursor, + struct symbol **parent, + struct addr_location *root_al, + u8 *cpumode, int ent) +{ + int err = 0; + + while (--ent >= 0) { + u64 ip = chain->ips[ent]; + + if (ip >= PERF_CONTEXT_MAX) { + err = add_callchain_ip(thread, cursor, parent, + root_al, cpumode, ip, + false, NULL, NULL, 0); + break; + } + } + return err; +} + static int thread__resolve_callchain_sample(struct thread *thread, struct callchain_cursor *cursor, struct perf_evsel *evsel, @@ -2246,6 +2284,12 @@ static int thread__resolve_callchain_sample(struct thread *thread, } check_calls: + if (callchain_param.order != ORDER_CALLEE) { + err = find_prev_cpumode(chain, thread, cursor, parent, root_al, + &cpumode, chain->nr - first_call); + if (err) + return (err < 0) ? err : 0; + } for (i = first_call, nr_entries = 0; i < chain_nr && nr_entries < max_stack; i++) { u64 ip; @@ -2260,9 +2304,15 @@ check_calls: continue; #endif ip = chain->ips[j]; - if (ip < PERF_CONTEXT_MAX) ++nr_entries; + else if (callchain_param.order != ORDER_CALLEE) { + err = find_prev_cpumode(chain, thread, cursor, parent, + root_al, &cpumode, j); + if (err) + return (err < 0) ? err : 0; + continue; + } err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip, @@ -2286,7 +2336,8 @@ static int append_inlines(struct callchain_cursor *cursor, if (!symbol_conf.inline_name || !map || !sym) return ret; - addr = map__rip_2objdump(map, ip); + addr = map__map_ip(map, ip); + addr = map__rip_2objdump(map, addr); inline_node = inlines__tree_find(&map->dso->inlined_nodes, addr); if (!inline_node) { @@ -2312,7 +2363,7 @@ static int unwind_entry(struct unwind_entry *entry, void *arg) { struct callchain_cursor *cursor = arg; const char *srcline = NULL; - u64 addr; + u64 addr = entry->ip; if (symbol_conf.hide_unresolved && entry->sym == NULL) return 0; @@ -2324,7 +2375,8 @@ static int unwind_entry(struct unwind_entry *entry, void *arg) * Convert entry->ip from a virtual address to an offset in * its corresponding binary. */ - addr = map__map_ip(entry->map, entry->ip); + if (entry->map) + addr = map__map_ip(entry->map, entry->ip); srcline = callchain_srcline(entry->map, entry->sym, addr); return callchain_cursor_append(cursor, entry->ip, diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 6a6929f208b4..354e54550d2b 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -320,12 +320,11 @@ int map__load(struct map *map) build_id__sprintf(map->dso->build_id, sizeof(map->dso->build_id), sbuild_id); - pr_warning("%s with build id %s not found", - name, sbuild_id); + pr_debug("%s with build id %s not found", name, sbuild_id); } else - pr_warning("Failed to open %s", name); + pr_debug("Failed to open %s", name); - pr_warning(", continuing without symbols\n"); + pr_debug(", continuing without symbols\n"); return -1; } else if (nr == 0) { #ifdef HAVE_LIBELF_SUPPORT @@ -334,12 +333,11 @@ int map__load(struct map *map) if (len > sizeof(DSO__DELETED) && strcmp(name + real_len + 1, DSO__DELETED) == 0) { - pr_warning("%.*s was updated (is prelink enabled?). " + pr_debug("%.*s was updated (is prelink enabled?). " "Restart the long running apps that use it!\n", (int)real_len, name); } else { - pr_warning("no symbols found in %s, maybe install " - "a debug package?\n", name); + pr_debug("no symbols found in %s, maybe install a debug package?\n", name); } #endif return -1; @@ -712,8 +710,7 @@ static int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp if (verbose >= 2) { if (use_browser) { - pr_warning("overlapping maps in %s " - "(disable tui for more info)\n", + pr_debug("overlapping maps in %s (disable tui for more info)\n", map->dso->name); } else { fputs("overlapping maps:\n", fp); diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 215f69f41672..cdb95b3a1213 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -281,7 +281,7 @@ int perf_mmap__read_init(struct perf_mmap *map) } int perf_mmap__push(struct perf_mmap *md, void *to, - int push(void *to, void *buf, size_t size)) + int push(struct perf_mmap *map, void *to, void *buf, size_t size)) { u64 head = perf_mmap__read_head(md); unsigned char *data = md->base + page_size; @@ -300,7 +300,7 @@ int perf_mmap__push(struct perf_mmap *md, void *to, size = md->mask + 1 - (md->start & md->mask); md->start += size; - if (push(to, buf, size) < 0) { + if (push(md, to, buf, size) < 0) { rc = -1; goto out; } @@ -310,7 +310,7 @@ int perf_mmap__push(struct perf_mmap *md, void *to, size = md->end - md->start; md->start += size; - if (push(to, buf, size) < 0) { + if (push(md, to, buf, size) < 0) { rc = -1; goto out; } diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 05a6d47c7956..cc5e2d6d17a9 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include "auxtrace.h" #include "event.h" @@ -71,21 +71,12 @@ void perf_mmap__consume(struct perf_mmap *map); static inline u64 perf_mmap__read_head(struct perf_mmap *mm) { - struct perf_event_mmap_page *pc = mm->base; - u64 head = READ_ONCE(pc->data_head); - rmb(); - return head; + return ring_buffer_read_head(mm->base); } static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail) { - struct perf_event_mmap_page *pc = md->base; - - /* - * ensure all reads are done before we write the tail out. - */ - mb(); - pc->data_tail = tail; + ring_buffer_write_tail(md->base, tail); } union perf_event *perf_mmap__read_forward(struct perf_mmap *map); @@ -93,7 +84,7 @@ union perf_event *perf_mmap__read_forward(struct perf_mmap *map); union perf_event *perf_mmap__read_event(struct perf_mmap *map); int perf_mmap__push(struct perf_mmap *md, void *to, - int push(void *to, void *buf, size_t size)); + int push(struct perf_mmap *map, void *to, void *buf, size_t size)); size_t perf_mmap__mmap_len(struct perf_mmap *map); diff --git a/tools/perf/util/ordered-events.c b/tools/perf/util/ordered-events.c index bad9e0296e9a..1904e7f6ec84 100644 --- a/tools/perf/util/ordered-events.c +++ b/tools/perf/util/ordered-events.c @@ -80,14 +80,20 @@ static union perf_event *dup_event(struct ordered_events *oe, return oe->copy_on_queue ? __dup_event(oe, event) : event; } -static void free_dup_event(struct ordered_events *oe, union perf_event *event) +static void __free_dup_event(struct ordered_events *oe, union perf_event *event) { - if (event && oe->copy_on_queue) { + if (event) { oe->cur_alloc_size -= event->header.size; free(event); } } +static void free_dup_event(struct ordered_events *oe, union perf_event *event) +{ + if (oe->copy_on_queue) + __free_dup_event(oe, event); +} + #define MAX_SAMPLE_BUFFER (64 * 1024 / sizeof(struct ordered_event)) static struct ordered_event *alloc_event(struct ordered_events *oe, union perf_event *event) @@ -95,21 +101,49 @@ static struct ordered_event *alloc_event(struct ordered_events *oe, struct list_head *cache = &oe->cache; struct ordered_event *new = NULL; union perf_event *new_event; + size_t size; new_event = dup_event(oe, event); if (!new_event) return NULL; + /* + * We maintain the following scheme of buffers for ordered + * event allocation: + * + * to_free list -> buffer1 (64K) + * buffer2 (64K) + * ... + * + * Each buffer keeps an array of ordered events objects: + * buffer -> event[0] + * event[1] + * ... + * + * Each allocated ordered event is linked to one of + * following lists: + * - time ordered list 'events' + * - list of currently removed events 'cache' + * + * Allocation of the ordered event uses the following order + * to get the memory: + * - use recently removed object from 'cache' list + * - use available object in current allocation buffer + * - allocate new buffer if the current buffer is full + * + * Removal of ordered event object moves it from events to + * the cache list. + */ + size = sizeof(*oe->buffer) + MAX_SAMPLE_BUFFER * sizeof(*new); + if (!list_empty(cache)) { new = list_entry(cache->next, struct ordered_event, list); list_del(&new->list); } else if (oe->buffer) { - new = oe->buffer + oe->buffer_idx; + new = &oe->buffer->event[oe->buffer_idx]; if (++oe->buffer_idx == MAX_SAMPLE_BUFFER) oe->buffer = NULL; - } else if (oe->cur_alloc_size < oe->max_alloc_size) { - size_t size = MAX_SAMPLE_BUFFER * sizeof(*new); - + } else if ((oe->cur_alloc_size + size) < oe->max_alloc_size) { oe->buffer = malloc(size); if (!oe->buffer) { free_dup_event(oe, new_event); @@ -122,11 +156,11 @@ static struct ordered_event *alloc_event(struct ordered_events *oe, oe->cur_alloc_size += size; list_add(&oe->buffer->list, &oe->to_free); - /* First entry is abused to maintain the to_free list. */ - oe->buffer_idx = 2; - new = oe->buffer + 1; + oe->buffer_idx = 1; + new = &oe->buffer->event[0]; } else { pr("allocation limit reached %" PRIu64 "B\n", oe->max_alloc_size); + return NULL; } new->event = new_event; @@ -300,15 +334,38 @@ void ordered_events__init(struct ordered_events *oe, ordered_events__deliver_t d oe->deliver = deliver; } +static void +ordered_events_buffer__free(struct ordered_events_buffer *buffer, + unsigned int max, struct ordered_events *oe) +{ + if (oe->copy_on_queue) { + unsigned int i; + + for (i = 0; i < max; i++) + __free_dup_event(oe, buffer->event[i].event); + } + + free(buffer); +} + void ordered_events__free(struct ordered_events *oe) { - while (!list_empty(&oe->to_free)) { - struct ordered_event *event; + struct ordered_events_buffer *buffer, *tmp; - event = list_entry(oe->to_free.next, struct ordered_event, list); - list_del(&event->list); - free_dup_event(oe, event->event); - free(event); + if (list_empty(&oe->to_free)) + return; + + /* + * Current buffer might not have all the events allocated + * yet, we need to free only allocated ones ... + */ + list_del(&oe->buffer->list); + ordered_events_buffer__free(oe->buffer, oe->buffer_idx, oe); + + /* ... and continue with the rest */ + list_for_each_entry_safe(buffer, tmp, &oe->to_free, list) { + list_del(&buffer->list); + ordered_events_buffer__free(buffer, MAX_SAMPLE_BUFFER, oe); } } diff --git a/tools/perf/util/ordered-events.h b/tools/perf/util/ordered-events.h index 8c7a2948593e..1338d5c345dc 100644 --- a/tools/perf/util/ordered-events.h +++ b/tools/perf/util/ordered-events.h @@ -25,23 +25,28 @@ struct ordered_events; typedef int (*ordered_events__deliver_t)(struct ordered_events *oe, struct ordered_event *event); +struct ordered_events_buffer { + struct list_head list; + struct ordered_event event[0]; +}; + struct ordered_events { - u64 last_flush; - u64 next_flush; - u64 max_timestamp; - u64 max_alloc_size; - u64 cur_alloc_size; - struct list_head events; - struct list_head cache; - struct list_head to_free; - struct ordered_event *buffer; - struct ordered_event *last; - ordered_events__deliver_t deliver; - int buffer_idx; - unsigned int nr_events; - enum oe_flush last_flush_type; - u32 nr_unordered_events; - bool copy_on_queue; + u64 last_flush; + u64 next_flush; + u64 max_timestamp; + u64 max_alloc_size; + u64 cur_alloc_size; + struct list_head events; + struct list_head cache; + struct list_head to_free; + struct ordered_events_buffer *buffer; + struct ordered_event *last; + ordered_events__deliver_t deliver; + int buffer_idx; + unsigned int nr_events; + enum oe_flush last_flush_type; + u32 nr_unordered_events; + bool copy_on_queue; }; int ordered_events__queue(struct ordered_events *oe, union perf_event *event, diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f8cd3e7c9186..59be3466d64d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -926,6 +926,7 @@ static const char *config_term_names[__PARSE_EVENTS__TERM_TYPE_NR] = { [PARSE_EVENTS__TERM_TYPE_NOINHERIT] = "no-inherit", [PARSE_EVENTS__TERM_TYPE_INHERIT] = "inherit", [PARSE_EVENTS__TERM_TYPE_MAX_STACK] = "max-stack", + [PARSE_EVENTS__TERM_TYPE_MAX_EVENTS] = "nr", [PARSE_EVENTS__TERM_TYPE_OVERWRITE] = "overwrite", [PARSE_EVENTS__TERM_TYPE_NOOVERWRITE] = "no-overwrite", [PARSE_EVENTS__TERM_TYPE_DRV_CFG] = "driver-config", @@ -1037,6 +1038,9 @@ do { \ case PARSE_EVENTS__TERM_TYPE_MAX_STACK: CHECK_TYPE_VAL(NUM); break; + case PARSE_EVENTS__TERM_TYPE_MAX_EVENTS: + CHECK_TYPE_VAL(NUM); + break; default: err->str = strdup("unknown term"); err->idx = term->err_term; @@ -1084,6 +1088,7 @@ static int config_term_tracepoint(struct perf_event_attr *attr, case PARSE_EVENTS__TERM_TYPE_INHERIT: case PARSE_EVENTS__TERM_TYPE_NOINHERIT: case PARSE_EVENTS__TERM_TYPE_MAX_STACK: + case PARSE_EVENTS__TERM_TYPE_MAX_EVENTS: case PARSE_EVENTS__TERM_TYPE_OVERWRITE: case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE: return config_term_common(attr, term, err); @@ -1162,6 +1167,9 @@ do { \ case PARSE_EVENTS__TERM_TYPE_MAX_STACK: ADD_CONFIG_TERM(MAX_STACK, max_stack, term->val.num); break; + case PARSE_EVENTS__TERM_TYPE_MAX_EVENTS: + ADD_CONFIG_TERM(MAX_EVENTS, max_events, term->val.num); + break; case PARSE_EVENTS__TERM_TYPE_OVERWRITE: ADD_CONFIG_TERM(OVERWRITE, overwrite, term->val.num ? 1 : 0); break; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 4473dac27aee..5ed035cbcbb7 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -71,6 +71,7 @@ enum { PARSE_EVENTS__TERM_TYPE_NOINHERIT, PARSE_EVENTS__TERM_TYPE_INHERIT, PARSE_EVENTS__TERM_TYPE_MAX_STACK, + PARSE_EVENTS__TERM_TYPE_MAX_EVENTS, PARSE_EVENTS__TERM_TYPE_NOOVERWRITE, PARSE_EVENTS__TERM_TYPE_OVERWRITE, PARSE_EVENTS__TERM_TYPE_DRV_CFG, diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 5f761f3ed0f3..7805c71aaae2 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -269,6 +269,7 @@ time { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_TIME); } call-graph { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CALLGRAPH); } stack-size { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_STACKSIZE); } max-stack { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_MAX_STACK); } +nr { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_MAX_EVENTS); } inherit { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_INHERIT); } no-inherit { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOINHERIT); } overwrite { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_OVERWRITE); } diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index afd68524ffa9..7e49baad304d 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -773,7 +773,7 @@ static void pmu_add_cpu_aliases(struct list_head *head, struct perf_pmu *pmu) if (!is_arm_pmu_core(name)) { pname = pe->pmu ? pe->pmu : "cpu"; - if (strncmp(pname, name, strlen(pname))) + if (strcmp(pname, name)) continue; } @@ -930,13 +930,14 @@ static void pmu_format_value(unsigned long *format, __u64 value, __u64 *v, static __u64 pmu_format_max_value(const unsigned long *format) { - __u64 w = 0; - int fbit; - - for_each_set_bit(fbit, format, PERF_PMU_FORMAT_BITS) - w |= (1ULL << fbit); + int w; - return w; + w = bitmap_weight(format, PERF_PMU_FORMAT_BITS); + if (!w) + return 0; + if (w < 64) + return (1ULL << w) - 1; + return -1; } /* diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index f119eb628dbb..e86f8be89157 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -1819,6 +1819,12 @@ int parse_probe_trace_command(const char *cmd, struct probe_trace_event *tev) tp->offset = strtoul(fmt2_str, NULL, 10); } + if (tev->uprobes) { + fmt2_str = strchr(p, '('); + if (fmt2_str) + tp->ref_ctr_offset = strtoul(fmt2_str + 1, NULL, 0); + } + tev->nargs = argc - 2; tev->args = zalloc(sizeof(struct probe_trace_arg) * tev->nargs); if (tev->args == NULL) { @@ -2012,6 +2018,22 @@ static int synthesize_probe_trace_arg(struct probe_trace_arg *arg, return err; } +static int +synthesize_uprobe_trace_def(struct probe_trace_event *tev, struct strbuf *buf) +{ + struct probe_trace_point *tp = &tev->point; + int err; + + err = strbuf_addf(buf, "%s:0x%lx", tp->module, tp->address); + + if (err >= 0 && tp->ref_ctr_offset) { + if (!uprobe_ref_ctr_is_supported()) + return -1; + err = strbuf_addf(buf, "(0x%lx)", tp->ref_ctr_offset); + } + return err >= 0 ? 0 : -1; +} + char *synthesize_probe_trace_command(struct probe_trace_event *tev) { struct probe_trace_point *tp = &tev->point; @@ -2041,15 +2063,17 @@ char *synthesize_probe_trace_command(struct probe_trace_event *tev) } /* Use the tp->address for uprobes */ - if (tev->uprobes) - err = strbuf_addf(&buf, "%s:0x%lx", tp->module, tp->address); - else if (!strncmp(tp->symbol, "0x", 2)) + if (tev->uprobes) { + err = synthesize_uprobe_trace_def(tev, &buf); + } else if (!strncmp(tp->symbol, "0x", 2)) { /* Absolute address. See try_to_find_absolute_address() */ err = strbuf_addf(&buf, "%s%s0x%lx", tp->module ?: "", tp->module ? ":" : "", tp->address); - else + } else { err = strbuf_addf(&buf, "%s%s%s+%lu", tp->module ?: "", tp->module ? ":" : "", tp->symbol, tp->offset); + } + if (err) goto error; @@ -2633,6 +2657,13 @@ static void warn_uprobe_event_compat(struct probe_trace_event *tev) { int i; char *buf = synthesize_probe_trace_command(tev); + struct probe_trace_point *tp = &tev->point; + + if (tp->ref_ctr_offset && !uprobe_ref_ctr_is_supported()) { + pr_warning("A semaphore is associated with %s:%s and " + "seems your kernel doesn't support it.\n", + tev->group, tev->event); + } /* Old uprobe event doesn't support memory dereference */ if (!tev->uprobes || tev->nargs == 0 || !buf) diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index 45b14f020558..15a98c3a2a2f 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -27,6 +27,7 @@ struct probe_trace_point { char *symbol; /* Base symbol */ char *module; /* Module name */ unsigned long offset; /* Offset from symbol */ + unsigned long ref_ctr_offset; /* SDT reference counter offset */ unsigned long address; /* Actual address of the trace point */ bool retprobe; /* Return probe flag */ }; diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c index b76088fadf3d..aac7817d9e14 100644 --- a/tools/perf/util/probe-file.c +++ b/tools/perf/util/probe-file.c @@ -696,8 +696,16 @@ out_err: #ifdef HAVE_GELF_GETNOTE_SUPPORT static unsigned long long sdt_note__get_addr(struct sdt_note *note) { - return note->bit32 ? (unsigned long long)note->addr.a32[0] - : (unsigned long long)note->addr.a64[0]; + return note->bit32 ? + (unsigned long long)note->addr.a32[SDT_NOTE_IDX_LOC] : + (unsigned long long)note->addr.a64[SDT_NOTE_IDX_LOC]; +} + +static unsigned long long sdt_note__get_ref_ctr_offset(struct sdt_note *note) +{ + return note->bit32 ? + (unsigned long long)note->addr.a32[SDT_NOTE_IDX_REFCTR] : + (unsigned long long)note->addr.a64[SDT_NOTE_IDX_REFCTR]; } static const char * const type_to_suffix[] = { @@ -775,14 +783,21 @@ static char *synthesize_sdt_probe_command(struct sdt_note *note, { struct strbuf buf; char *ret = NULL, **args; - int i, args_count; + int i, args_count, err; + unsigned long long ref_ctr_offset; if (strbuf_init(&buf, 32) < 0) return NULL; - if (strbuf_addf(&buf, "p:%s/%s %s:0x%llx", - sdtgrp, note->name, pathname, - sdt_note__get_addr(note)) < 0) + err = strbuf_addf(&buf, "p:%s/%s %s:0x%llx", + sdtgrp, note->name, pathname, + sdt_note__get_addr(note)); + + ref_ctr_offset = sdt_note__get_ref_ctr_offset(note); + if (ref_ctr_offset && err >= 0) + err = strbuf_addf(&buf, "(0x%llx)", ref_ctr_offset); + + if (err < 0) goto error; if (!note->args) @@ -998,6 +1013,7 @@ int probe_cache__show_all_caches(struct strfilter *filter) enum ftrace_readme { FTRACE_README_PROBE_TYPE_X = 0, FTRACE_README_KRETPROBE_OFFSET, + FTRACE_README_UPROBE_REF_CTR, FTRACE_README_END, }; @@ -1009,6 +1025,7 @@ static struct { [idx] = {.pattern = pat, .avail = false} DEFINE_TYPE(FTRACE_README_PROBE_TYPE_X, "*type: * x8/16/32/64,*"), DEFINE_TYPE(FTRACE_README_KRETPROBE_OFFSET, "*place (kretprobe): *"), + DEFINE_TYPE(FTRACE_README_UPROBE_REF_CTR, "*ref_ctr_offset*"), }; static bool scan_ftrace_readme(enum ftrace_readme type) @@ -1064,3 +1081,8 @@ bool kretprobe_offset_is_supported(void) { return scan_ftrace_readme(FTRACE_README_KRETPROBE_OFFSET); } + +bool uprobe_ref_ctr_is_supported(void) +{ + return scan_ftrace_readme(FTRACE_README_UPROBE_REF_CTR); +} diff --git a/tools/perf/util/probe-file.h b/tools/perf/util/probe-file.h index 63f29b1d22c1..2a249182f2a6 100644 --- a/tools/perf/util/probe-file.h +++ b/tools/perf/util/probe-file.h @@ -69,6 +69,7 @@ struct probe_cache_entry *probe_cache__find_by_name(struct probe_cache *pcache, int probe_cache__show_all_caches(struct strfilter *filter); bool probe_type_is_available(enum probe_type type); bool kretprobe_offset_is_supported(void); +bool uprobe_ref_ctr_is_supported(void); #else /* ! HAVE_LIBELF_SUPPORT */ static inline struct probe_cache *probe_cache__new(const char *tgt __maybe_unused, struct nsinfo *nsi __maybe_unused) { diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index ce501ba14b08..50150dfc0cdf 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -340,7 +340,7 @@ static bool is_tracepoint(struct pyrf_event *pevent) } static PyObject* -tracepoint_field(struct pyrf_event *pe, struct format_field *field) +tracepoint_field(struct pyrf_event *pe, struct tep_format_field *field) { struct tep_handle *pevent = field->event->pevent; void *data = pe->sample.raw_data; @@ -348,28 +348,28 @@ tracepoint_field(struct pyrf_event *pe, struct format_field *field) unsigned long long val; unsigned int offset, len; - if (field->flags & FIELD_IS_ARRAY) { + if (field->flags & TEP_FIELD_IS_ARRAY) { offset = field->offset; len = field->size; - if (field->flags & FIELD_IS_DYNAMIC) { + if (field->flags & TEP_FIELD_IS_DYNAMIC) { val = tep_read_number(pevent, data + offset, len); offset = val; len = offset >> 16; offset &= 0xffff; } - if (field->flags & FIELD_IS_STRING && + if (field->flags & TEP_FIELD_IS_STRING && is_printable_array(data + offset, len)) { ret = _PyUnicode_FromString((char *)data + offset); } else { ret = PyByteArray_FromStringAndSize((const char *) data + offset, len); - field->flags &= ~FIELD_IS_STRING; + field->flags &= ~TEP_FIELD_IS_STRING; } } else { val = tep_read_number(pevent, data + field->offset, field->size); - if (field->flags & FIELD_IS_POINTER) + if (field->flags & TEP_FIELD_IS_POINTER) ret = PyLong_FromUnsignedLong((unsigned long) val); - else if (field->flags & FIELD_IS_SIGNED) + else if (field->flags & TEP_FIELD_IS_SIGNED) ret = PyLong_FromLong((long) val); else ret = PyLong_FromUnsignedLong((unsigned long) val); @@ -383,10 +383,10 @@ get_tracepoint_field(struct pyrf_event *pevent, PyObject *attr_name) { const char *str = _PyUnicode_AsString(PyObject_Str(attr_name)); struct perf_evsel *evsel = pevent->evsel; - struct format_field *field; + struct tep_format_field *field; if (!evsel->tp_format) { - struct event_format *tp_format; + struct tep_event_format *tp_format; tp_format = trace_event__tp_format_id(evsel->attr.config); if (!tp_format) @@ -1240,7 +1240,7 @@ static struct { static PyObject *pyrf__tracepoint(struct pyrf_evsel *pevsel, PyObject *args, PyObject *kwargs) { - struct event_format *tp_format; + struct tep_event_format *tp_format; static char *kwlist[] = { "sys", "name", NULL }; char *sys = NULL; char *name = NULL; diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c index d2c78ffd9fee..a2eeebbfb25f 100644 --- a/tools/perf/util/s390-cpumsf.c +++ b/tools/perf/util/s390-cpumsf.c @@ -147,6 +147,9 @@ #include #include +#include +#include + #include "cpumap.h" #include "color.h" #include "evsel.h" @@ -159,6 +162,7 @@ #include "auxtrace.h" #include "s390-cpumsf.h" #include "s390-cpumsf-kernel.h" +#include "config.h" struct s390_cpumsf { struct auxtrace auxtrace; @@ -170,6 +174,8 @@ struct s390_cpumsf { u32 pmu_type; u16 machine_type; bool data_queued; + bool use_logfile; + char *logdir; }; struct s390_cpumsf_queue { @@ -177,6 +183,7 @@ struct s390_cpumsf_queue { unsigned int queue_nr; struct auxtrace_buffer *buffer; int cpu; + FILE *logfile; }; /* Display s390 CPU measurement facility basic-sampling data entry */ @@ -595,6 +602,12 @@ static int s390_cpumsf_run_decoder(struct s390_cpumsf_queue *sfq, buffer->use_size = buffer->size; buffer->use_data = buffer->data; } + if (sfq->logfile) { /* Write into log file */ + size_t rc = fwrite(buffer->data, buffer->size, 1, + sfq->logfile); + if (rc != 1) + pr_err("Failed to write auxiliary data\n"); + } } else buffer = sfq->buffer; @@ -606,6 +619,13 @@ static int s390_cpumsf_run_decoder(struct s390_cpumsf_queue *sfq, return -ENOMEM; buffer->use_size = buffer->size; buffer->use_data = buffer->data; + + if (sfq->logfile) { /* Write into log file */ + size_t rc = fwrite(buffer->data, buffer->size, 1, + sfq->logfile); + if (rc != 1) + pr_err("Failed to write auxiliary data\n"); + } } pr_debug4("%s queue_nr:%d buffer:%" PRId64 " offset:%#" PRIx64 " size:%#zx rest:%#zx\n", __func__, sfq->queue_nr, buffer->buffer_nr, buffer->offset, @@ -640,6 +660,23 @@ s390_cpumsf_alloc_queue(struct s390_cpumsf *sf, unsigned int queue_nr) sfq->sf = sf; sfq->queue_nr = queue_nr; sfq->cpu = -1; + if (sf->use_logfile) { + char *name; + int rc; + + rc = (sf->logdir) + ? asprintf(&name, "%s/aux.smp.%02x", + sf->logdir, queue_nr) + : asprintf(&name, "aux.smp.%02x", queue_nr); + if (rc > 0) + sfq->logfile = fopen(name, "w"); + if (sfq->logfile == NULL) { + pr_err("Failed to open auxiliary log file %s," + "continue...\n", name); + sf->use_logfile = false; + } + free(name); + } return sfq; } @@ -850,8 +887,16 @@ static void s390_cpumsf_free_queues(struct perf_session *session) struct auxtrace_queues *queues = &sf->queues; unsigned int i; - for (i = 0; i < queues->nr_queues; i++) + for (i = 0; i < queues->nr_queues; i++) { + struct s390_cpumsf_queue *sfq = (struct s390_cpumsf_queue *) + queues->queue_array[i].priv; + + if (sfq != NULL && sfq->logfile) { + fclose(sfq->logfile); + sfq->logfile = NULL; + } zfree(&queues->queue_array[i].priv); + } auxtrace_queues__free(queues); } @@ -864,6 +909,7 @@ static void s390_cpumsf_free(struct perf_session *session) auxtrace_heap__free(&sf->heap); s390_cpumsf_free_queues(session); session->auxtrace = NULL; + free(sf->logdir); free(sf); } @@ -877,17 +923,55 @@ static int s390_cpumsf_get_type(const char *cpuid) /* Check itrace options set on perf report command. * Return true, if none are set or all options specified can be - * handled on s390. + * handled on s390 (currently only option 'd' for logging. * Return false otherwise. */ static bool check_auxtrace_itrace(struct itrace_synth_opts *itops) { + bool ison = false; + if (!itops || !itops->set) return true; - pr_err("No --itrace options supported\n"); + ison = itops->inject || itops->instructions || itops->branches || + itops->transactions || itops->ptwrites || + itops->pwr_events || itops->errors || + itops->dont_decode || itops->calls || itops->returns || + itops->callchain || itops->thread_stack || + itops->last_branch; + if (!ison) + return true; + pr_err("Unsupported --itrace options specified\n"); return false; } +/* Check for AUXTRACE dump directory if it is needed. + * On failure print an error message but continue. + * Return 0 on wrong keyword in config file and 1 otherwise. + */ +static int s390_cpumsf__config(const char *var, const char *value, void *cb) +{ + struct s390_cpumsf *sf = cb; + struct stat stbuf; + int rc; + + if (strcmp(var, "auxtrace.dumpdir")) + return 0; + sf->logdir = strdup(value); + if (sf->logdir == NULL) { + pr_err("Failed to find auxtrace log directory %s," + " continue with current directory...\n", value); + return 1; + } + rc = stat(sf->logdir, &stbuf); + if (rc == -1 || !S_ISDIR(stbuf.st_mode)) { + pr_err("Missing auxtrace log directory %s," + " continue with current directory...\n", value); + free(sf->logdir); + sf->logdir = NULL; + } + return 1; +} + int s390_cpumsf_process_auxtrace_info(union perf_event *event, struct perf_session *session) { @@ -906,6 +990,9 @@ int s390_cpumsf_process_auxtrace_info(union perf_event *event, err = -EINVAL; goto err_free; } + sf->use_logfile = session->itrace_synth_opts->log; + if (sf->use_logfile) + perf_config(s390_cpumsf__config, sf); err = auxtrace_queues__init(&sf->queues); if (err) @@ -940,6 +1027,7 @@ err_free_queues: auxtrace_queues__free(&sf->queues); session->auxtrace = NULL; err_free: + free(sf->logdir); free(sf); return err; } diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index 45484f0f7292..89cb887648f9 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -99,7 +99,7 @@ static void define_symbolic_value(const char *ev_name, LEAVE; } -static void define_symbolic_values(struct print_flag_sym *field, +static void define_symbolic_values(struct tep_print_flag_sym *field, const char *ev_name, const char *field_name) { @@ -157,7 +157,7 @@ static void define_flag_value(const char *ev_name, LEAVE; } -static void define_flag_values(struct print_flag_sym *field, +static void define_flag_values(struct tep_print_flag_sym *field, const char *ev_name, const char *field_name) { @@ -189,62 +189,62 @@ static void define_flag_field(const char *ev_name, LEAVE; } -static void define_event_symbols(struct event_format *event, +static void define_event_symbols(struct tep_event_format *event, const char *ev_name, - struct print_arg *args) + struct tep_print_arg *args) { if (args == NULL) return; switch (args->type) { - case PRINT_NULL: + case TEP_PRINT_NULL: break; - case PRINT_ATOM: + case TEP_PRINT_ATOM: define_flag_value(ev_name, cur_field_name, "0", args->atom.atom); zero_flag_atom = 0; break; - case PRINT_FIELD: + case TEP_PRINT_FIELD: free(cur_field_name); cur_field_name = strdup(args->field.name); break; - case PRINT_FLAGS: + case TEP_PRINT_FLAGS: define_event_symbols(event, ev_name, args->flags.field); define_flag_field(ev_name, cur_field_name, args->flags.delim); define_flag_values(args->flags.flags, ev_name, cur_field_name); break; - case PRINT_SYMBOL: + case TEP_PRINT_SYMBOL: define_event_symbols(event, ev_name, args->symbol.field); define_symbolic_field(ev_name, cur_field_name); define_symbolic_values(args->symbol.symbols, ev_name, cur_field_name); break; - case PRINT_HEX: - case PRINT_HEX_STR: + case TEP_PRINT_HEX: + case TEP_PRINT_HEX_STR: define_event_symbols(event, ev_name, args->hex.field); define_event_symbols(event, ev_name, args->hex.size); break; - case PRINT_INT_ARRAY: + case TEP_PRINT_INT_ARRAY: define_event_symbols(event, ev_name, args->int_array.field); define_event_symbols(event, ev_name, args->int_array.count); define_event_symbols(event, ev_name, args->int_array.el_size); break; - case PRINT_BSTRING: - case PRINT_DYNAMIC_ARRAY: - case PRINT_DYNAMIC_ARRAY_LEN: - case PRINT_STRING: - case PRINT_BITMASK: + case TEP_PRINT_BSTRING: + case TEP_PRINT_DYNAMIC_ARRAY: + case TEP_PRINT_DYNAMIC_ARRAY_LEN: + case TEP_PRINT_STRING: + case TEP_PRINT_BITMASK: break; - case PRINT_TYPE: + case TEP_PRINT_TYPE: define_event_symbols(event, ev_name, args->typecast.item); break; - case PRINT_OP: + case TEP_PRINT_OP: if (strcmp(args->op.op, ":") == 0) zero_flag_atom = 1; define_event_symbols(event, ev_name, args->op.left); define_event_symbols(event, ev_name, args->op.right); break; - case PRINT_FUNC: + case TEP_PRINT_FUNC: default: pr_err("Unsupported print arg type\n"); /* we should warn... */ @@ -338,8 +338,8 @@ static void perl_process_tracepoint(struct perf_sample *sample, struct addr_location *al) { struct thread *thread = al->thread; - struct event_format *event = evsel->tp_format; - struct format_field *field; + struct tep_event_format *event = evsel->tp_format; + struct tep_format_field *field; static char handler[256]; unsigned long long val; unsigned long s, ns; @@ -388,9 +388,9 @@ static void perl_process_tracepoint(struct perf_sample *sample, /* common fields other than pid can be accessed via xsub fns */ for (field = event->format.fields; field; field = field->next) { - if (field->flags & FIELD_IS_STRING) { + if (field->flags & TEP_FIELD_IS_STRING) { int offset; - if (field->flags & FIELD_IS_DYNAMIC) { + if (field->flags & TEP_FIELD_IS_DYNAMIC) { offset = *(int *)(data + field->offset); offset &= 0xffff; } else @@ -399,7 +399,7 @@ static void perl_process_tracepoint(struct perf_sample *sample, } else { /* FIELD_IS_NUMERIC */ val = read_size(event, data + field->offset, field->size); - if (field->flags & FIELD_IS_SIGNED) { + if (field->flags & TEP_FIELD_IS_SIGNED) { XPUSHs(sv_2mortal(newSViv(val))); } else { XPUSHs(sv_2mortal(newSVuv(val))); @@ -537,8 +537,8 @@ static int perl_stop_script(void) static int perl_generate_script(struct tep_handle *pevent, const char *outfile) { - struct event_format *event = NULL; - struct format_field *f; + struct tep_event_format *event = NULL; + struct tep_format_field *f; char fname[PATH_MAX]; int not_first, count; FILE *ofp; @@ -646,11 +646,11 @@ sub print_backtrace\n\ count++; fprintf(ofp, "%s=", f->name); - if (f->flags & FIELD_IS_STRING || - f->flags & FIELD_IS_FLAG || - f->flags & FIELD_IS_SYMBOLIC) + if (f->flags & TEP_FIELD_IS_STRING || + f->flags & TEP_FIELD_IS_FLAG || + f->flags & TEP_FIELD_IS_SYMBOLIC) fprintf(ofp, "%%s"); - else if (f->flags & FIELD_IS_SIGNED) + else if (f->flags & TEP_FIELD_IS_SIGNED) fprintf(ofp, "%%d"); else fprintf(ofp, "%%u"); @@ -668,7 +668,7 @@ sub print_backtrace\n\ if (++count % 5 == 0) fprintf(ofp, "\n\t "); - if (f->flags & FIELD_IS_FLAG) { + if (f->flags & TEP_FIELD_IS_FLAG) { if ((count - 1) % 5 != 0) { fprintf(ofp, "\n\t "); count = 4; @@ -678,7 +678,7 @@ sub print_backtrace\n\ event->name); fprintf(ofp, "\"%s\", $%s)", f->name, f->name); - } else if (f->flags & FIELD_IS_SYMBOLIC) { + } else if (f->flags & TEP_FIELD_IS_SYMBOLIC) { if ((count - 1) % 5 != 0) { fprintf(ofp, "\n\t "); count = 4; diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index dfc6093f118c..69aa93d4ee99 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -193,7 +193,7 @@ static void try_call_object(const char *handler_name, PyObject *args) call_object(handler, args, handler_name); } -static void define_value(enum print_arg_type field_type, +static void define_value(enum tep_print_arg_type field_type, const char *ev_name, const char *field_name, const char *field_value, @@ -204,7 +204,7 @@ static void define_value(enum print_arg_type field_type, unsigned long long value; unsigned n = 0; - if (field_type == PRINT_SYMBOL) + if (field_type == TEP_PRINT_SYMBOL) handler_name = "define_symbolic_value"; t = PyTuple_New(4); @@ -223,8 +223,8 @@ static void define_value(enum print_arg_type field_type, Py_DECREF(t); } -static void define_values(enum print_arg_type field_type, - struct print_flag_sym *field, +static void define_values(enum tep_print_arg_type field_type, + struct tep_print_flag_sym *field, const char *ev_name, const char *field_name) { @@ -235,7 +235,7 @@ static void define_values(enum print_arg_type field_type, define_values(field_type, field->next, ev_name, field_name); } -static void define_field(enum print_arg_type field_type, +static void define_field(enum tep_print_arg_type field_type, const char *ev_name, const char *field_name, const char *delim) @@ -244,10 +244,10 @@ static void define_field(enum print_arg_type field_type, PyObject *t; unsigned n = 0; - if (field_type == PRINT_SYMBOL) + if (field_type == TEP_PRINT_SYMBOL) handler_name = "define_symbolic_field"; - if (field_type == PRINT_FLAGS) + if (field_type == TEP_PRINT_FLAGS) t = PyTuple_New(3); else t = PyTuple_New(2); @@ -256,7 +256,7 @@ static void define_field(enum print_arg_type field_type, PyTuple_SetItem(t, n++, _PyUnicode_FromString(ev_name)); PyTuple_SetItem(t, n++, _PyUnicode_FromString(field_name)); - if (field_type == PRINT_FLAGS) + if (field_type == TEP_PRINT_FLAGS) PyTuple_SetItem(t, n++, _PyUnicode_FromString(delim)); try_call_object(handler_name, t); @@ -264,54 +264,54 @@ static void define_field(enum print_arg_type field_type, Py_DECREF(t); } -static void define_event_symbols(struct event_format *event, +static void define_event_symbols(struct tep_event_format *event, const char *ev_name, - struct print_arg *args) + struct tep_print_arg *args) { if (args == NULL) return; switch (args->type) { - case PRINT_NULL: + case TEP_PRINT_NULL: break; - case PRINT_ATOM: - define_value(PRINT_FLAGS, ev_name, cur_field_name, "0", + case TEP_PRINT_ATOM: + define_value(TEP_PRINT_FLAGS, ev_name, cur_field_name, "0", args->atom.atom); zero_flag_atom = 0; break; - case PRINT_FIELD: + case TEP_PRINT_FIELD: free(cur_field_name); cur_field_name = strdup(args->field.name); break; - case PRINT_FLAGS: + case TEP_PRINT_FLAGS: define_event_symbols(event, ev_name, args->flags.field); - define_field(PRINT_FLAGS, ev_name, cur_field_name, + define_field(TEP_PRINT_FLAGS, ev_name, cur_field_name, args->flags.delim); - define_values(PRINT_FLAGS, args->flags.flags, ev_name, + define_values(TEP_PRINT_FLAGS, args->flags.flags, ev_name, cur_field_name); break; - case PRINT_SYMBOL: + case TEP_PRINT_SYMBOL: define_event_symbols(event, ev_name, args->symbol.field); - define_field(PRINT_SYMBOL, ev_name, cur_field_name, NULL); - define_values(PRINT_SYMBOL, args->symbol.symbols, ev_name, + define_field(TEP_PRINT_SYMBOL, ev_name, cur_field_name, NULL); + define_values(TEP_PRINT_SYMBOL, args->symbol.symbols, ev_name, cur_field_name); break; - case PRINT_HEX: - case PRINT_HEX_STR: + case TEP_PRINT_HEX: + case TEP_PRINT_HEX_STR: define_event_symbols(event, ev_name, args->hex.field); define_event_symbols(event, ev_name, args->hex.size); break; - case PRINT_INT_ARRAY: + case TEP_PRINT_INT_ARRAY: define_event_symbols(event, ev_name, args->int_array.field); define_event_symbols(event, ev_name, args->int_array.count); define_event_symbols(event, ev_name, args->int_array.el_size); break; - case PRINT_STRING: + case TEP_PRINT_STRING: break; - case PRINT_TYPE: + case TEP_PRINT_TYPE: define_event_symbols(event, ev_name, args->typecast.item); break; - case PRINT_OP: + case TEP_PRINT_OP: if (strcmp(args->op.op, ":") == 0) zero_flag_atom = 1; define_event_symbols(event, ev_name, args->op.left); @@ -319,11 +319,11 @@ static void define_event_symbols(struct event_format *event, break; default: /* gcc warns for these? */ - case PRINT_BSTRING: - case PRINT_DYNAMIC_ARRAY: - case PRINT_DYNAMIC_ARRAY_LEN: - case PRINT_FUNC: - case PRINT_BITMASK: + case TEP_PRINT_BSTRING: + case TEP_PRINT_DYNAMIC_ARRAY: + case TEP_PRINT_DYNAMIC_ARRAY_LEN: + case TEP_PRINT_FUNC: + case TEP_PRINT_BITMASK: /* we should warn... */ return; } @@ -332,10 +332,10 @@ static void define_event_symbols(struct event_format *event, define_event_symbols(event, ev_name, args->next); } -static PyObject *get_field_numeric_entry(struct event_format *event, - struct format_field *field, void *data) +static PyObject *get_field_numeric_entry(struct tep_event_format *event, + struct tep_format_field *field, void *data) { - bool is_array = field->flags & FIELD_IS_ARRAY; + bool is_array = field->flags & TEP_FIELD_IS_ARRAY; PyObject *obj = NULL, *list = NULL; unsigned long long val; unsigned int item_size, n_items, i; @@ -353,7 +353,7 @@ static PyObject *get_field_numeric_entry(struct event_format *event, val = read_size(event, data + field->offset + i * item_size, item_size); - if (field->flags & FIELD_IS_SIGNED) { + if (field->flags & TEP_FIELD_IS_SIGNED) { if ((long long)val >= LONG_MIN && (long long)val <= LONG_MAX) obj = _PyLong_FromLong(val); @@ -790,11 +790,11 @@ static void python_process_tracepoint(struct perf_sample *sample, struct perf_evsel *evsel, struct addr_location *al) { - struct event_format *event = evsel->tp_format; + struct tep_event_format *event = evsel->tp_format; PyObject *handler, *context, *t, *obj = NULL, *callchain; PyObject *dict = NULL, *all_entries_dict = NULL; static char handler_name[256]; - struct format_field *field; + struct tep_format_field *field; unsigned long s, ns; unsigned n = 0; int pid; @@ -867,22 +867,22 @@ static void python_process_tracepoint(struct perf_sample *sample, unsigned int offset, len; unsigned long long val; - if (field->flags & FIELD_IS_ARRAY) { + if (field->flags & TEP_FIELD_IS_ARRAY) { offset = field->offset; len = field->size; - if (field->flags & FIELD_IS_DYNAMIC) { + if (field->flags & TEP_FIELD_IS_DYNAMIC) { val = tep_read_number(scripting_context->pevent, data + offset, len); offset = val; len = offset >> 16; offset &= 0xffff; } - if (field->flags & FIELD_IS_STRING && + if (field->flags & TEP_FIELD_IS_STRING && is_printable_array(data + offset, len)) { obj = _PyUnicode_FromString((char *) data + offset); } else { obj = PyByteArray_FromStringAndSize((const char *) data + offset, len); - field->flags &= ~FIELD_IS_STRING; + field->flags &= ~TEP_FIELD_IS_STRING; } } else { /* FIELD_IS_NUMERIC */ obj = get_field_numeric_entry(event, field, data); @@ -1590,8 +1590,8 @@ static int python_stop_script(void) static int python_generate_script(struct tep_handle *pevent, const char *outfile) { - struct event_format *event = NULL; - struct format_field *f; + struct tep_event_format *event = NULL; + struct tep_format_field *f; char fname[PATH_MAX]; int not_first, count; FILE *ofp; @@ -1686,12 +1686,12 @@ static int python_generate_script(struct tep_handle *pevent, const char *outfile count++; fprintf(ofp, "%s=", f->name); - if (f->flags & FIELD_IS_STRING || - f->flags & FIELD_IS_FLAG || - f->flags & FIELD_IS_ARRAY || - f->flags & FIELD_IS_SYMBOLIC) + if (f->flags & TEP_FIELD_IS_STRING || + f->flags & TEP_FIELD_IS_FLAG || + f->flags & TEP_FIELD_IS_ARRAY || + f->flags & TEP_FIELD_IS_SYMBOLIC) fprintf(ofp, "%%s"); - else if (f->flags & FIELD_IS_SIGNED) + else if (f->flags & TEP_FIELD_IS_SIGNED) fprintf(ofp, "%%d"); else fprintf(ofp, "%%u"); @@ -1709,7 +1709,7 @@ static int python_generate_script(struct tep_handle *pevent, const char *outfile if (++count % 5 == 0) fprintf(ofp, "\n\t\t"); - if (f->flags & FIELD_IS_FLAG) { + if (f->flags & TEP_FIELD_IS_FLAG) { if ((count - 1) % 5 != 0) { fprintf(ofp, "\n\t\t"); count = 4; @@ -1719,7 +1719,7 @@ static int python_generate_script(struct tep_handle *pevent, const char *outfile event->name); fprintf(ofp, "\"%s\", %s)", f->name, f->name); - } else if (f->flags & FIELD_IS_SYMBOLIC) { + } else if (f->flags & TEP_FIELD_IS_SYMBOLIC) { if ((count - 1) % 5 != 0) { fprintf(ofp, "\n\t\t"); count = 4; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 8b9369303561..7d2c8ce6cfad 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -199,12 +199,10 @@ void perf_session__delete(struct perf_session *session) free(session); } -static int process_event_synth_tracing_data_stub(struct perf_tool *tool +static int process_event_synth_tracing_data_stub(struct perf_session *session __maybe_unused, union perf_event *event - __maybe_unused, - struct perf_session *session - __maybe_unused) + __maybe_unused) { dump_printf(": unhandled!\n"); return 0; @@ -277,10 +275,8 @@ static int skipn(int fd, off_t n) return 0; } -static s64 process_event_auxtrace_stub(struct perf_tool *tool __maybe_unused, - union perf_event *event, - struct perf_session *session - __maybe_unused) +static s64 process_event_auxtrace_stub(struct perf_session *session __maybe_unused, + union perf_event *event) { dump_printf(": unhandled!\n"); if (perf_data__is_pipe(session->data)) @@ -288,9 +284,8 @@ static s64 process_event_auxtrace_stub(struct perf_tool *tool __maybe_unused, return event->auxtrace.size; } -static int process_event_op2_stub(struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_session *session __maybe_unused) +static int process_event_op2_stub(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused) { dump_printf(": unhandled!\n"); return 0; @@ -298,9 +293,8 @@ static int process_event_op2_stub(struct perf_tool *tool __maybe_unused, static -int process_event_thread_map_stub(struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_session *session __maybe_unused) +int process_event_thread_map_stub(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused) { if (dump_trace) perf_event__fprintf_thread_map(event, stdout); @@ -310,9 +304,8 @@ int process_event_thread_map_stub(struct perf_tool *tool __maybe_unused, } static -int process_event_cpu_map_stub(struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_session *session __maybe_unused) +int process_event_cpu_map_stub(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused) { if (dump_trace) perf_event__fprintf_cpu_map(event, stdout); @@ -322,9 +315,8 @@ int process_event_cpu_map_stub(struct perf_tool *tool __maybe_unused, } static -int process_event_stat_config_stub(struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_session *session __maybe_unused) +int process_event_stat_config_stub(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused) { if (dump_trace) perf_event__fprintf_stat_config(event, stdout); @@ -333,10 +325,8 @@ int process_event_stat_config_stub(struct perf_tool *tool __maybe_unused, return 0; } -static int process_stat_stub(struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_session *perf_session - __maybe_unused) +static int process_stat_stub(struct perf_session *perf_session __maybe_unused, + union perf_event *event) { if (dump_trace) perf_event__fprintf_stat(event, stdout); @@ -345,10 +335,8 @@ static int process_stat_stub(struct perf_tool *tool __maybe_unused, return 0; } -static int process_stat_round_stub(struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_session *perf_session - __maybe_unused) +static int process_stat_round_stub(struct perf_session *perf_session __maybe_unused, + union perf_event *event) { if (dump_trace) perf_event__fprintf_stat_round(event, stdout); @@ -1374,37 +1362,37 @@ static s64 perf_session__process_user_event(struct perf_session *session, case PERF_RECORD_HEADER_TRACING_DATA: /* setup for reading amidst mmap */ lseek(fd, file_offset, SEEK_SET); - return tool->tracing_data(tool, event, session); + return tool->tracing_data(session, event); case PERF_RECORD_HEADER_BUILD_ID: - return tool->build_id(tool, event, session); + return tool->build_id(session, event); case PERF_RECORD_FINISHED_ROUND: return tool->finished_round(tool, event, oe); case PERF_RECORD_ID_INDEX: - return tool->id_index(tool, event, session); + return tool->id_index(session, event); case PERF_RECORD_AUXTRACE_INFO: - return tool->auxtrace_info(tool, event, session); + return tool->auxtrace_info(session, event); case PERF_RECORD_AUXTRACE: /* setup for reading amidst mmap */ lseek(fd, file_offset + event->header.size, SEEK_SET); - return tool->auxtrace(tool, event, session); + return tool->auxtrace(session, event); case PERF_RECORD_AUXTRACE_ERROR: perf_session__auxtrace_error_inc(session, event); - return tool->auxtrace_error(tool, event, session); + return tool->auxtrace_error(session, event); case PERF_RECORD_THREAD_MAP: - return tool->thread_map(tool, event, session); + return tool->thread_map(session, event); case PERF_RECORD_CPU_MAP: - return tool->cpu_map(tool, event, session); + return tool->cpu_map(session, event); case PERF_RECORD_STAT_CONFIG: - return tool->stat_config(tool, event, session); + return tool->stat_config(session, event); case PERF_RECORD_STAT: - return tool->stat(tool, event, session); + return tool->stat(session, event); case PERF_RECORD_STAT_ROUND: - return tool->stat_round(tool, event, session); + return tool->stat_round(session, event); case PERF_RECORD_TIME_CONV: session->time_conv = event->time_conv; - return tool->time_conv(tool, event, session); + return tool->time_conv(session, event); case PERF_RECORD_HEADER_FEATURE: - return tool->feature(tool, event, session); + return tool->feature(session, event); default: return -EINVAL; } @@ -2133,9 +2121,8 @@ out: return err; } -int perf_event__process_id_index(struct perf_tool *tool __maybe_unused, - union perf_event *event, - struct perf_session *session) +int perf_event__process_id_index(struct perf_session *session, + union perf_event *event) { struct perf_evlist *evlist = session->evlist; struct id_index_event *ie = &event->id_index; diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index da40b4b380ca..d96eccd7d27f 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -120,9 +120,8 @@ int perf_session__deliver_synth_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample); -int perf_event__process_id_index(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session); +int perf_event__process_id_index(struct perf_session *session, + union perf_event *event); int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py index 97efbcad076e..63f758c655d5 100644 --- a/tools/perf/util/setup.py +++ b/tools/perf/util/setup.py @@ -5,16 +5,18 @@ from subprocess import Popen, PIPE from re import sub def clang_has_option(option): - return [o for o in Popen(['clang', option], stderr=PIPE).stderr.readlines() if "unknown argument" in o] == [ ] + return [o for o in Popen(['clang', option], stderr=PIPE).stderr.readlines() if b"unknown argument" in o] == [ ] cc = getenv("CC") if cc == "clang": - from _sysconfigdata import build_time_vars - build_time_vars["CFLAGS"] = sub("-specs=[^ ]+", "", build_time_vars["CFLAGS"]) - if not clang_has_option("-mcet"): - build_time_vars["CFLAGS"] = sub("-mcet", "", build_time_vars["CFLAGS"]) - if not clang_has_option("-fcf-protection"): - build_time_vars["CFLAGS"] = sub("-fcf-protection", "", build_time_vars["CFLAGS"]) + from distutils.sysconfig import get_config_vars + vars = get_config_vars() + for var in ('CFLAGS', 'OPT'): + vars[var] = sub("-specs=[^ ]+", "", vars[var]) + if not clang_has_option("-mcet"): + vars[var] = sub("-mcet", "", vars[var]) + if not clang_has_option("-fcf-protection"): + vars[var] = sub("-fcf-protection", "", vars[var]) from distutils.core import setup, Extension @@ -35,7 +37,7 @@ class install_lib(_install_lib): cflags = getenv('CFLAGS', '').split() # switch off several checks (need to be at the end of cflags list) -cflags += ['-fno-strict-aliasing', '-Wno-write-strings', '-Wno-unused-parameter' ] +cflags += ['-fno-strict-aliasing', '-Wno-write-strings', '-Wno-unused-parameter', '-Wno-redundant-decls' ] if cc != "clang": cflags += ['-Wno-cast-function-type' ] diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index b284276ec963..f96c005b3c41 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1884,7 +1884,7 @@ static int __sort_dimension__add_hpp_output(struct sort_dimension *sd, struct hpp_dynamic_entry { struct perf_hpp_fmt hpp; struct perf_evsel *evsel; - struct format_field *field; + struct tep_format_field *field; unsigned dynamic_len; bool raw_trace; }; @@ -1899,7 +1899,7 @@ static int hde_width(struct hpp_dynamic_entry *hde) if (namelen > len) len = namelen; - if (!(hde->field->flags & FIELD_IS_STRING)) { + if (!(hde->field->flags & TEP_FIELD_IS_STRING)) { /* length for print hex numbers */ fieldlen = hde->field->size * 2 + 2; } @@ -1915,7 +1915,7 @@ static void update_dynamic_len(struct hpp_dynamic_entry *hde, struct hist_entry *he) { char *str, *pos; - struct format_field *field = hde->field; + struct tep_format_field *field = hde->field; size_t namelen; bool last = false; @@ -2000,7 +2000,7 @@ static int __sort__hde_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, struct hpp_dynamic_entry *hde; size_t len = fmt->user_len; char *str, *pos; - struct format_field *field; + struct tep_format_field *field; size_t namelen; bool last = false; int ret; @@ -2060,7 +2060,7 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt, struct hist_entry *a, struct hist_entry *b) { struct hpp_dynamic_entry *hde; - struct format_field *field; + struct tep_format_field *field; unsigned offset, size; hde = container_of(fmt, struct hpp_dynamic_entry, hpp); @@ -2071,7 +2071,7 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt, } field = hde->field; - if (field->flags & FIELD_IS_DYNAMIC) { + if (field->flags & TEP_FIELD_IS_DYNAMIC) { unsigned long long dyn; tep_read_number_field(field, a->raw_data, &dyn); @@ -2117,7 +2117,7 @@ static void hde_free(struct perf_hpp_fmt *fmt) } static struct hpp_dynamic_entry * -__alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field, +__alloc_dynamic_entry(struct perf_evsel *evsel, struct tep_format_field *field, int level) { struct hpp_dynamic_entry *hde; @@ -2252,7 +2252,7 @@ static struct perf_evsel *find_evsel(struct perf_evlist *evlist, char *event_nam } static int __dynamic_dimension__add(struct perf_evsel *evsel, - struct format_field *field, + struct tep_format_field *field, bool raw_trace, int level) { struct hpp_dynamic_entry *hde; @@ -2270,7 +2270,7 @@ static int __dynamic_dimension__add(struct perf_evsel *evsel, static int add_evsel_fields(struct perf_evsel *evsel, bool raw_trace, int level) { int ret; - struct format_field *field; + struct tep_format_field *field; field = evsel->tp_format->format.fields; while (field) { @@ -2305,7 +2305,7 @@ static int add_all_matching_fields(struct perf_evlist *evlist, { int ret = -ESRCH; struct perf_evsel *evsel; - struct format_field *field; + struct tep_format_field *field; evlist__for_each_entry(evlist, evsel) { if (evsel->attr.type != PERF_TYPE_TRACEPOINT) @@ -2327,7 +2327,7 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok, { char *str, *event_name, *field_name, *opt_name; struct perf_evsel *evsel; - struct format_field *field; + struct tep_format_field *field; bool raw_trace = symbol_conf.raw_trace; int ret = 0; diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index 09d6746e6ec8..e767c4a9d4d2 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -85,6 +85,9 @@ static struct symbol *new_inline_sym(struct dso *dso, struct symbol *inline_sym; char *demangled = NULL; + if (!funcname) + funcname = "??"; + if (dso) { demangled = dso__demangle_sym(dso, 0, funcname); if (demangled) diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c new file mode 100644 index 000000000000..e7b4c44ebb62 --- /dev/null +++ b/tools/perf/util/stat-display.c @@ -0,0 +1,1166 @@ +#include +#include +#include +#include +#include "evlist.h" +#include "evsel.h" +#include "stat.h" +#include "top.h" +#include "thread_map.h" +#include "cpumap.h" +#include "string2.h" +#include "sane_ctype.h" +#include "cgroup.h" +#include +#include + +#define CNTR_NOT_SUPPORTED "" +#define CNTR_NOT_COUNTED "" + +static bool is_duration_time(struct perf_evsel *evsel) +{ + return !strcmp(evsel->name, "duration_time"); +} + +static void print_running(struct perf_stat_config *config, + u64 run, u64 ena) +{ + if (config->csv_output) { + fprintf(config->output, "%s%" PRIu64 "%s%.2f", + config->csv_sep, + run, + config->csv_sep, + ena ? 100.0 * run / ena : 100.0); + } else if (run != ena) { + fprintf(config->output, " (%.2f%%)", 100.0 * run / ena); + } +} + +static void print_noise_pct(struct perf_stat_config *config, + double total, double avg) +{ + double pct = rel_stddev_stats(total, avg); + + if (config->csv_output) + fprintf(config->output, "%s%.2f%%", config->csv_sep, pct); + else if (pct) + fprintf(config->output, " ( +-%6.2f%% )", pct); +} + +static void print_noise(struct perf_stat_config *config, + struct perf_evsel *evsel, double avg) +{ + struct perf_stat_evsel *ps; + + if (config->run_count == 1) + return; + + ps = evsel->stats; + print_noise_pct(config, stddev_stats(&ps->res_stats[0]), avg); +} + +static void aggr_printout(struct perf_stat_config *config, + struct perf_evsel *evsel, int id, int nr) +{ + switch (config->aggr_mode) { + case AGGR_CORE: + fprintf(config->output, "S%d-C%*d%s%*d%s", + cpu_map__id_to_socket(id), + config->csv_output ? 0 : -8, + cpu_map__id_to_cpu(id), + config->csv_sep, + config->csv_output ? 0 : 4, + nr, + config->csv_sep); + break; + case AGGR_SOCKET: + fprintf(config->output, "S%*d%s%*d%s", + config->csv_output ? 0 : -5, + id, + config->csv_sep, + config->csv_output ? 0 : 4, + nr, + config->csv_sep); + break; + case AGGR_NONE: + fprintf(config->output, "CPU%*d%s", + config->csv_output ? 0 : -4, + perf_evsel__cpus(evsel)->map[id], config->csv_sep); + break; + case AGGR_THREAD: + fprintf(config->output, "%*s-%*d%s", + config->csv_output ? 0 : 16, + thread_map__comm(evsel->threads, id), + config->csv_output ? 0 : -8, + thread_map__pid(evsel->threads, id), + config->csv_sep); + break; + case AGGR_GLOBAL: + case AGGR_UNSET: + default: + break; + } +} + +struct outstate { + FILE *fh; + bool newline; + const char *prefix; + int nfields; + int id, nr; + struct perf_evsel *evsel; +}; + +#define METRIC_LEN 35 + +static void new_line_std(struct perf_stat_config *config __maybe_unused, + void *ctx) +{ + struct outstate *os = ctx; + + os->newline = true; +} + +static void do_new_line_std(struct perf_stat_config *config, + struct outstate *os) +{ + fputc('\n', os->fh); + fputs(os->prefix, os->fh); + aggr_printout(config, os->evsel, os->id, os->nr); + if (config->aggr_mode == AGGR_NONE) + fprintf(os->fh, " "); + fprintf(os->fh, " "); +} + +static void print_metric_std(struct perf_stat_config *config, + void *ctx, const char *color, const char *fmt, + const char *unit, double val) +{ + struct outstate *os = ctx; + FILE *out = os->fh; + int n; + bool newline = os->newline; + + os->newline = false; + + if (unit == NULL || fmt == NULL) { + fprintf(out, "%-*s", METRIC_LEN, ""); + return; + } + + if (newline) + do_new_line_std(config, os); + + n = fprintf(out, " # "); + if (color) + n += color_fprintf(out, color, fmt, val); + else + n += fprintf(out, fmt, val); + fprintf(out, " %-*s", METRIC_LEN - n - 1, unit); +} + +static void new_line_csv(struct perf_stat_config *config, void *ctx) +{ + struct outstate *os = ctx; + int i; + + fputc('\n', os->fh); + if (os->prefix) + fprintf(os->fh, "%s%s", os->prefix, config->csv_sep); + aggr_printout(config, os->evsel, os->id, os->nr); + for (i = 0; i < os->nfields; i++) + fputs(config->csv_sep, os->fh); +} + +static void print_metric_csv(struct perf_stat_config *config __maybe_unused, + void *ctx, + const char *color __maybe_unused, + const char *fmt, const char *unit, double val) +{ + struct outstate *os = ctx; + FILE *out = os->fh; + char buf[64], *vals, *ends; + + if (unit == NULL || fmt == NULL) { + fprintf(out, "%s%s", config->csv_sep, config->csv_sep); + return; + } + snprintf(buf, sizeof(buf), fmt, val); + ends = vals = ltrim(buf); + while (isdigit(*ends) || *ends == '.') + ends++; + *ends = 0; + while (isspace(*unit)) + unit++; + fprintf(out, "%s%s%s%s", config->csv_sep, vals, config->csv_sep, unit); +} + +/* Filter out some columns that don't work well in metrics only mode */ + +static bool valid_only_metric(const char *unit) +{ + if (!unit) + return false; + if (strstr(unit, "/sec") || + strstr(unit, "hz") || + strstr(unit, "Hz") || + strstr(unit, "CPUs utilized")) + return false; + return true; +} + +static const char *fixunit(char *buf, struct perf_evsel *evsel, + const char *unit) +{ + if (!strncmp(unit, "of all", 6)) { + snprintf(buf, 1024, "%s %s", perf_evsel__name(evsel), + unit); + return buf; + } + return unit; +} + +static void print_metric_only(struct perf_stat_config *config, + void *ctx, const char *color, const char *fmt, + const char *unit, double val) +{ + struct outstate *os = ctx; + FILE *out = os->fh; + char buf[1024], str[1024]; + unsigned mlen = config->metric_only_len; + + if (!valid_only_metric(unit)) + return; + unit = fixunit(buf, os->evsel, unit); + if (mlen < strlen(unit)) + mlen = strlen(unit) + 1; + + if (color) + mlen += strlen(color) + sizeof(PERF_COLOR_RESET) - 1; + + color_snprintf(str, sizeof(str), color ?: "", fmt, val); + fprintf(out, "%*s ", mlen, str); +} + +static void print_metric_only_csv(struct perf_stat_config *config __maybe_unused, + void *ctx, const char *color __maybe_unused, + const char *fmt, + const char *unit, double val) +{ + struct outstate *os = ctx; + FILE *out = os->fh; + char buf[64], *vals, *ends; + char tbuf[1024]; + + if (!valid_only_metric(unit)) + return; + unit = fixunit(tbuf, os->evsel, unit); + snprintf(buf, sizeof buf, fmt, val); + ends = vals = ltrim(buf); + while (isdigit(*ends) || *ends == '.') + ends++; + *ends = 0; + fprintf(out, "%s%s", vals, config->csv_sep); +} + +static void new_line_metric(struct perf_stat_config *config __maybe_unused, + void *ctx __maybe_unused) +{ +} + +static void print_metric_header(struct perf_stat_config *config, + void *ctx, const char *color __maybe_unused, + const char *fmt __maybe_unused, + const char *unit, double val __maybe_unused) +{ + struct outstate *os = ctx; + char tbuf[1024]; + + if (!valid_only_metric(unit)) + return; + unit = fixunit(tbuf, os->evsel, unit); + if (config->csv_output) + fprintf(os->fh, "%s%s", unit, config->csv_sep); + else + fprintf(os->fh, "%*s ", config->metric_only_len, unit); +} + +static int first_shadow_cpu(struct perf_stat_config *config, + struct perf_evsel *evsel, int id) +{ + struct perf_evlist *evlist = evsel->evlist; + int i; + + if (!config->aggr_get_id) + return 0; + + if (config->aggr_mode == AGGR_NONE) + return id; + + if (config->aggr_mode == AGGR_GLOBAL) + return 0; + + for (i = 0; i < perf_evsel__nr_cpus(evsel); i++) { + int cpu2 = perf_evsel__cpus(evsel)->map[i]; + + if (config->aggr_get_id(config, evlist->cpus, cpu2) == id) + return cpu2; + } + return 0; +} + +static void abs_printout(struct perf_stat_config *config, + int id, int nr, struct perf_evsel *evsel, double avg) +{ + FILE *output = config->output; + double sc = evsel->scale; + const char *fmt; + + if (config->csv_output) { + fmt = floor(sc) != sc ? "%.2f%s" : "%.0f%s"; + } else { + if (config->big_num) + fmt = floor(sc) != sc ? "%'18.2f%s" : "%'18.0f%s"; + else + fmt = floor(sc) != sc ? "%18.2f%s" : "%18.0f%s"; + } + + aggr_printout(config, evsel, id, nr); + + fprintf(output, fmt, avg, config->csv_sep); + + if (evsel->unit) + fprintf(output, "%-*s%s", + config->csv_output ? 0 : config->unit_width, + evsel->unit, config->csv_sep); + + fprintf(output, "%-*s", config->csv_output ? 0 : 25, perf_evsel__name(evsel)); + + if (evsel->cgrp) + fprintf(output, "%s%s", config->csv_sep, evsel->cgrp->name); +} + +static bool is_mixed_hw_group(struct perf_evsel *counter) +{ + struct perf_evlist *evlist = counter->evlist; + u32 pmu_type = counter->attr.type; + struct perf_evsel *pos; + + if (counter->nr_members < 2) + return false; + + evlist__for_each_entry(evlist, pos) { + /* software events can be part of any hardware group */ + if (pos->attr.type == PERF_TYPE_SOFTWARE) + continue; + if (pmu_type == PERF_TYPE_SOFTWARE) { + pmu_type = pos->attr.type; + continue; + } + if (pmu_type != pos->attr.type) + return true; + } + + return false; +} + +static void printout(struct perf_stat_config *config, int id, int nr, + struct perf_evsel *counter, double uval, + char *prefix, u64 run, u64 ena, double noise, + struct runtime_stat *st) +{ + struct perf_stat_output_ctx out; + struct outstate os = { + .fh = config->output, + .prefix = prefix ? prefix : "", + .id = id, + .nr = nr, + .evsel = counter, + }; + print_metric_t pm = print_metric_std; + new_line_t nl; + + if (config->metric_only) { + nl = new_line_metric; + if (config->csv_output) + pm = print_metric_only_csv; + else + pm = print_metric_only; + } else + nl = new_line_std; + + if (config->csv_output && !config->metric_only) { + static int aggr_fields[] = { + [AGGR_GLOBAL] = 0, + [AGGR_THREAD] = 1, + [AGGR_NONE] = 1, + [AGGR_SOCKET] = 2, + [AGGR_CORE] = 2, + }; + + pm = print_metric_csv; + nl = new_line_csv; + os.nfields = 3; + os.nfields += aggr_fields[config->aggr_mode]; + if (counter->cgrp) + os.nfields++; + } + if (run == 0 || ena == 0 || counter->counts->scaled == -1) { + if (config->metric_only) { + pm(config, &os, NULL, "", "", 0); + return; + } + aggr_printout(config, counter, id, nr); + + fprintf(config->output, "%*s%s", + config->csv_output ? 0 : 18, + counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, + config->csv_sep); + + if (counter->supported) { + config->print_free_counters_hint = 1; + if (is_mixed_hw_group(counter)) + config->print_mixed_hw_group_error = 1; + } + + fprintf(config->output, "%-*s%s", + config->csv_output ? 0 : config->unit_width, + counter->unit, config->csv_sep); + + fprintf(config->output, "%*s", + config->csv_output ? 0 : -25, + perf_evsel__name(counter)); + + if (counter->cgrp) + fprintf(config->output, "%s%s", + config->csv_sep, counter->cgrp->name); + + if (!config->csv_output) + pm(config, &os, NULL, NULL, "", 0); + print_noise(config, counter, noise); + print_running(config, run, ena); + if (config->csv_output) + pm(config, &os, NULL, NULL, "", 0); + return; + } + + if (!config->metric_only) + abs_printout(config, id, nr, counter, uval); + + out.print_metric = pm; + out.new_line = nl; + out.ctx = &os; + out.force_header = false; + + if (config->csv_output && !config->metric_only) { + print_noise(config, counter, noise); + print_running(config, run, ena); + } + + perf_stat__print_shadow_stats(config, counter, uval, + first_shadow_cpu(config, counter, id), + &out, &config->metric_events, st); + if (!config->csv_output && !config->metric_only) { + print_noise(config, counter, noise); + print_running(config, run, ena); + } +} + +static void aggr_update_shadow(struct perf_stat_config *config, + struct perf_evlist *evlist) +{ + int cpu, s2, id, s; + u64 val; + struct perf_evsel *counter; + + for (s = 0; s < config->aggr_map->nr; s++) { + id = config->aggr_map->map[s]; + evlist__for_each_entry(evlist, counter) { + val = 0; + for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { + s2 = config->aggr_get_id(config, evlist->cpus, cpu); + if (s2 != id) + continue; + val += perf_counts(counter->counts, cpu, 0)->val; + } + perf_stat__update_shadow_stats(counter, val, + first_shadow_cpu(config, counter, id), + &rt_stat); + } + } +} + +static void uniquify_event_name(struct perf_evsel *counter) +{ + char *new_name; + char *config; + + if (counter->uniquified_name || + !counter->pmu_name || !strncmp(counter->name, counter->pmu_name, + strlen(counter->pmu_name))) + return; + + config = strchr(counter->name, '/'); + if (config) { + if (asprintf(&new_name, + "%s%s", counter->pmu_name, config) > 0) { + free(counter->name); + counter->name = new_name; + } + } else { + if (asprintf(&new_name, + "%s [%s]", counter->name, counter->pmu_name) > 0) { + free(counter->name); + counter->name = new_name; + } + } + + counter->uniquified_name = true; +} + +static void collect_all_aliases(struct perf_stat_config *config, struct perf_evsel *counter, + void (*cb)(struct perf_stat_config *config, struct perf_evsel *counter, void *data, + bool first), + void *data) +{ + struct perf_evlist *evlist = counter->evlist; + struct perf_evsel *alias; + + alias = list_prepare_entry(counter, &(evlist->entries), node); + list_for_each_entry_continue (alias, &evlist->entries, node) { + if (strcmp(perf_evsel__name(alias), perf_evsel__name(counter)) || + alias->scale != counter->scale || + alias->cgrp != counter->cgrp || + strcmp(alias->unit, counter->unit) || + perf_evsel__is_clock(alias) != perf_evsel__is_clock(counter)) + break; + alias->merged_stat = true; + cb(config, alias, data, false); + } +} + +static bool collect_data(struct perf_stat_config *config, struct perf_evsel *counter, + void (*cb)(struct perf_stat_config *config, struct perf_evsel *counter, void *data, + bool first), + void *data) +{ + if (counter->merged_stat) + return false; + cb(config, counter, data, true); + if (config->no_merge) + uniquify_event_name(counter); + else if (counter->auto_merge_stats) + collect_all_aliases(config, counter, cb, data); + return true; +} + +struct aggr_data { + u64 ena, run, val; + int id; + int nr; + int cpu; +}; + +static void aggr_cb(struct perf_stat_config *config, + struct perf_evsel *counter, void *data, bool first) +{ + struct aggr_data *ad = data; + int cpu, s2; + + for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { + struct perf_counts_values *counts; + + s2 = config->aggr_get_id(config, perf_evsel__cpus(counter), cpu); + if (s2 != ad->id) + continue; + if (first) + ad->nr++; + counts = perf_counts(counter->counts, cpu, 0); + /* + * When any result is bad, make them all to give + * consistent output in interval mode. + */ + if (counts->ena == 0 || counts->run == 0 || + counter->counts->scaled == -1) { + ad->ena = 0; + ad->run = 0; + break; + } + ad->val += counts->val; + ad->ena += counts->ena; + ad->run += counts->run; + } +} + +static void print_aggr(struct perf_stat_config *config, + struct perf_evlist *evlist, + char *prefix) +{ + bool metric_only = config->metric_only; + FILE *output = config->output; + struct perf_evsel *counter; + int s, id, nr; + double uval; + u64 ena, run, val; + bool first; + + if (!(config->aggr_map || config->aggr_get_id)) + return; + + aggr_update_shadow(config, evlist); + + /* + * With metric_only everything is on a single line. + * Without each counter has its own line. + */ + for (s = 0; s < config->aggr_map->nr; s++) { + struct aggr_data ad; + if (prefix && metric_only) + fprintf(output, "%s", prefix); + + ad.id = id = config->aggr_map->map[s]; + first = true; + evlist__for_each_entry(evlist, counter) { + if (is_duration_time(counter)) + continue; + + ad.val = ad.ena = ad.run = 0; + ad.nr = 0; + if (!collect_data(config, counter, aggr_cb, &ad)) + continue; + nr = ad.nr; + ena = ad.ena; + run = ad.run; + val = ad.val; + if (first && metric_only) { + first = false; + aggr_printout(config, counter, id, nr); + } + if (prefix && !metric_only) + fprintf(output, "%s", prefix); + + uval = val * counter->scale; + printout(config, id, nr, counter, uval, prefix, + run, ena, 1.0, &rt_stat); + if (!metric_only) + fputc('\n', output); + } + if (metric_only) + fputc('\n', output); + } +} + +static int cmp_val(const void *a, const void *b) +{ + return ((struct perf_aggr_thread_value *)b)->val - + ((struct perf_aggr_thread_value *)a)->val; +} + +static struct perf_aggr_thread_value *sort_aggr_thread( + struct perf_evsel *counter, + int nthreads, int ncpus, + int *ret, + struct target *_target) +{ + int cpu, thread, i = 0; + double uval; + struct perf_aggr_thread_value *buf; + + buf = calloc(nthreads, sizeof(struct perf_aggr_thread_value)); + if (!buf) + return NULL; + + for (thread = 0; thread < nthreads; thread++) { + u64 ena = 0, run = 0, val = 0; + + for (cpu = 0; cpu < ncpus; cpu++) { + val += perf_counts(counter->counts, cpu, thread)->val; + ena += perf_counts(counter->counts, cpu, thread)->ena; + run += perf_counts(counter->counts, cpu, thread)->run; + } + + uval = val * counter->scale; + + /* + * Skip value 0 when enabling --per-thread globally, + * otherwise too many 0 output. + */ + if (uval == 0.0 && target__has_per_thread(_target)) + continue; + + buf[i].counter = counter; + buf[i].id = thread; + buf[i].uval = uval; + buf[i].val = val; + buf[i].run = run; + buf[i].ena = ena; + i++; + } + + qsort(buf, i, sizeof(struct perf_aggr_thread_value), cmp_val); + + if (ret) + *ret = i; + + return buf; +} + +static void print_aggr_thread(struct perf_stat_config *config, + struct target *_target, + struct perf_evsel *counter, char *prefix) +{ + FILE *output = config->output; + int nthreads = thread_map__nr(counter->threads); + int ncpus = cpu_map__nr(counter->cpus); + int thread, sorted_threads, id; + struct perf_aggr_thread_value *buf; + + buf = sort_aggr_thread(counter, nthreads, ncpus, &sorted_threads, _target); + if (!buf) { + perror("cannot sort aggr thread"); + return; + } + + for (thread = 0; thread < sorted_threads; thread++) { + if (prefix) + fprintf(output, "%s", prefix); + + id = buf[thread].id; + if (config->stats) + printout(config, id, 0, buf[thread].counter, buf[thread].uval, + prefix, buf[thread].run, buf[thread].ena, 1.0, + &config->stats[id]); + else + printout(config, id, 0, buf[thread].counter, buf[thread].uval, + prefix, buf[thread].run, buf[thread].ena, 1.0, + &rt_stat); + fputc('\n', output); + } + + free(buf); +} + +struct caggr_data { + double avg, avg_enabled, avg_running; +}; + +static void counter_aggr_cb(struct perf_stat_config *config __maybe_unused, + struct perf_evsel *counter, void *data, + bool first __maybe_unused) +{ + struct caggr_data *cd = data; + struct perf_stat_evsel *ps = counter->stats; + + cd->avg += avg_stats(&ps->res_stats[0]); + cd->avg_enabled += avg_stats(&ps->res_stats[1]); + cd->avg_running += avg_stats(&ps->res_stats[2]); +} + +/* + * Print out the results of a single counter: + * aggregated counts in system-wide mode + */ +static void print_counter_aggr(struct perf_stat_config *config, + struct perf_evsel *counter, char *prefix) +{ + bool metric_only = config->metric_only; + FILE *output = config->output; + double uval; + struct caggr_data cd = { .avg = 0.0 }; + + if (!collect_data(config, counter, counter_aggr_cb, &cd)) + return; + + if (prefix && !metric_only) + fprintf(output, "%s", prefix); + + uval = cd.avg * counter->scale; + printout(config, -1, 0, counter, uval, prefix, cd.avg_running, cd.avg_enabled, + cd.avg, &rt_stat); + if (!metric_only) + fprintf(output, "\n"); +} + +static void counter_cb(struct perf_stat_config *config __maybe_unused, + struct perf_evsel *counter, void *data, + bool first __maybe_unused) +{ + struct aggr_data *ad = data; + + ad->val += perf_counts(counter->counts, ad->cpu, 0)->val; + ad->ena += perf_counts(counter->counts, ad->cpu, 0)->ena; + ad->run += perf_counts(counter->counts, ad->cpu, 0)->run; +} + +/* + * Print out the results of a single counter: + * does not use aggregated count in system-wide + */ +static void print_counter(struct perf_stat_config *config, + struct perf_evsel *counter, char *prefix) +{ + FILE *output = config->output; + u64 ena, run, val; + double uval; + int cpu; + + for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { + struct aggr_data ad = { .cpu = cpu }; + + if (!collect_data(config, counter, counter_cb, &ad)) + return; + val = ad.val; + ena = ad.ena; + run = ad.run; + + if (prefix) + fprintf(output, "%s", prefix); + + uval = val * counter->scale; + printout(config, cpu, 0, counter, uval, prefix, run, ena, 1.0, + &rt_stat); + + fputc('\n', output); + } +} + +static void print_no_aggr_metric(struct perf_stat_config *config, + struct perf_evlist *evlist, + char *prefix) +{ + int cpu; + int nrcpus = 0; + struct perf_evsel *counter; + u64 ena, run, val; + double uval; + + nrcpus = evlist->cpus->nr; + for (cpu = 0; cpu < nrcpus; cpu++) { + bool first = true; + + if (prefix) + fputs(prefix, config->output); + evlist__for_each_entry(evlist, counter) { + if (is_duration_time(counter)) + continue; + if (first) { + aggr_printout(config, counter, cpu, 0); + first = false; + } + val = perf_counts(counter->counts, cpu, 0)->val; + ena = perf_counts(counter->counts, cpu, 0)->ena; + run = perf_counts(counter->counts, cpu, 0)->run; + + uval = val * counter->scale; + printout(config, cpu, 0, counter, uval, prefix, run, ena, 1.0, + &rt_stat); + } + fputc('\n', config->output); + } +} + +static int aggr_header_lens[] = { + [AGGR_CORE] = 18, + [AGGR_SOCKET] = 12, + [AGGR_NONE] = 6, + [AGGR_THREAD] = 24, + [AGGR_GLOBAL] = 0, +}; + +static const char *aggr_header_csv[] = { + [AGGR_CORE] = "core,cpus,", + [AGGR_SOCKET] = "socket,cpus", + [AGGR_NONE] = "cpu,", + [AGGR_THREAD] = "comm-pid,", + [AGGR_GLOBAL] = "" +}; + +static void print_metric_headers(struct perf_stat_config *config, + struct perf_evlist *evlist, + const char *prefix, bool no_indent) +{ + struct perf_stat_output_ctx out; + struct perf_evsel *counter; + struct outstate os = { + .fh = config->output + }; + + if (prefix) + fprintf(config->output, "%s", prefix); + + if (!config->csv_output && !no_indent) + fprintf(config->output, "%*s", + aggr_header_lens[config->aggr_mode], ""); + if (config->csv_output) { + if (config->interval) + fputs("time,", config->output); + fputs(aggr_header_csv[config->aggr_mode], config->output); + } + + /* Print metrics headers only */ + evlist__for_each_entry(evlist, counter) { + if (is_duration_time(counter)) + continue; + os.evsel = counter; + out.ctx = &os; + out.print_metric = print_metric_header; + out.new_line = new_line_metric; + out.force_header = true; + os.evsel = counter; + perf_stat__print_shadow_stats(config, counter, 0, + 0, + &out, + &config->metric_events, + &rt_stat); + } + fputc('\n', config->output); +} + +static void print_interval(struct perf_stat_config *config, + struct perf_evlist *evlist, + char *prefix, struct timespec *ts) +{ + bool metric_only = config->metric_only; + unsigned int unit_width = config->unit_width; + FILE *output = config->output; + static int num_print_interval; + + if (config->interval_clear) + puts(CONSOLE_CLEAR); + + sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, config->csv_sep); + + if ((num_print_interval == 0 && !config->csv_output) || config->interval_clear) { + switch (config->aggr_mode) { + case AGGR_SOCKET: + fprintf(output, "# time socket cpus"); + if (!metric_only) + fprintf(output, " counts %*s events\n", unit_width, "unit"); + break; + case AGGR_CORE: + fprintf(output, "# time core cpus"); + if (!metric_only) + fprintf(output, " counts %*s events\n", unit_width, "unit"); + break; + case AGGR_NONE: + fprintf(output, "# time CPU "); + if (!metric_only) + fprintf(output, " counts %*s events\n", unit_width, "unit"); + break; + case AGGR_THREAD: + fprintf(output, "# time comm-pid"); + if (!metric_only) + fprintf(output, " counts %*s events\n", unit_width, "unit"); + break; + case AGGR_GLOBAL: + default: + fprintf(output, "# time"); + if (!metric_only) + fprintf(output, " counts %*s events\n", unit_width, "unit"); + case AGGR_UNSET: + break; + } + } + + if ((num_print_interval == 0 || config->interval_clear) && metric_only) + print_metric_headers(config, evlist, " ", true); + if (++num_print_interval == 25) + num_print_interval = 0; +} + +static void print_header(struct perf_stat_config *config, + struct target *_target, + int argc, const char **argv) +{ + FILE *output = config->output; + int i; + + fflush(stdout); + + if (!config->csv_output) { + fprintf(output, "\n"); + fprintf(output, " Performance counter stats for "); + if (_target->system_wide) + fprintf(output, "\'system wide"); + else if (_target->cpu_list) + fprintf(output, "\'CPU(s) %s", _target->cpu_list); + else if (!target__has_task(_target)) { + fprintf(output, "\'%s", argv ? argv[0] : "pipe"); + for (i = 1; argv && (i < argc); i++) + fprintf(output, " %s", argv[i]); + } else if (_target->pid) + fprintf(output, "process id \'%s", _target->pid); + else + fprintf(output, "thread id \'%s", _target->tid); + + fprintf(output, "\'"); + if (config->run_count > 1) + fprintf(output, " (%d runs)", config->run_count); + fprintf(output, ":\n\n"); + } +} + +static int get_precision(double num) +{ + if (num > 1) + return 0; + + return lround(ceil(-log10(num))); +} + +static void print_table(struct perf_stat_config *config, + FILE *output, int precision, double avg) +{ + char tmp[64]; + int idx, indent = 0; + + scnprintf(tmp, 64, " %17.*f", precision, avg); + while (tmp[indent] == ' ') + indent++; + + fprintf(output, "%*s# Table of individual measurements:\n", indent, ""); + + for (idx = 0; idx < config->run_count; idx++) { + double run = (double) config->walltime_run[idx] / NSEC_PER_SEC; + int h, n = 1 + abs((int) (100.0 * (run - avg)/run) / 5); + + fprintf(output, " %17.*f (%+.*f) ", + precision, run, precision, run - avg); + + for (h = 0; h < n; h++) + fprintf(output, "#"); + + fprintf(output, "\n"); + } + + fprintf(output, "\n%*s# Final result:\n", indent, ""); +} + +static double timeval2double(struct timeval *t) +{ + return t->tv_sec + (double) t->tv_usec/USEC_PER_SEC; +} + +static void print_footer(struct perf_stat_config *config) +{ + double avg = avg_stats(config->walltime_nsecs_stats) / NSEC_PER_SEC; + FILE *output = config->output; + int n; + + if (!config->null_run) + fprintf(output, "\n"); + + if (config->run_count == 1) { + fprintf(output, " %17.9f seconds time elapsed", avg); + + if (config->ru_display) { + double ru_utime = timeval2double(&config->ru_data.ru_utime); + double ru_stime = timeval2double(&config->ru_data.ru_stime); + + fprintf(output, "\n\n"); + fprintf(output, " %17.9f seconds user\n", ru_utime); + fprintf(output, " %17.9f seconds sys\n", ru_stime); + } + } else { + double sd = stddev_stats(config->walltime_nsecs_stats) / NSEC_PER_SEC; + /* + * Display at most 2 more significant + * digits than the stddev inaccuracy. + */ + int precision = get_precision(sd) + 2; + + if (config->walltime_run_table) + print_table(config, output, precision, avg); + + fprintf(output, " %17.*f +- %.*f seconds time elapsed", + precision, avg, precision, sd); + + print_noise_pct(config, sd, avg); + } + fprintf(output, "\n\n"); + + if (config->print_free_counters_hint && + sysctl__read_int("kernel/nmi_watchdog", &n) >= 0 && + n > 0) + fprintf(output, +"Some events weren't counted. Try disabling the NMI watchdog:\n" +" echo 0 > /proc/sys/kernel/nmi_watchdog\n" +" perf stat ...\n" +" echo 1 > /proc/sys/kernel/nmi_watchdog\n"); + + if (config->print_mixed_hw_group_error) + fprintf(output, + "The events in group usually have to be from " + "the same PMU. Try reorganizing the group.\n"); +} + +void +perf_evlist__print_counters(struct perf_evlist *evlist, + struct perf_stat_config *config, + struct target *_target, + struct timespec *ts, + int argc, const char **argv) +{ + bool metric_only = config->metric_only; + int interval = config->interval; + struct perf_evsel *counter; + char buf[64], *prefix = NULL; + + if (interval) + print_interval(config, evlist, prefix = buf, ts); + else + print_header(config, _target, argc, argv); + + if (metric_only) { + static int num_print_iv; + + if (num_print_iv == 0 && !interval) + print_metric_headers(config, evlist, prefix, false); + if (num_print_iv++ == 25) + num_print_iv = 0; + if (config->aggr_mode == AGGR_GLOBAL && prefix) + fprintf(config->output, "%s", prefix); + } + + switch (config->aggr_mode) { + case AGGR_CORE: + case AGGR_SOCKET: + print_aggr(config, evlist, prefix); + break; + case AGGR_THREAD: + evlist__for_each_entry(evlist, counter) { + if (is_duration_time(counter)) + continue; + print_aggr_thread(config, _target, counter, prefix); + } + break; + case AGGR_GLOBAL: + evlist__for_each_entry(evlist, counter) { + if (is_duration_time(counter)) + continue; + print_counter_aggr(config, counter, prefix); + } + if (metric_only) + fputc('\n', config->output); + break; + case AGGR_NONE: + if (metric_only) + print_no_aggr_metric(config, evlist, prefix); + else { + evlist__for_each_entry(evlist, counter) { + if (is_duration_time(counter)) + continue; + print_counter(config, counter, prefix); + } + } + break; + case AGGR_UNSET: + default: + break; + } + + if (!interval && !config->csv_output) + print_footer(config); + + fflush(config->output); +} diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 99990f5f2512..8ad32763cfff 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -410,7 +410,8 @@ static double runtime_stat_n(struct runtime_stat *st, return v->stats.n; } -static void print_stalled_cycles_frontend(int cpu, +static void print_stalled_cycles_frontend(struct perf_stat_config *config, + int cpu, struct perf_evsel *evsel, double avg, struct perf_stat_output_ctx *out, struct runtime_stat *st) @@ -427,13 +428,14 @@ static void print_stalled_cycles_frontend(int cpu, color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio); if (ratio) - out->print_metric(out->ctx, color, "%7.2f%%", "frontend cycles idle", + out->print_metric(config, out->ctx, color, "%7.2f%%", "frontend cycles idle", ratio); else - out->print_metric(out->ctx, NULL, NULL, "frontend cycles idle", 0); + out->print_metric(config, out->ctx, NULL, NULL, "frontend cycles idle", 0); } -static void print_stalled_cycles_backend(int cpu, +static void print_stalled_cycles_backend(struct perf_stat_config *config, + int cpu, struct perf_evsel *evsel, double avg, struct perf_stat_output_ctx *out, struct runtime_stat *st) @@ -449,10 +451,11 @@ static void print_stalled_cycles_backend(int cpu, color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio); - out->print_metric(out->ctx, color, "%7.2f%%", "backend cycles idle", ratio); + out->print_metric(config, out->ctx, color, "%7.2f%%", "backend cycles idle", ratio); } -static void print_branch_misses(int cpu, +static void print_branch_misses(struct perf_stat_config *config, + int cpu, struct perf_evsel *evsel, double avg, struct perf_stat_output_ctx *out, @@ -469,10 +472,11 @@ static void print_branch_misses(int cpu, color = get_ratio_color(GRC_CACHE_MISSES, ratio); - out->print_metric(out->ctx, color, "%7.2f%%", "of all branches", ratio); + out->print_metric(config, out->ctx, color, "%7.2f%%", "of all branches", ratio); } -static void print_l1_dcache_misses(int cpu, +static void print_l1_dcache_misses(struct perf_stat_config *config, + int cpu, struct perf_evsel *evsel, double avg, struct perf_stat_output_ctx *out, @@ -490,10 +494,11 @@ static void print_l1_dcache_misses(int cpu, color = get_ratio_color(GRC_CACHE_MISSES, ratio); - out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-dcache hits", ratio); + out->print_metric(config, out->ctx, color, "%7.2f%%", "of all L1-dcache hits", ratio); } -static void print_l1_icache_misses(int cpu, +static void print_l1_icache_misses(struct perf_stat_config *config, + int cpu, struct perf_evsel *evsel, double avg, struct perf_stat_output_ctx *out, @@ -510,10 +515,11 @@ static void print_l1_icache_misses(int cpu, ratio = avg / total * 100.0; color = get_ratio_color(GRC_CACHE_MISSES, ratio); - out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-icache hits", ratio); + out->print_metric(config, out->ctx, color, "%7.2f%%", "of all L1-icache hits", ratio); } -static void print_dtlb_cache_misses(int cpu, +static void print_dtlb_cache_misses(struct perf_stat_config *config, + int cpu, struct perf_evsel *evsel, double avg, struct perf_stat_output_ctx *out, @@ -529,10 +535,11 @@ static void print_dtlb_cache_misses(int cpu, ratio = avg / total * 100.0; color = get_ratio_color(GRC_CACHE_MISSES, ratio); - out->print_metric(out->ctx, color, "%7.2f%%", "of all dTLB cache hits", ratio); + out->print_metric(config, out->ctx, color, "%7.2f%%", "of all dTLB cache hits", ratio); } -static void print_itlb_cache_misses(int cpu, +static void print_itlb_cache_misses(struct perf_stat_config *config, + int cpu, struct perf_evsel *evsel, double avg, struct perf_stat_output_ctx *out, @@ -548,10 +555,11 @@ static void print_itlb_cache_misses(int cpu, ratio = avg / total * 100.0; color = get_ratio_color(GRC_CACHE_MISSES, ratio); - out->print_metric(out->ctx, color, "%7.2f%%", "of all iTLB cache hits", ratio); + out->print_metric(config, out->ctx, color, "%7.2f%%", "of all iTLB cache hits", ratio); } -static void print_ll_cache_misses(int cpu, +static void print_ll_cache_misses(struct perf_stat_config *config, + int cpu, struct perf_evsel *evsel, double avg, struct perf_stat_output_ctx *out, @@ -567,7 +575,7 @@ static void print_ll_cache_misses(int cpu, ratio = avg / total * 100.0; color = get_ratio_color(GRC_CACHE_MISSES, ratio); - out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio); + out->print_metric(config, out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio); } /* @@ -674,7 +682,8 @@ static double td_be_bound(int ctx, int cpu, struct runtime_stat *st) return sanitize_val(1.0 - sum); } -static void print_smi_cost(int cpu, struct perf_evsel *evsel, +static void print_smi_cost(struct perf_stat_config *config, + int cpu, struct perf_evsel *evsel, struct perf_stat_output_ctx *out, struct runtime_stat *st) { @@ -694,11 +703,12 @@ static void print_smi_cost(int cpu, struct perf_evsel *evsel, if (cost > 10) color = PERF_COLOR_RED; - out->print_metric(out->ctx, color, "%8.1f%%", "SMI cycles%", cost); - out->print_metric(out->ctx, NULL, "%4.0f", "SMI#", smi_num); + out->print_metric(config, out->ctx, color, "%8.1f%%", "SMI cycles%", cost); + out->print_metric(config, out->ctx, NULL, "%4.0f", "SMI#", smi_num); } -static void generic_metric(const char *metric_expr, +static void generic_metric(struct perf_stat_config *config, + const char *metric_expr, struct perf_evsel **metric_events, char *name, const char *metric_name, @@ -737,20 +747,21 @@ static void generic_metric(const char *metric_expr, const char *p = metric_expr; if (expr__parse(&ratio, &pctx, &p) == 0) - print_metric(ctxp, NULL, "%8.1f", + print_metric(config, ctxp, NULL, "%8.1f", metric_name ? metric_name : out->force_header ? name : "", ratio); else - print_metric(ctxp, NULL, NULL, + print_metric(config, ctxp, NULL, NULL, out->force_header ? (metric_name ? metric_name : name) : "", 0); } else - print_metric(ctxp, NULL, NULL, "", 0); + print_metric(config, ctxp, NULL, NULL, "", 0); } -void perf_stat__print_shadow_stats(struct perf_evsel *evsel, +void perf_stat__print_shadow_stats(struct perf_stat_config *config, + struct perf_evsel *evsel, double avg, int cpu, struct perf_stat_output_ctx *out, struct rblist *metric_events, @@ -769,10 +780,10 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, if (total) { ratio = avg / total; - print_metric(ctxp, NULL, "%7.2f ", + print_metric(config, ctxp, NULL, "%7.2f ", "insn per cycle", ratio); } else { - print_metric(ctxp, NULL, NULL, "insn per cycle", 0); + print_metric(config, ctxp, NULL, NULL, "insn per cycle", 0); } total = runtime_stat_avg(st, STAT_STALLED_CYCLES_FRONT, @@ -783,20 +794,20 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, ctx, cpu)); if (total && avg) { - out->new_line(ctxp); + out->new_line(config, ctxp); ratio = total / avg; - print_metric(ctxp, NULL, "%7.2f ", + print_metric(config, ctxp, NULL, "%7.2f ", "stalled cycles per insn", ratio); } else if (have_frontend_stalled) { - print_metric(ctxp, NULL, NULL, + print_metric(config, ctxp, NULL, NULL, "stalled cycles per insn", 0); } } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) { if (runtime_stat_n(st, STAT_BRANCHES, ctx, cpu) != 0) - print_branch_misses(cpu, evsel, avg, out, st); + print_branch_misses(config, cpu, evsel, avg, out, st); else - print_metric(ctxp, NULL, NULL, "of all branches", 0); + print_metric(config, ctxp, NULL, NULL, "of all branches", 0); } else if ( evsel->attr.type == PERF_TYPE_HW_CACHE && evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D | @@ -804,9 +815,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { if (runtime_stat_n(st, STAT_L1_DCACHE, ctx, cpu) != 0) - print_l1_dcache_misses(cpu, evsel, avg, out, st); + print_l1_dcache_misses(config, cpu, evsel, avg, out, st); else - print_metric(ctxp, NULL, NULL, "of all L1-dcache hits", 0); + print_metric(config, ctxp, NULL, NULL, "of all L1-dcache hits", 0); } else if ( evsel->attr.type == PERF_TYPE_HW_CACHE && evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I | @@ -814,9 +825,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { if (runtime_stat_n(st, STAT_L1_ICACHE, ctx, cpu) != 0) - print_l1_icache_misses(cpu, evsel, avg, out, st); + print_l1_icache_misses(config, cpu, evsel, avg, out, st); else - print_metric(ctxp, NULL, NULL, "of all L1-icache hits", 0); + print_metric(config, ctxp, NULL, NULL, "of all L1-icache hits", 0); } else if ( evsel->attr.type == PERF_TYPE_HW_CACHE && evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB | @@ -824,9 +835,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { if (runtime_stat_n(st, STAT_DTLB_CACHE, ctx, cpu) != 0) - print_dtlb_cache_misses(cpu, evsel, avg, out, st); + print_dtlb_cache_misses(config, cpu, evsel, avg, out, st); else - print_metric(ctxp, NULL, NULL, "of all dTLB cache hits", 0); + print_metric(config, ctxp, NULL, NULL, "of all dTLB cache hits", 0); } else if ( evsel->attr.type == PERF_TYPE_HW_CACHE && evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB | @@ -834,9 +845,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { if (runtime_stat_n(st, STAT_ITLB_CACHE, ctx, cpu) != 0) - print_itlb_cache_misses(cpu, evsel, avg, out, st); + print_itlb_cache_misses(config, cpu, evsel, avg, out, st); else - print_metric(ctxp, NULL, NULL, "of all iTLB cache hits", 0); + print_metric(config, ctxp, NULL, NULL, "of all iTLB cache hits", 0); } else if ( evsel->attr.type == PERF_TYPE_HW_CACHE && evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL | @@ -844,9 +855,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { if (runtime_stat_n(st, STAT_LL_CACHE, ctx, cpu) != 0) - print_ll_cache_misses(cpu, evsel, avg, out, st); + print_ll_cache_misses(config, cpu, evsel, avg, out, st); else - print_metric(ctxp, NULL, NULL, "of all LL-cache hits", 0); + print_metric(config, ctxp, NULL, NULL, "of all LL-cache hits", 0); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) { total = runtime_stat_avg(st, STAT_CACHEREFS, ctx, cpu); @@ -854,32 +865,32 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, ratio = avg * 100 / total; if (runtime_stat_n(st, STAT_CACHEREFS, ctx, cpu) != 0) - print_metric(ctxp, NULL, "%8.3f %%", + print_metric(config, ctxp, NULL, "%8.3f %%", "of all cache refs", ratio); else - print_metric(ctxp, NULL, NULL, "of all cache refs", 0); + print_metric(config, ctxp, NULL, NULL, "of all cache refs", 0); } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) { - print_stalled_cycles_frontend(cpu, evsel, avg, out, st); + print_stalled_cycles_frontend(config, cpu, evsel, avg, out, st); } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { - print_stalled_cycles_backend(cpu, evsel, avg, out, st); + print_stalled_cycles_backend(config, cpu, evsel, avg, out, st); } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { total = runtime_stat_avg(st, STAT_NSECS, 0, cpu); if (total) { ratio = avg / total; - print_metric(ctxp, NULL, "%8.3f", "GHz", ratio); + print_metric(config, ctxp, NULL, "%8.3f", "GHz", ratio); } else { - print_metric(ctxp, NULL, NULL, "Ghz", 0); + print_metric(config, ctxp, NULL, NULL, "Ghz", 0); } } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) { total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu); if (total) - print_metric(ctxp, NULL, + print_metric(config, ctxp, NULL, "%7.2f%%", "transactional cycles", 100.0 * (avg / total)); else - print_metric(ctxp, NULL, NULL, "transactional cycles", + print_metric(config, ctxp, NULL, NULL, "transactional cycles", 0); } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) { total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu); @@ -888,10 +899,10 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, if (total2 < avg) total2 = avg; if (total) - print_metric(ctxp, NULL, "%7.2f%%", "aborted cycles", + print_metric(config, ctxp, NULL, "%7.2f%%", "aborted cycles", 100.0 * ((total2-avg) / total)); else - print_metric(ctxp, NULL, NULL, "aborted cycles", 0); + print_metric(config, ctxp, NULL, NULL, "aborted cycles", 0); } else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) { total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, ctx, cpu); @@ -900,10 +911,10 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, ratio = total / avg; if (runtime_stat_n(st, STAT_CYCLES_IN_TX, ctx, cpu) != 0) - print_metric(ctxp, NULL, "%8.0f", + print_metric(config, ctxp, NULL, "%8.0f", "cycles / transaction", ratio); else - print_metric(ctxp, NULL, NULL, "cycles / transaction", + print_metric(config, ctxp, NULL, NULL, "cycles / transaction", 0); } else if (perf_stat_evsel__is(evsel, ELISION_START)) { total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, @@ -912,33 +923,33 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, if (avg) ratio = total / avg; - print_metric(ctxp, NULL, "%8.0f", "cycles / elision", ratio); + print_metric(config, ctxp, NULL, "%8.0f", "cycles / elision", ratio); } else if (perf_evsel__is_clock(evsel)) { if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0) - print_metric(ctxp, NULL, "%8.3f", "CPUs utilized", + print_metric(config, ctxp, NULL, "%8.3f", "CPUs utilized", avg / (ratio * evsel->scale)); else - print_metric(ctxp, NULL, NULL, "CPUs utilized", 0); + print_metric(config, ctxp, NULL, NULL, "CPUs utilized", 0); } else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) { double fe_bound = td_fe_bound(ctx, cpu, st); if (fe_bound > 0.2) color = PERF_COLOR_RED; - print_metric(ctxp, color, "%8.1f%%", "frontend bound", + print_metric(config, ctxp, color, "%8.1f%%", "frontend bound", fe_bound * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) { double retiring = td_retiring(ctx, cpu, st); if (retiring > 0.7) color = PERF_COLOR_GREEN; - print_metric(ctxp, color, "%8.1f%%", "retiring", + print_metric(config, ctxp, color, "%8.1f%%", "retiring", retiring * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) { double bad_spec = td_bad_spec(ctx, cpu, st); if (bad_spec > 0.1) color = PERF_COLOR_RED; - print_metric(ctxp, color, "%8.1f%%", "bad speculation", + print_metric(config, ctxp, color, "%8.1f%%", "bad speculation", bad_spec * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) { double be_bound = td_be_bound(ctx, cpu, st); @@ -955,12 +966,12 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, if (be_bound > 0.2) color = PERF_COLOR_RED; if (td_total_slots(ctx, cpu, st) > 0) - print_metric(ctxp, color, "%8.1f%%", name, + print_metric(config, ctxp, color, "%8.1f%%", name, be_bound * 100.); else - print_metric(ctxp, NULL, NULL, name, 0); + print_metric(config, ctxp, NULL, NULL, name, 0); } else if (evsel->metric_expr) { - generic_metric(evsel->metric_expr, evsel->metric_events, evsel->name, + generic_metric(config, evsel->metric_expr, evsel->metric_events, evsel->name, evsel->metric_name, avg, cpu, out, st); } else if (runtime_stat_n(st, STAT_NSECS, 0, cpu) != 0) { char unit = 'M'; @@ -975,9 +986,9 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, unit = 'K'; } snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit); - print_metric(ctxp, NULL, "%8.3f", unit_buf, ratio); + print_metric(config, ctxp, NULL, "%8.3f", unit_buf, ratio); } else if (perf_stat_evsel__is(evsel, SMI_NUM)) { - print_smi_cost(cpu, evsel, out, st); + print_smi_cost(config, cpu, evsel, out, st); } else { num = 0; } @@ -987,12 +998,12 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, list_for_each_entry (mexp, &me->head, nd) { if (num++ > 0) - out->new_line(ctxp); - generic_metric(mexp->metric_expr, mexp->metric_events, + out->new_line(config, ctxp); + generic_metric(config, mexp->metric_expr, mexp->metric_events, evsel->name, mexp->metric_name, avg, cpu, out, st); } } if (num == 0) - print_metric(ctxp, NULL, NULL, NULL, 0); + print_metric(config, ctxp, NULL, NULL, NULL, 0); } diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index a0061e0b0fad..4d40515307b8 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -374,9 +374,8 @@ int perf_stat_process_counter(struct perf_stat_config *config, return 0; } -int perf_event__process_stat_event(struct perf_tool *tool __maybe_unused, - union perf_event *event, - struct perf_session *session) +int perf_event__process_stat_event(struct perf_session *session, + union perf_event *event) { struct perf_counts_values count; struct stat_event *st = &event->stat; @@ -435,3 +434,98 @@ size_t perf_event__fprintf_stat_config(union perf_event *event, FILE *fp) return ret; } + +int create_perf_stat_counter(struct perf_evsel *evsel, + struct perf_stat_config *config, + struct target *target) +{ + struct perf_event_attr *attr = &evsel->attr; + struct perf_evsel *leader = evsel->leader; + + if (config->scale) { + attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING; + } + + /* + * The event is part of non trivial group, let's enable + * the group read (for leader) and ID retrieval for all + * members. + */ + if (leader->nr_members > 1) + attr->read_format |= PERF_FORMAT_ID|PERF_FORMAT_GROUP; + + attr->inherit = !config->no_inherit; + + /* + * Some events get initialized with sample_(period/type) set, + * like tracepoints. Clear it up for counting. + */ + attr->sample_period = 0; + + if (config->identifier) + attr->sample_type = PERF_SAMPLE_IDENTIFIER; + + /* + * Disabling all counters initially, they will be enabled + * either manually by us or by kernel via enable_on_exec + * set later. + */ + if (perf_evsel__is_group_leader(evsel)) { + attr->disabled = 1; + + /* + * In case of initial_delay we enable tracee + * events manually. + */ + if (target__none(target) && !config->initial_delay) + attr->enable_on_exec = 1; + } + + if (target__has_cpu(target) && !target__has_per_thread(target)) + return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)); + + return perf_evsel__open_per_thread(evsel, evsel->threads); +} + +int perf_stat_synthesize_config(struct perf_stat_config *config, + struct perf_tool *tool, + struct perf_evlist *evlist, + perf_event__handler_t process, + bool attrs) +{ + int err; + + if (attrs) { + err = perf_event__synthesize_attrs(tool, evlist, process); + if (err < 0) { + pr_err("Couldn't synthesize attrs.\n"); + return err; + } + } + + err = perf_event__synthesize_extra_attr(tool, evlist, process, + attrs); + + err = perf_event__synthesize_thread_map2(tool, evlist->threads, + process, NULL); + if (err < 0) { + pr_err("Couldn't synthesize thread map.\n"); + return err; + } + + err = perf_event__synthesize_cpu_map(tool, evlist->cpus, + process, NULL); + if (err < 0) { + pr_err("Couldn't synthesize thread map.\n"); + return err; + } + + err = perf_event__synthesize_stat_config(tool, config, process, NULL); + if (err < 0) { + pr_err("Couldn't synthesize config.\n"); + return err; + } + + return 0; +} diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 36efb986f7fc..2f9c9159a364 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -4,8 +4,14 @@ #include #include +#include +#include +#include +#include #include "xyarray.h" #include "rblist.h" +#include "perf.h" +#include "event.h" struct stats { double n, mean, M2; @@ -84,15 +90,42 @@ struct runtime_stat { struct rblist value_list; }; +typedef int (*aggr_get_id_t)(struct perf_stat_config *config, + struct cpu_map *m, int cpu); + struct perf_stat_config { - enum aggr_mode aggr_mode; - bool scale; - FILE *output; - unsigned int interval; - unsigned int timeout; - int times; - struct runtime_stat *stats; - int stats_num; + enum aggr_mode aggr_mode; + bool scale; + bool no_inherit; + bool identifier; + bool csv_output; + bool interval_clear; + bool metric_only; + bool null_run; + bool ru_display; + bool big_num; + bool no_merge; + bool walltime_run_table; + FILE *output; + unsigned int interval; + unsigned int timeout; + unsigned int initial_delay; + unsigned int unit_width; + unsigned int metric_only_len; + int times; + int run_count; + int print_free_counters_hint; + int print_mixed_hw_group_error; + struct runtime_stat *stats; + int stats_num; + const char *csv_sep; + struct stats *walltime_nsecs_stats; + struct rusage ru_data; + struct cpu_map *aggr_map; + aggr_get_id_t aggr_get_id; + struct cpu_map *cpus_aggr_map; + u64 *walltime_run; + struct rblist metric_events; }; void update_stats(struct stats *stats, u64 val); @@ -130,9 +163,10 @@ bool __perf_evsel_stat__is(struct perf_evsel *evsel, extern struct runtime_stat rt_stat; extern struct stats walltime_nsecs_stats; -typedef void (*print_metric_t)(void *ctx, const char *color, const char *unit, +typedef void (*print_metric_t)(struct perf_stat_config *config, + void *ctx, const char *color, const char *unit, const char *fmt, double val); -typedef void (*new_line_t )(void *ctx); +typedef void (*new_line_t)(struct perf_stat_config *config, void *ctx); void runtime_stat__init(struct runtime_stat *st); void runtime_stat__exit(struct runtime_stat *st); @@ -148,7 +182,8 @@ struct perf_stat_output_ctx { bool force_header; }; -void perf_stat__print_shadow_stats(struct perf_evsel *evsel, +void perf_stat__print_shadow_stats(struct perf_stat_config *config, + struct perf_evsel *evsel, double avg, int cpu, struct perf_stat_output_ctx *out, struct rblist *metric_events, @@ -164,11 +199,25 @@ int perf_stat_process_counter(struct perf_stat_config *config, struct perf_tool; union perf_event; struct perf_session; -int perf_event__process_stat_event(struct perf_tool *tool, - union perf_event *event, - struct perf_session *session); +int perf_event__process_stat_event(struct perf_session *session, + union perf_event *event); size_t perf_event__fprintf_stat(union perf_event *event, FILE *fp); size_t perf_event__fprintf_stat_round(union perf_event *event, FILE *fp); size_t perf_event__fprintf_stat_config(union perf_event *event, FILE *fp); + +int create_perf_stat_counter(struct perf_evsel *evsel, + struct perf_stat_config *config, + struct target *target); +int perf_stat_synthesize_config(struct perf_stat_config *config, + struct perf_tool *tool, + struct perf_evlist *evlist, + perf_event__handler_t process, + bool attrs); +void +perf_evlist__print_counters(struct perf_evlist *evlist, + struct perf_stat_config *config, + struct target *_target, + struct timespec *ts, + int argc, const char **argv); #endif diff --git a/tools/perf/util/strbuf.c b/tools/perf/util/strbuf.c index 3d1cf5bf7f18..9005fbe0780e 100644 --- a/tools/perf/util/strbuf.c +++ b/tools/perf/util/strbuf.c @@ -98,19 +98,25 @@ static int strbuf_addv(struct strbuf *sb, const char *fmt, va_list ap) va_copy(ap_saved, ap); len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap); - if (len < 0) + if (len < 0) { + va_end(ap_saved); return len; + } if (len > strbuf_avail(sb)) { ret = strbuf_grow(sb, len); - if (ret) + if (ret) { + va_end(ap_saved); return ret; + } len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap_saved); va_end(ap_saved); if (len > strbuf_avail(sb)) { pr_debug("this should not happen, your vsnprintf is broken"); + va_end(ap_saved); return -EINVAL; } } + va_end(ap_saved); return strbuf_setlen(sb, sb->len + len); } diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 29770ea61768..66a84d5846c8 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -324,7 +324,17 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss) plt_entry_size = 16; break; - default: /* FIXME: s390/alpha/mips/parisc/poperpc/sh/sparc/xtensa need to be checked */ + case EM_SPARC: + plt_header_size = 48; + plt_entry_size = 12; + break; + + case EM_SPARCV9: + plt_header_size = 128; + plt_entry_size = 32; + break; + + default: /* FIXME: s390/alpha/mips/parisc/poperpc/sh/xtensa need to be checked */ plt_header_size = shdr_plt.sh_entsize; plt_entry_size = shdr_plt.sh_entsize; break; @@ -1947,6 +1957,34 @@ void kcore_extract__delete(struct kcore_extract *kce) } #ifdef HAVE_GELF_GETNOTE_SUPPORT + +static void sdt_adjust_loc(struct sdt_note *tmp, GElf_Addr base_off) +{ + if (!base_off) + return; + + if (tmp->bit32) + tmp->addr.a32[SDT_NOTE_IDX_LOC] = + tmp->addr.a32[SDT_NOTE_IDX_LOC] + base_off - + tmp->addr.a32[SDT_NOTE_IDX_BASE]; + else + tmp->addr.a64[SDT_NOTE_IDX_LOC] = + tmp->addr.a64[SDT_NOTE_IDX_LOC] + base_off - + tmp->addr.a64[SDT_NOTE_IDX_BASE]; +} + +static void sdt_adjust_refctr(struct sdt_note *tmp, GElf_Addr base_addr, + GElf_Addr base_off) +{ + if (!base_off) + return; + + if (tmp->bit32 && tmp->addr.a32[SDT_NOTE_IDX_REFCTR]) + tmp->addr.a32[SDT_NOTE_IDX_REFCTR] -= (base_addr - base_off); + else if (tmp->addr.a64[SDT_NOTE_IDX_REFCTR]) + tmp->addr.a64[SDT_NOTE_IDX_REFCTR] -= (base_addr - base_off); +} + /** * populate_sdt_note : Parse raw data and identify SDT note * @elf: elf of the opened file @@ -1964,7 +2002,6 @@ static int populate_sdt_note(Elf **elf, const char *data, size_t len, const char *provider, *name, *args; struct sdt_note *tmp = NULL; GElf_Ehdr ehdr; - GElf_Addr base_off = 0; GElf_Shdr shdr; int ret = -EINVAL; @@ -2060,17 +2097,12 @@ static int populate_sdt_note(Elf **elf, const char *data, size_t len, * base address in the description of the SDT note. If its different, * then accordingly, adjust the note location. */ - if (elf_section_by_name(*elf, &ehdr, &shdr, SDT_BASE_SCN, NULL)) { - base_off = shdr.sh_offset; - if (base_off) { - if (tmp->bit32) - tmp->addr.a32[0] = tmp->addr.a32[0] + base_off - - tmp->addr.a32[1]; - else - tmp->addr.a64[0] = tmp->addr.a64[0] + base_off - - tmp->addr.a64[1]; - } - } + if (elf_section_by_name(*elf, &ehdr, &shdr, SDT_BASE_SCN, NULL)) + sdt_adjust_loc(tmp, shdr.sh_offset); + + /* Adjust reference counter offset */ + if (elf_section_by_name(*elf, &ehdr, &shdr, SDT_PROBES_SCN, NULL)) + sdt_adjust_refctr(tmp, shdr.sh_addr, shdr.sh_offset); list_add_tail(&tmp->note_list, sdt_notes); return 0; diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index f25fae4b5743..d026d215bdc6 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -123,7 +123,8 @@ struct symbol_conf { const char *vmlinux_name, *kallsyms_name, *source_prefix, - *field_sep; + *field_sep, + *graph_function; const char *default_guest_vmlinux_name, *default_guest_kallsyms, *default_guest_modules; @@ -379,12 +380,19 @@ int get_sdt_note_list(struct list_head *head, const char *target); int cleanup_sdt_note_list(struct list_head *sdt_notes); int sdt_notes__get_count(struct list_head *start); +#define SDT_PROBES_SCN ".probes" #define SDT_BASE_SCN ".stapsdt.base" #define SDT_NOTE_SCN ".note.stapsdt" #define SDT_NOTE_TYPE 3 #define SDT_NOTE_NAME "stapsdt" #define NR_ADDR 3 +enum { + SDT_NOTE_IDX_LOC = 0, + SDT_NOTE_IDX_BASE, + SDT_NOTE_IDX_REFCTR, +}; + struct mem_info *mem_info__new(void); struct mem_info *mem_info__get(struct mem_info *mi); void mem_info__put(struct mem_info *mi); diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c index dd17d6a38d3a..61a4286a74dc 100644 --- a/tools/perf/util/thread-stack.c +++ b/tools/perf/util/thread-stack.c @@ -36,6 +36,7 @@ * @branch_count: the branch count when the entry was created * @cp: call path * @no_call: a 'call' was not seen + * @trace_end: a 'call' but trace ended */ struct thread_stack_entry { u64 ret_addr; @@ -44,6 +45,7 @@ struct thread_stack_entry { u64 branch_count; struct call_path *cp; bool no_call; + bool trace_end; }; /** @@ -112,7 +114,8 @@ static struct thread_stack *thread_stack__new(struct thread *thread, return ts; } -static int thread_stack__push(struct thread_stack *ts, u64 ret_addr) +static int thread_stack__push(struct thread_stack *ts, u64 ret_addr, + bool trace_end) { int err = 0; @@ -124,6 +127,7 @@ static int thread_stack__push(struct thread_stack *ts, u64 ret_addr) } } + ts->stack[ts->cnt].trace_end = trace_end; ts->stack[ts->cnt++].ret_addr = ret_addr; return err; @@ -150,6 +154,18 @@ static void thread_stack__pop(struct thread_stack *ts, u64 ret_addr) } } +static void thread_stack__pop_trace_end(struct thread_stack *ts) +{ + size_t i; + + for (i = ts->cnt; i; ) { + if (ts->stack[--i].trace_end) + ts->cnt = i; + else + return; + } +} + static bool thread_stack__in_kernel(struct thread_stack *ts) { if (!ts->cnt) @@ -254,10 +270,19 @@ int thread_stack__event(struct thread *thread, u32 flags, u64 from_ip, ret_addr = from_ip + insn_len; if (ret_addr == to_ip) return 0; /* Zero-length calls are excluded */ - return thread_stack__push(thread->ts, ret_addr); - } else if (flags & PERF_IP_FLAG_RETURN) { - if (!from_ip) - return 0; + return thread_stack__push(thread->ts, ret_addr, + flags & PERF_IP_FLAG_TRACE_END); + } else if (flags & PERF_IP_FLAG_TRACE_BEGIN) { + /* + * If the caller did not change the trace number (which would + * have flushed the stack) then try to make sense of the stack. + * Possibly, tracing began after returning to the current + * address, so try to pop that. Also, do not expect a call made + * when the trace ended, to return, so pop that. + */ + thread_stack__pop(thread->ts, to_ip); + thread_stack__pop_trace_end(thread->ts); + } else if ((flags & PERF_IP_FLAG_RETURN) && from_ip) { thread_stack__pop(thread->ts, to_ip); } @@ -285,20 +310,46 @@ void thread_stack__free(struct thread *thread) } } +static inline u64 callchain_context(u64 ip, u64 kernel_start) +{ + return ip < kernel_start ? PERF_CONTEXT_USER : PERF_CONTEXT_KERNEL; +} + void thread_stack__sample(struct thread *thread, struct ip_callchain *chain, - size_t sz, u64 ip) + size_t sz, u64 ip, u64 kernel_start) { - size_t i; + u64 context = callchain_context(ip, kernel_start); + u64 last_context; + size_t i, j; - if (!thread || !thread->ts) - chain->nr = 1; - else - chain->nr = min(sz, thread->ts->cnt + 1); + if (sz < 2) { + chain->nr = 0; + return; + } + + chain->ips[0] = context; + chain->ips[1] = ip; - chain->ips[0] = ip; + if (!thread || !thread->ts) { + chain->nr = 2; + return; + } + + last_context = context; + + for (i = 2, j = 1; i < sz && j <= thread->ts->cnt; i++, j++) { + ip = thread->ts->stack[thread->ts->cnt - j].ret_addr; + context = callchain_context(ip, kernel_start); + if (context != last_context) { + if (i >= sz - 1) + break; + chain->ips[i++] = context; + last_context = context; + } + chain->ips[i] = ip; + } - for (i = 1; i < chain->nr; i++) - chain->ips[i] = thread->ts->stack[thread->ts->cnt - i].ret_addr; + chain->nr = i; } struct call_return_processor * @@ -332,7 +383,7 @@ void call_return_processor__free(struct call_return_processor *crp) static int thread_stack__push_cp(struct thread_stack *ts, u64 ret_addr, u64 timestamp, u64 ref, struct call_path *cp, - bool no_call) + bool no_call, bool trace_end) { struct thread_stack_entry *tse; int err; @@ -350,6 +401,7 @@ static int thread_stack__push_cp(struct thread_stack *ts, u64 ret_addr, tse->branch_count = ts->branch_count; tse->cp = cp; tse->no_call = no_call; + tse->trace_end = trace_end; return 0; } @@ -423,7 +475,7 @@ static int thread_stack__bottom(struct thread *thread, struct thread_stack *ts, return -ENOMEM; return thread_stack__push_cp(thread->ts, ip, sample->time, ref, cp, - true); + true, false); } static int thread_stack__no_call_return(struct thread *thread, @@ -455,7 +507,7 @@ static int thread_stack__no_call_return(struct thread *thread, if (!cp) return -ENOMEM; return thread_stack__push_cp(ts, 0, sample->time, ref, - cp, true); + cp, true, false); } } else if (thread_stack__in_kernel(ts) && sample->ip < ks) { /* Return to userspace, so pop all kernel addresses */ @@ -480,7 +532,7 @@ static int thread_stack__no_call_return(struct thread *thread, return -ENOMEM; err = thread_stack__push_cp(ts, sample->addr, sample->time, ref, cp, - true); + true, false); if (err) return err; @@ -500,7 +552,7 @@ static int thread_stack__trace_begin(struct thread *thread, /* Pop trace end */ tse = &ts->stack[ts->cnt - 1]; - if (tse->cp->sym == NULL && tse->cp->ip == 0) { + if (tse->trace_end) { err = thread_stack__call_return(thread, ts, --ts->cnt, timestamp, ref, false); if (err) @@ -529,7 +581,7 @@ static int thread_stack__trace_end(struct thread_stack *ts, ret_addr = sample->ip + sample->insn_len; return thread_stack__push_cp(ts, ret_addr, sample->time, ref, cp, - false); + false, true); } int thread_stack__process(struct thread *thread, struct comm *comm, @@ -579,6 +631,7 @@ int thread_stack__process(struct thread *thread, struct comm *comm, ts->last_time = sample->time; if (sample->flags & PERF_IP_FLAG_CALL) { + bool trace_end = sample->flags & PERF_IP_FLAG_TRACE_END; struct call_path_root *cpr = ts->crp->cpr; struct call_path *cp; u64 ret_addr; @@ -596,7 +649,7 @@ int thread_stack__process(struct thread *thread, struct comm *comm, if (!cp) return -ENOMEM; err = thread_stack__push_cp(ts, ret_addr, sample->time, ref, - cp, false); + cp, false, trace_end); } else if (sample->flags & PERF_IP_FLAG_RETURN) { if (!sample->ip || !sample->addr) return 0; diff --git a/tools/perf/util/thread-stack.h b/tools/perf/util/thread-stack.h index b7e41c4ebfdd..f97c00a8c251 100644 --- a/tools/perf/util/thread-stack.h +++ b/tools/perf/util/thread-stack.h @@ -84,7 +84,7 @@ int thread_stack__event(struct thread *thread, u32 flags, u64 from_ip, u64 to_ip, u16 insn_len, u64 trace_nr); void thread_stack__set_trace_nr(struct thread *thread, u64 trace_nr); void thread_stack__sample(struct thread *thread, struct ip_callchain *chain, - size_t sz, u64 ip); + size_t sz, u64 ip, u64 kernel_start); int thread_stack__flush(struct thread *thread); void thread_stack__free(struct thread *thread); size_t thread_stack__depth(struct thread *thread); diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 2048d393ece6..3d9ed7d0e281 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -330,7 +330,8 @@ static int thread__prepare_access(struct thread *thread) } static int thread__clone_map_groups(struct thread *thread, - struct thread *parent) + struct thread *parent, + bool do_maps_clone) { /* This is new thread, we share map groups for process. */ if (thread->pid_ == parent->pid_) @@ -341,15 +342,11 @@ static int thread__clone_map_groups(struct thread *thread, thread->pid_, thread->tid, parent->pid_, parent->tid); return 0; } - /* But this one is new process, copy maps. */ - if (map_groups__clone(thread, parent->mg) < 0) - return -ENOMEM; - - return 0; + return do_maps_clone ? map_groups__clone(thread, parent->mg) : 0; } -int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp) +int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp, bool do_maps_clone) { if (parent->comm_set) { const char *comm = thread__comm_str(parent); @@ -362,7 +359,7 @@ int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp) } thread->ppid = parent->tid; - return thread__clone_map_groups(thread, parent); + return thread__clone_map_groups(thread, parent, do_maps_clone); } void thread__find_cpumode_addr_location(struct thread *thread, u64 addr, diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 07606aa6998d..30e2b4c165fe 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -42,6 +42,8 @@ struct thread { void *addr_space; struct unwind_libunwind_ops *unwind_libunwind_ops; #endif + bool filter; + int filter_entry_depth; }; struct machine; @@ -87,7 +89,7 @@ struct comm *thread__comm(const struct thread *thread); struct comm *thread__exec_comm(const struct thread *thread); const char *thread__comm_str(const struct thread *thread); int thread__insert_map(struct thread *thread, struct map *map); -int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp); +int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp, bool do_maps_clone); size_t thread__fprintf(struct thread *thread, FILE *fp); struct thread *thread__main_thread(struct machine *machine, struct thread *thread); diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index 183c91453522..56e4ca54020a 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -26,15 +26,12 @@ typedef int (*event_attr_op)(struct perf_tool *tool, union perf_event *event, struct perf_evlist **pevlist); -typedef int (*event_op2)(struct perf_tool *tool, union perf_event *event, - struct perf_session *session); +typedef int (*event_op2)(struct perf_session *session, union perf_event *event); +typedef s64 (*event_op3)(struct perf_session *session, union perf_event *event); typedef int (*event_oe)(struct perf_tool *tool, union perf_event *event, struct ordered_events *oe); -typedef s64 (*event_op3)(struct perf_tool *tool, union perf_event *event, - struct perf_session *session); - enum show_feature_header { SHOW_FEAT_NO_HEADER = 0, SHOW_FEAT_HEADER, diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c index 7b0ca7cbb7de..8ad8e755127b 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -531,12 +531,14 @@ struct tracing_data *tracing_data_get(struct list_head *pattrs, "/tmp/perf-XXXXXX"); if (!mkstemp(tdata->temp_file)) { pr_debug("Can't make temp file"); + free(tdata); return NULL; } temp_fd = open(tdata->temp_file, O_RDWR); if (temp_fd < 0) { pr_debug("Can't read '%s'", tdata->temp_file); + free(tdata); return NULL; } diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index e76214f8d596..32e558a65af3 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -33,14 +33,15 @@ static int get_common_field(struct scripting_context *context, int *offset, int *size, const char *type) { struct tep_handle *pevent = context->pevent; - struct event_format *event; - struct format_field *field; + struct tep_event_format *event; + struct tep_format_field *field; if (!*size) { - if (!pevent->events) + + event = tep_get_first_event(pevent); + if (!event) return 0; - event = pevent->events[0]; field = tep_find_common_field(event, type); if (!field) return 0; @@ -94,9 +95,9 @@ int common_pc(struct scripting_context *context) } unsigned long long -raw_field_value(struct event_format *event, const char *name, void *data) +raw_field_value(struct tep_event_format *event, const char *name, void *data) { - struct format_field *field; + struct tep_format_field *field; unsigned long long val; field = tep_find_any_field(event, name); @@ -108,12 +109,12 @@ raw_field_value(struct event_format *event, const char *name, void *data) return val; } -unsigned long long read_size(struct event_format *event, void *ptr, int size) +unsigned long long read_size(struct tep_event_format *event, void *ptr, int size) { return tep_read_number(event->pevent, ptr, size); } -void event_format__fprintf(struct event_format *event, +void event_format__fprintf(struct tep_event_format *event, int cpu, void *data, int size, FILE *fp) { struct tep_record record; @@ -130,7 +131,7 @@ void event_format__fprintf(struct event_format *event, trace_seq_destroy(&s); } -void event_format__print(struct event_format *event, +void event_format__print(struct tep_event_format *event, int cpu, void *data, int size) { return event_format__fprintf(event, cpu, data, size, stdout); @@ -158,6 +159,7 @@ void parse_ftrace_printk(struct tep_handle *pevent, printk = strdup(fmt+1); line = strtok_r(NULL, "\n", &next); tep_register_print_string(pevent, printk, addr); + free(printk); } } @@ -188,29 +190,33 @@ int parse_event_file(struct tep_handle *pevent, return tep_parse_event(pevent, buf, size, sys); } -struct event_format *trace_find_next_event(struct tep_handle *pevent, - struct event_format *event) +struct tep_event_format *trace_find_next_event(struct tep_handle *pevent, + struct tep_event_format *event) { static int idx; + int events_count; + struct tep_event_format *all_events; - if (!pevent || !pevent->events) + all_events = tep_get_first_event(pevent); + events_count = tep_get_events_count(pevent); + if (!pevent || !all_events || events_count < 1) return NULL; if (!event) { idx = 0; - return pevent->events[0]; + return all_events; } - if (idx < pevent->nr_events && event == pevent->events[idx]) { + if (idx < events_count && event == (all_events + idx)) { idx++; - if (idx == pevent->nr_events) + if (idx == events_count) return NULL; - return pevent->events[idx]; + return (all_events + idx); } - for (idx = 1; idx < pevent->nr_events; idx++) { - if (event == pevent->events[idx - 1]) - return pevent->events[idx]; + for (idx = 1; idx < events_count; idx++) { + if (event == (all_events + (idx - 1))) + return (all_events + idx); } return NULL; } diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c index 3dfc1db6b25b..76f12c705ef9 100644 --- a/tools/perf/util/trace-event-read.c +++ b/tools/perf/util/trace-event-read.c @@ -102,7 +102,7 @@ static unsigned int read4(struct tep_handle *pevent) if (do_read(&data, 4) < 0) return 0; - return __data2host4(pevent, data); + return __tep_data2host4(pevent, data); } static unsigned long long read8(struct tep_handle *pevent) @@ -111,7 +111,7 @@ static unsigned long long read8(struct tep_handle *pevent) if (do_read(&data, 8) < 0) return 0; - return __data2host8(pevent, data); + return __tep_data2host8(pevent, data); } static char *read_string(void) @@ -241,7 +241,7 @@ static int read_header_files(struct tep_handle *pevent) * The commit field in the page is of type long, * use that instead, since it represents the kernel. */ - tep_set_long_size(pevent, pevent->header_page_size_size); + tep_set_long_size(pevent, tep_get_header_page_size(pevent)); } free(header_page); @@ -297,10 +297,8 @@ static int read_event_file(struct tep_handle *pevent, char *sys, } ret = do_read(buf, size); - if (ret < 0) { - free(buf); + if (ret < 0) goto out; - } ret = parse_event_file(pevent, buf, size, sys); if (ret < 0) @@ -349,9 +347,12 @@ static int read_event_files(struct tep_handle *pevent) for (x=0; x < count; x++) { size = read8(pevent); ret = read_event_file(pevent, sys, size); - if (ret) + if (ret) { + free(sys); return ret; + } } + free(sys); } return 0; } diff --git a/tools/perf/util/trace-event.c b/tools/perf/util/trace-event.c index 58bb72f266f3..95664b2f771e 100644 --- a/tools/perf/util/trace-event.c +++ b/tools/perf/util/trace-event.c @@ -72,12 +72,12 @@ void trace_event__cleanup(struct trace_event *t) /* * Returns pointer with encoded error via interface. */ -static struct event_format* +static struct tep_event_format* tp_format(const char *sys, const char *name) { char *tp_dir = get_events_file(sys); struct tep_handle *pevent = tevent.pevent; - struct event_format *event = NULL; + struct tep_event_format *event = NULL; char path[PATH_MAX]; size_t size; char *data; @@ -102,7 +102,7 @@ tp_format(const char *sys, const char *name) /* * Returns pointer with encoded error via interface. */ -struct event_format* +struct tep_event_format* trace_event__tp_format(const char *sys, const char *name) { if (!tevent_initialized && trace_event__init2()) @@ -111,7 +111,7 @@ trace_event__tp_format(const char *sys, const char *name) return tp_format(sys, name); } -struct event_format *trace_event__tp_format_id(int id) +struct tep_event_format *trace_event__tp_format_id(int id) { if (!tevent_initialized && trace_event__init2()) return ERR_PTR(-ENOMEM); diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index 40204ec3a7a2..f024d73bfc40 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -3,6 +3,7 @@ #define _PERF_UTIL_TRACE_EVENT_H #include +#include #include "parse-events.h" struct machine; @@ -10,28 +11,28 @@ struct perf_sample; union perf_event; struct perf_tool; struct thread; -struct plugin_list; +struct tep_plugin_list; struct trace_event { struct tep_handle *pevent; - struct plugin_list *plugin_list; + struct tep_plugin_list *plugin_list; }; int trace_event__init(struct trace_event *t); void trace_event__cleanup(struct trace_event *t); int trace_event__register_resolver(struct machine *machine, tep_func_resolver_t *func); -struct event_format* +struct tep_event_format* trace_event__tp_format(const char *sys, const char *name); -struct event_format *trace_event__tp_format_id(int id); +struct tep_event_format *trace_event__tp_format_id(int id); int bigendian(void); -void event_format__fprintf(struct event_format *event, +void event_format__fprintf(struct tep_event_format *event, int cpu, void *data, int size, FILE *fp); -void event_format__print(struct event_format *event, +void event_format__print(struct tep_event_format *event, int cpu, void *data, int size); int parse_ftrace_file(struct tep_handle *pevent, char *buf, unsigned long size); @@ -39,7 +40,7 @@ int parse_event_file(struct tep_handle *pevent, char *buf, unsigned long size, char *sys); unsigned long long -raw_field_value(struct event_format *event, const char *name, void *data); +raw_field_value(struct tep_event_format *event, const char *name, void *data); void parse_proc_kallsyms(struct tep_handle *pevent, char *file, unsigned int size); void parse_ftrace_printk(struct tep_handle *pevent, char *file, unsigned int size); @@ -47,9 +48,9 @@ void parse_saved_cmdline(struct tep_handle *pevent, char *file, unsigned int siz ssize_t trace_report(int fd, struct trace_event *tevent, bool repipe); -struct event_format *trace_find_next_event(struct tep_handle *pevent, - struct event_format *event); -unsigned long long read_size(struct event_format *event, void *ptr, int size); +struct tep_event_format *trace_find_next_event(struct tep_handle *pevent, + struct tep_event_format *event); +unsigned long long read_size(struct tep_event_format *event, void *ptr, int size); unsigned long long eval_flag(const char *flag); int read_tracing_data(int fd, struct list_head *pattrs); diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index 6f318b15950e..5eff9bfc5758 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -45,13 +45,13 @@ static int __report_module(struct addr_location *al, u64 ip, Dwarf_Addr s; dwfl_module_info(mod, NULL, &s, NULL, NULL, NULL, NULL, NULL); - if (s != al->map->start) + if (s != al->map->start - al->map->pgoff) mod = 0; } if (!mod) mod = dwfl_report_elf(ui->dwfl, dso->short_name, - (dso->symsrc_filename ? dso->symsrc_filename : dso->long_name), -1, al->map->start, + (dso->symsrc_filename ? dso->symsrc_filename : dso->long_name), -1, al->map->start - al->map->pgoff, false); return mod && dwfl_addrmodule(ui->dwfl, ip) == mod ? 0 : -1; diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index eac5b858a371..093352e93d50 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -221,7 +221,7 @@ out: return err; } -static int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size) +int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size) { void *ptr; loff_t pgoff; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index dc58254a2b69..14508ee7707a 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -6,6 +6,7 @@ /* glibc 2.20 deprecates _BSD_SOURCE in favour of _DEFAULT_SOURCE */ #define _DEFAULT_SOURCE 1 +#include #include #include #include @@ -35,6 +36,7 @@ bool lsdir_no_dot_filter(const char *name, struct dirent *d); int copyfile(const char *from, const char *to); int copyfile_mode(const char *from, const char *to, mode_t mode); int copyfile_ns(const char *from, const char *to, struct nsinfo *nsi); +int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size); ssize_t readn(int fd, void *buf, size_t n); ssize_t writen(int fd, const void *buf, size_t n); diff --git a/tools/power/cpupower/bench/parse.c b/tools/power/cpupower/bench/parse.c index 9ba8a44ad2a7..84caee38418f 100644 --- a/tools/power/cpupower/bench/parse.c +++ b/tools/power/cpupower/bench/parse.c @@ -145,7 +145,7 @@ struct config *prepare_default_config() config->cpu = 0; config->prio = SCHED_HIGH; config->verbose = 0; - strncpy(config->governor, "ondemand", 8); + strncpy(config->governor, "ondemand", sizeof(config->governor)); config->output = stdout; diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index df43cd45d810..c3f39d5128ee 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -170,6 +170,7 @@ static int get_boost_mode(unsigned int cpu) unsigned long pstates[MAX_HW_PSTATES] = {0,}; if (cpupower_cpu_info.vendor != X86_VENDOR_AMD && + cpupower_cpu_info.vendor != X86_VENDOR_HYGON && cpupower_cpu_info.vendor != X86_VENDOR_INTEL) return 0; @@ -190,8 +191,9 @@ static int get_boost_mode(unsigned int cpu) printf(_(" Supported: %s\n"), support ? _("yes") : _("no")); printf(_(" Active: %s\n"), active ? _("yes") : _("no")); - if (cpupower_cpu_info.vendor == X86_VENDOR_AMD && - cpupower_cpu_info.family >= 0x10) { + if ((cpupower_cpu_info.vendor == X86_VENDOR_AMD && + cpupower_cpu_info.family >= 0x10) || + cpupower_cpu_info.vendor == X86_VENDOR_HYGON) { ret = decode_pstates(cpu, cpupower_cpu_info.family, b_states, pstates, &pstate_no); if (ret) @@ -200,6 +202,8 @@ static int get_boost_mode(unsigned int cpu) printf(_(" Boost States: %d\n"), b_states); printf(_(" Total States: %d\n"), pstate_no); for (i = 0; i < pstate_no; i++) { + if (!pstates[i]) + continue; if (i < b_states) printf(_(" Pstate-Pb%d: %luMHz (boost state)" "\n"), i, pstates[i]); diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index bb41cdd0df6b..7c4f83a8c973 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -33,7 +33,7 @@ union msr_pstate { unsigned vid:8; unsigned iddval:8; unsigned idddiv:2; - unsigned res1:30; + unsigned res1:31; unsigned en:1; } fam17h_bits; unsigned long long val; @@ -45,7 +45,7 @@ static int get_did(int family, union msr_pstate pstate) if (family == 0x12) t = pstate.val & 0xf; - else if (family == 0x17) + else if (family == 0x17 || family == 0x18) t = pstate.fam17h_bits.did; else t = pstate.bits.did; @@ -59,7 +59,7 @@ static int get_cof(int family, union msr_pstate pstate) int fid, did, cof; did = get_did(family, pstate); - if (family == 0x17) { + if (family == 0x17 || family == 0x18) { fid = pstate.fam17h_bits.fid; cof = 200 * fid / did; } else { @@ -119,6 +119,11 @@ int decode_pstates(unsigned int cpu, unsigned int cpu_family, } if (read_msr(cpu, MSR_AMD_PSTATE + i, &pstate.val)) return -1; + if ((cpu_family == 0x17) && (!pstate.fam17h_bits.en)) + continue; + else if (!pstate.bits.en) + continue; + pstates[i] = get_cof(cpu_family, pstate); } *no = i; diff --git a/tools/power/cpupower/utils/helpers/cpuid.c b/tools/power/cpupower/utils/helpers/cpuid.c index 732b0b41ba26..5cc39d4e23ed 100644 --- a/tools/power/cpupower/utils/helpers/cpuid.c +++ b/tools/power/cpupower/utils/helpers/cpuid.c @@ -8,7 +8,7 @@ #include "helpers/helpers.h" static const char *cpu_vendor_table[X86_VENDOR_MAX] = { - "Unknown", "GenuineIntel", "AuthenticAMD", + "Unknown", "GenuineIntel", "AuthenticAMD", "HygonGenuine", }; #if defined(__i386__) || defined(__x86_64__) @@ -109,6 +109,7 @@ out: fclose(fp); /* Get some useful CPU capabilities from cpuid */ if (cpu_info->vendor != X86_VENDOR_AMD && + cpu_info->vendor != X86_VENDOR_HYGON && cpu_info->vendor != X86_VENDOR_INTEL) return ret; @@ -124,8 +125,9 @@ out: if (cpuid_level >= 6 && (cpuid_ecx(6) & 0x1)) cpu_info->caps |= CPUPOWER_CAP_APERF; - /* AMD Boost state enable/disable register */ - if (cpu_info->vendor == X86_VENDOR_AMD) { + /* AMD or Hygon Boost state enable/disable register */ + if (cpu_info->vendor == X86_VENDOR_AMD || + cpu_info->vendor == X86_VENDOR_HYGON) { if (ext_cpuid_level >= 0x80000007 && (cpuid_edx(0x80000007) & (1 << 9))) cpu_info->caps |= CPUPOWER_CAP_AMD_CBP; diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index 41da392be448..902139689315 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -61,7 +61,7 @@ extern int be_verbose; /* cpuid and cpuinfo helpers **************************/ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, X86_VENDOR_INTEL, - X86_VENDOR_AMD, X86_VENDOR_MAX}; + X86_VENDOR_AMD, X86_VENDOR_HYGON, X86_VENDOR_MAX}; #define CPUPOWER_CAP_INV_TSC 0x00000001 #define CPUPOWER_CAP_APERF 0x00000002 diff --git a/tools/power/cpupower/utils/helpers/misc.c b/tools/power/cpupower/utils/helpers/misc.c index 80fdf55f414d..f406adc40bad 100644 --- a/tools/power/cpupower/utils/helpers/misc.c +++ b/tools/power/cpupower/utils/helpers/misc.c @@ -26,7 +26,7 @@ int cpufreq_has_boost_support(unsigned int cpu, int *support, int *active, * has Hardware determined variable increments instead. */ - if (cpu_info.family == 0x17) { + if (cpu_info.family == 0x17 || cpu_info.family == 0x18) { if (!read_msr(cpu, MSR_AMD_HWCR, &val)) { if (!(val & CPUPOWER_AMD_CPBDIS)) *active = 1; diff --git a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c index d7c2a6d13dea..f2a7e9cfd577 100644 --- a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c +++ b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c @@ -241,7 +241,8 @@ static int init_maxfreq_mode(void) if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_INV_TSC)) goto use_sysfs; - if (cpupower_cpu_info.vendor == X86_VENDOR_AMD) { + if (cpupower_cpu_info.vendor == X86_VENDOR_AMD || + cpupower_cpu_info.vendor == X86_VENDOR_HYGON) { /* MSR_AMD_HWCR tells us whether TSC runs at P0/mperf * freq. * A test whether hwcr is accessable/available would be: diff --git a/tools/power/pm-graph/Makefile b/tools/power/pm-graph/Makefile index c1899cd72c80..845541544570 100644 --- a/tools/power/pm-graph/Makefile +++ b/tools/power/pm-graph/Makefile @@ -23,8 +23,8 @@ install : uninstall install -m 644 config/suspend-x2-proc.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config install -d $(DESTDIR)$(PREFIX)/bin - ln -s $(DESTDIR)$(PREFIX)/lib/pm-graph/bootgraph.py $(DESTDIR)$(PREFIX)/bin/bootgraph - ln -s $(DESTDIR)$(PREFIX)/lib/pm-graph/sleepgraph.py $(DESTDIR)$(PREFIX)/bin/sleepgraph + ln -s ../lib/pm-graph/bootgraph.py $(DESTDIR)$(PREFIX)/bin/bootgraph + ln -s ../lib/pm-graph/sleepgraph.py $(DESTDIR)$(PREFIX)/bin/sleepgraph install -d $(DESTDIR)$(PREFIX)/share/man/man8 install bootgraph.8 $(DESTDIR)$(PREFIX)/share/man/man8 diff --git a/tools/power/pm-graph/bootgraph.py b/tools/power/pm-graph/bootgraph.py index 8ee626c0f6a5..6dae57041537 100755 --- a/tools/power/pm-graph/bootgraph.py +++ b/tools/power/pm-graph/bootgraph.py @@ -34,6 +34,10 @@ from datetime import datetime, timedelta from subprocess import call, Popen, PIPE import sleepgraph as aslib +def pprint(msg): + print(msg) + sys.stdout.flush() + # ----------------- CLASSES -------------------- # Class: SystemValues @@ -157,11 +161,11 @@ class SystemValues(aslib.SystemValues): return cmdline def manualRebootRequired(self): cmdline = self.kernelParams() - print 'To generate a new timeline manually, follow these steps:\n' - print '1. Add the CMDLINE string to your kernel command line.' - print '2. Reboot the system.' - print '3. After reboot, re-run this tool with the same arguments but no command (w/o -reboot or -manual).\n' - print 'CMDLINE="%s"' % cmdline + pprint('To generate a new timeline manually, follow these steps:\n\n'\ + '1. Add the CMDLINE string to your kernel command line.\n'\ + '2. Reboot the system.\n'\ + '3. After reboot, re-run this tool with the same arguments but no command (w/o -reboot or -manual).\n\n'\ + 'CMDLINE="%s"' % cmdline) sys.exit() def blGrub(self): blcmd = '' @@ -431,7 +435,7 @@ def parseTraceLog(data): if len(cg.list) < 1 or cg.invalid or (cg.end - cg.start == 0): continue if(not cg.postProcess()): - print('Sanity check failed for %s-%d' % (proc, pid)) + pprint('Sanity check failed for %s-%d' % (proc, pid)) continue # match cg data to devices devname = data.deviceMatch(pid, cg) @@ -442,8 +446,8 @@ def parseTraceLog(data): sysvals.vprint('%s callgraph found for %s %s-%d [%f - %f]' %\ (kind, cg.name, proc, pid, cg.start, cg.end)) elif len(cg.list) > 1000000: - print 'WARNING: the callgraph found for %s is massive! (%d lines)' %\ - (devname, len(cg.list)) + pprint('WARNING: the callgraph found for %s is massive! (%d lines)' %\ + (devname, len(cg.list))) # Function: retrieveLogs # Description: @@ -528,7 +532,7 @@ def createBootGraph(data): tMax = data.end tTotal = tMax - t0 if(tTotal == 0): - print('ERROR: No timeline data') + pprint('ERROR: No timeline data') return False user_mode = '%.0f'%(data.tUserMode*1000) last_init = '%.0f'%(tTotal*1000) @@ -734,7 +738,7 @@ def updateCron(restore=False): op.close() res = call([cmd, cronfile]) except Exception, e: - print 'Exception: %s' % str(e) + pprint('Exception: %s' % str(e)) shutil.move(backfile, cronfile) res = -1 if res != 0: @@ -750,7 +754,7 @@ def updateGrub(restore=False): call(sysvals.blexec, stderr=PIPE, stdout=PIPE, env={'PATH': '.:/sbin:/usr/sbin:/usr/bin:/sbin:/bin'}) except Exception, e: - print 'Exception: %s\n' % str(e) + pprint('Exception: %s\n' % str(e)) return # extract the option and create a grub config without it sysvals.rootUser(True) @@ -797,7 +801,7 @@ def updateGrub(restore=False): res = call(sysvals.blexec) os.remove(grubfile) except Exception, e: - print 'Exception: %s' % str(e) + pprint('Exception: %s' % str(e)) res = -1 # cleanup shutil.move(tempfile, grubfile) @@ -821,7 +825,7 @@ def updateKernelParams(restore=False): def doError(msg, help=False): if help == True: printHelp() - print 'ERROR: %s\n' % msg + pprint('ERROR: %s\n' % msg) sysvals.outputResult({'error':msg}) sys.exit() @@ -829,52 +833,51 @@ def doError(msg, help=False): # Description: # print out the help text def printHelp(): - print('') - print('%s v%s' % (sysvals.title, sysvals.version)) - print('Usage: bootgraph ') - print('') - print('Description:') - print(' This tool reads in a dmesg log of linux kernel boot and') - print(' creates an html representation of the boot timeline up to') - print(' the start of the init process.') - print('') - print(' If no specific command is given the tool reads the current dmesg') - print(' and/or ftrace log and creates a timeline') - print('') - print(' Generates output files in subdirectory: boot-yymmdd-HHMMSS') - print(' HTML output: _boot.html') - print(' raw dmesg output: _boot_dmesg.txt') - print(' raw ftrace output: _boot_ftrace.txt') - print('') - print('Options:') - print(' -h Print this help text') - print(' -v Print the current tool version') - print(' -verbose Print extra information during execution and analysis') - print(' -addlogs Add the dmesg log to the html output') - print(' -result fn Export a results table to a text file for parsing.') - print(' -o name Overrides the output subdirectory name when running a new test') - print(' default: boot-{date}-{time}') - print(' [advanced]') - print(' -fstat Use ftrace to add function detail and statistics (default: disabled)') - print(' -f/-callgraph Add callgraph detail, can be very large (default: disabled)') - print(' -maxdepth N limit the callgraph data to N call levels (default: 2)') - print(' -mincg ms Discard all callgraphs shorter than ms milliseconds (e.g. 0.001 for us)') - print(' -timeprec N Number of significant digits in timestamps (0:S, 3:ms, [6:us])') - print(' -expandcg pre-expand the callgraph data in the html output (default: disabled)') - print(' -func list Limit ftrace to comma-delimited list of functions (default: do_one_initcall)') - print(' -cgfilter S Filter the callgraph output in the timeline') - print(' -cgskip file Callgraph functions to skip, off to disable (default: cgskip.txt)') - print(' -bl name Use the following boot loader for kernel params (default: grub)') - print(' -reboot Reboot the machine automatically and generate a new timeline') - print(' -manual Show the steps to generate a new timeline manually (used with -reboot)') - print('') - print('Other commands:') - print(' -flistall Print all functions capable of being captured in ftrace') - print(' -sysinfo Print out system info extracted from BIOS') - print(' [redo]') - print(' -dmesg file Create HTML output using dmesg input (used with -ftrace)') - print(' -ftrace file Create HTML output using ftrace input (used with -dmesg)') - print('') + pprint('\n%s v%s\n'\ + 'Usage: bootgraph \n'\ + '\n'\ + 'Description:\n'\ + ' This tool reads in a dmesg log of linux kernel boot and\n'\ + ' creates an html representation of the boot timeline up to\n'\ + ' the start of the init process.\n'\ + '\n'\ + ' If no specific command is given the tool reads the current dmesg\n'\ + ' and/or ftrace log and creates a timeline\n'\ + '\n'\ + ' Generates output files in subdirectory: boot-yymmdd-HHMMSS\n'\ + ' HTML output: _boot.html\n'\ + ' raw dmesg output: _boot_dmesg.txt\n'\ + ' raw ftrace output: _boot_ftrace.txt\n'\ + '\n'\ + 'Options:\n'\ + ' -h Print this help text\n'\ + ' -v Print the current tool version\n'\ + ' -verbose Print extra information during execution and analysis\n'\ + ' -addlogs Add the dmesg log to the html output\n'\ + ' -result fn Export a results table to a text file for parsing.\n'\ + ' -o name Overrides the output subdirectory name when running a new test\n'\ + ' default: boot-{date}-{time}\n'\ + ' [advanced]\n'\ + ' -fstat Use ftrace to add function detail and statistics (default: disabled)\n'\ + ' -f/-callgraph Add callgraph detail, can be very large (default: disabled)\n'\ + ' -maxdepth N limit the callgraph data to N call levels (default: 2)\n'\ + ' -mincg ms Discard all callgraphs shorter than ms milliseconds (e.g. 0.001 for us)\n'\ + ' -timeprec N Number of significant digits in timestamps (0:S, 3:ms, [6:us])\n'\ + ' -expandcg pre-expand the callgraph data in the html output (default: disabled)\n'\ + ' -func list Limit ftrace to comma-delimited list of functions (default: do_one_initcall)\n'\ + ' -cgfilter S Filter the callgraph output in the timeline\n'\ + ' -cgskip file Callgraph functions to skip, off to disable (default: cgskip.txt)\n'\ + ' -bl name Use the following boot loader for kernel params (default: grub)\n'\ + ' -reboot Reboot the machine automatically and generate a new timeline\n'\ + ' -manual Show the steps to generate a new timeline manually (used with -reboot)\n'\ + '\n'\ + 'Other commands:\n'\ + ' -flistall Print all functions capable of being captured in ftrace\n'\ + ' -sysinfo Print out system info extracted from BIOS\n'\ + ' [redo]\n'\ + ' -dmesg file Create HTML output using dmesg input (used with -ftrace)\n'\ + ' -ftrace file Create HTML output using ftrace input (used with -dmesg)\n'\ + '' % (sysvals.title, sysvals.version)) return True # ----------------- MAIN -------------------- @@ -895,7 +898,7 @@ if __name__ == '__main__': printHelp() sys.exit() elif(arg == '-v'): - print("Version %s" % sysvals.version) + pprint("Version %s" % sysvals.version) sys.exit() elif(arg == '-verbose'): sysvals.verbose = True @@ -1016,7 +1019,7 @@ if __name__ == '__main__': print f elif cmd == 'checkbl': sysvals.getBootLoader() - print 'Boot Loader: %s\n%s' % (sysvals.bootloader, sysvals.blexec) + pprint('Boot Loader: %s\n%s' % (sysvals.bootloader, sysvals.blexec)) elif(cmd == 'sysinfo'): sysvals.printSystemInfo(True) sys.exit() diff --git a/tools/power/pm-graph/config/cgskip.txt b/tools/power/pm-graph/config/cgskip.txt index e48d588fbfb4..9ff88e7e2300 100644 --- a/tools/power/pm-graph/config/cgskip.txt +++ b/tools/power/pm-graph/config/cgskip.txt @@ -27,6 +27,7 @@ ktime_get # console calls printk dev_printk +__dev_printk console_unlock # memory handling diff --git a/tools/power/pm-graph/config/custom-timeline-functions.cfg b/tools/power/pm-graph/config/custom-timeline-functions.cfg index f8fcb06fd68b..4f80ad7d7275 100644 --- a/tools/power/pm-graph/config/custom-timeline-functions.cfg +++ b/tools/power/pm-graph/config/custom-timeline-functions.cfg @@ -105,7 +105,7 @@ override-dev-timeline-functions: true # example: [color=#CC00CC] # # arglist: A list of arguments from registers/stack addresses. See URL: -# https://www.kernel.org/doc/Documentation/trace/kprobetrace.rst +# https://www.kernel.org/doc/Documentation/trace/kprobetrace.txt # # example: cpu=%di:s32 # @@ -170,7 +170,7 @@ pm_restore_console: # example: [color=#CC00CC] # # arglist: A list of arguments from registers/stack addresses. See URL: -# https://www.kernel.org/doc/Documentation/trace/kprobetrace.rst +# https://www.kernel.org/doc/Documentation/trace/kprobetrace.txt # # example: port=+36(%di):s32 # diff --git a/tools/power/pm-graph/sleepgraph.8 b/tools/power/pm-graph/sleepgraph.8 index 070be2cf7f74..24a2e7d0ae63 100644 --- a/tools/power/pm-graph/sleepgraph.8 +++ b/tools/power/pm-graph/sleepgraph.8 @@ -65,9 +65,9 @@ During test, enable/disable runtime suspend for all devices. The test is delayed by 5 seconds to allow runtime suspend changes to occur. The settings are restored after the test is complete. .TP -\fB-display \fIon/off\fR -Turn the display on or off for the test using the xset command. This helps -maintain the consistecy of test data for better comparison. +\fB-display \fIon/off/standby/suspend\fR +Switch the display to the requested mode for the test using the xset command. +This helps maintain the consistency of test data for better comparison. .TP \fB-skiphtml\fR Run the test and capture the trace logs, but skip the timeline generation. @@ -183,6 +183,13 @@ Print out the contents of the ACPI Firmware Performance Data Table. \fB-battery\fR Print out battery status and current charge. .TP +\fB-xon/-xoff/-xstandby/-xsuspend\fR +Test xset by attempting to switch the display to the given mode. This +is the same command which will be issued by \fB-display \fImode\fR. +.TP +\fB-xstat\fR +Get the current DPMS display mode. +.TP \fB-sysinfo\fR Print out system info extracted from BIOS. Reads /dev/mem directly instead of going through dmidecode. .TP diff --git a/tools/power/pm-graph/sleepgraph.py b/tools/power/pm-graph/sleepgraph.py index 0c760478f7d7..52618f3444d4 100755 --- a/tools/power/pm-graph/sleepgraph.py +++ b/tools/power/pm-graph/sleepgraph.py @@ -54,6 +54,7 @@ import os import string import re import platform +import signal from datetime import datetime import struct import ConfigParser @@ -61,6 +62,10 @@ import gzip from threading import Thread from subprocess import call, Popen, PIPE +def pprint(msg): + print(msg) + sys.stdout.flush() + # ----------------- CLASSES -------------------- # Class: SystemValues @@ -69,10 +74,10 @@ from subprocess import call, Popen, PIPE # store system values and test parameters class SystemValues: title = 'SleepGraph' - version = '5.1' + version = '5.2' ansi = False rs = 0 - display = 0 + display = '' gzip = False sync = False verbose = False @@ -99,6 +104,7 @@ class SystemValues: tpath = '/sys/kernel/debug/tracing/' fpdtpath = '/sys/firmware/acpi/tables/FPDT' epath = '/sys/kernel/debug/tracing/events/power/' + pmdpath = '/sys/power/pm_debug_messages' traceevents = [ 'suspend_resume', 'device_pm_callback_end', @@ -109,8 +115,10 @@ class SystemValues: mempath = '/dev/mem' powerfile = '/sys/power/state' mempowerfile = '/sys/power/mem_sleep' + diskpowerfile = '/sys/power/disk' suspendmode = 'mem' memmode = '' + diskmode = '' hostname = 'localhost' prefix = 'test' teststamp = '' @@ -137,16 +145,15 @@ class SystemValues: useprocmon = False notestrun = False cgdump = False + devdump = False mixedphaseheight = True devprops = dict() predelay = 0 postdelay = 0 - procexecfmt = 'ps - (?P.*)$' - devpropfmt = '# Device Properties: .*' - tracertypefmt = '# tracer: (?P.*)' - firmwarefmt = '# fwsuspend (?P[0-9]*) fwresume (?P[0-9]*)$' + pmdebug = '' tracefuncs = { 'sys_sync': {}, + 'ksys_sync': {}, '__pm_notifier_call_chain': {}, 'pm_prepare_console': {}, 'pm_notifier_call_chain': {}, @@ -187,7 +194,6 @@ class SystemValues: dev_tracefuncs = { # general wait/delay/sleep 'msleep': { 'args_x86_64': {'time':'%di:s32'}, 'ub': 1 }, - 'schedule_timeout_uninterruptible': { 'args_x86_64': {'timeout':'%di:s32'}, 'ub': 1 }, 'schedule_timeout': { 'args_x86_64': {'timeout':'%di:s32'}, 'ub': 1 }, 'udelay': { 'func':'__const_udelay', 'args_x86_64': {'loops':'%di:s32'}, 'ub': 1 }, 'usleep_range': { 'args_x86_64': {'min':'%di:s32', 'max':'%si:s32'}, 'ub': 1 }, @@ -199,6 +205,9 @@ class SystemValues: # filesystem 'ext4_sync_fs': {}, # 80211 + 'ath10k_bmi_read_memory': { 'args_x86_64': {'length':'%cx:s32'} }, + 'ath10k_bmi_write_memory': { 'args_x86_64': {'length':'%cx:s32'} }, + 'ath10k_bmi_fast_download': { 'args_x86_64': {'length':'%cx:s32'} }, 'iwlagn_mac_start': {}, 'iwlagn_alloc_bcast_station': {}, 'iwl_trans_pcie_start_hw': {}, @@ -241,6 +250,7 @@ class SystemValues: timeformat = '%.3f' cmdline = '%s %s' % \ (os.path.basename(sys.argv[0]), ' '.join(sys.argv[1:])) + sudouser = '' def __init__(self): self.archargs = 'args_'+platform.machine() self.hostname = platform.node() @@ -256,27 +266,49 @@ class SystemValues: if (hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()): self.ansi = True self.testdir = datetime.now().strftime('suspend-%y%m%d-%H%M%S') + if os.getuid() == 0 and 'SUDO_USER' in os.environ and \ + os.environ['SUDO_USER']: + self.sudouser = os.environ['SUDO_USER'] def vprint(self, msg): self.logmsg += msg+'\n' - if(self.verbose): - print(msg) + if self.verbose or msg.startswith('WARNING:'): + pprint(msg) + def signalHandler(self, signum, frame): + if not self.result: + return + signame = self.signames[signum] if signum in self.signames else 'UNKNOWN' + msg = 'Signal %s caused a tool exit, line %d' % (signame, frame.f_lineno) + sysvals.outputResult({'error':msg}) + sys.exit(3) + def signalHandlerInit(self): + capture = ['BUS', 'SYS', 'XCPU', 'XFSZ', 'PWR', 'HUP', 'INT', 'QUIT', + 'ILL', 'ABRT', 'FPE', 'SEGV', 'TERM', 'TSTP'] + self.signames = dict() + for i in capture: + s = 'SIG'+i + try: + signum = getattr(signal, s) + signal.signal(signum, self.signalHandler) + except: + continue + self.signames[signum] = s def rootCheck(self, fatal=True): if(os.access(self.powerfile, os.W_OK)): return True if fatal: msg = 'This command requires sysfs mount and root access' - print('ERROR: %s\n') % msg + pprint('ERROR: %s\n' % msg) self.outputResult({'error':msg}) - sys.exit() + sys.exit(1) return False def rootUser(self, fatal=False): if 'USER' in os.environ and os.environ['USER'] == 'root': return True if fatal: msg = 'This command must be run as root' - print('ERROR: %s\n') % msg + pprint('ERROR: %s\n' % msg) self.outputResult({'error':msg}) - sys.exit() + sys.exit(1) return False def getExec(self, cmd): dirlist = ['/sbin', '/bin', '/usr/sbin', '/usr/bin', @@ -406,8 +438,8 @@ class SystemValues: ktime = m.group('ktime') fp.close() self.dmesgstart = float(ktime) - def getdmesg(self, fwdata=[]): - op = self.writeDatafileHeader(sysvals.dmesgfile, fwdata) + def getdmesg(self, testdata): + op = self.writeDatafileHeader(sysvals.dmesgfile, testdata) # store all new dmesg lines since initdmesg was called fp = Popen('dmesg', stdout=PIPE).stdout for line in fp: @@ -535,7 +567,7 @@ class SystemValues: if len(self.kprobes) < 1: return if output: - print(' kprobe functions in this kernel:') + pprint(' kprobe functions in this kernel:') # first test each kprobe rejects = [] # sort kprobes: trace, ub-dev, custom, dev @@ -557,7 +589,7 @@ class SystemValues: else: kpl[2].append(name) if output: - print(' %s: %s' % (name, res)) + pprint(' %s: %s' % (name, res)) kplist = kpl[0] + kpl[1] + kpl[2] + kpl[3] # remove all failed ones from the list for name in rejects: @@ -571,7 +603,7 @@ class SystemValues: if output: check = self.fgetVal('kprobe_events') linesack = (len(check.split('\n')) - 1) / 2 - print(' kprobe functions enabled: %d/%d' % (linesack, linesout)) + pprint(' kprobe functions enabled: %d/%d' % (linesack, linesout)) self.fsetVal('1', 'events/kprobes/enable') def testKprobe(self, kname, kprobe): self.fsetVal('0', 'events/kprobes/enable') @@ -619,6 +651,8 @@ class SystemValues: self.fsetVal('0', 'events/kprobes/enable') self.fsetVal('', 'kprobe_events') self.fsetVal('1024', 'buffer_size_kb') + if self.pmdebug: + self.setVal(self.pmdebug, self.pmdpath) def setupAllKprobes(self): for name in self.tracefuncs: self.defaultKprobe(name, self.tracefuncs[name]) @@ -637,10 +671,15 @@ class SystemValues: return False def initFtrace(self): self.printSystemInfo(False) - print('INITIALIZING FTRACE...') + pprint('INITIALIZING FTRACE...') # turn trace off self.fsetVal('0', 'tracing_on') self.cleanupFtrace() + # pm debug messages + pv = self.getVal(self.pmdpath) + if pv != '1': + self.setVal('1', self.pmdpath) + self.pmdebug = pv # set the trace clock to global self.fsetVal('global', 'trace_clock') self.fsetVal('nop', 'current_tracer') @@ -649,7 +688,8 @@ class SystemValues: if self.bufsize > 0: tgtsize = self.bufsize elif self.usecallgraph or self.usedevsrc: - tgtsize = min(self.memfree, 3*1024*1024) + bmax = (1*1024*1024) if self.suspendmode == 'disk' else (3*1024*1024) + tgtsize = min(self.memfree, bmax) else: tgtsize = 65536 while not self.fsetVal('%d' % (tgtsize / cpus), 'buffer_size_kb'): @@ -658,7 +698,7 @@ class SystemValues: if tgtsize < 65536: tgtsize = int(self.fgetVal('buffer_size_kb')) * cpus break - print 'Setting trace buffers to %d kB (%d kB per cpu)' % (tgtsize, tgtsize/cpus) + pprint('Setting trace buffers to %d kB (%d kB per cpu)' % (tgtsize, tgtsize/cpus)) # initialize the callgraph trace if(self.usecallgraph): # set trace type @@ -691,7 +731,7 @@ class SystemValues: if self.usedevsrc: for name in self.dev_tracefuncs: self.defaultKprobe(name, self.dev_tracefuncs[name]) - print('INITIALIZING KPROBES...') + pprint('INITIALIZING KPROBES...') self.addKprobes(self.verbose) if(self.usetraceevents): # turn trace events on @@ -728,19 +768,24 @@ class SystemValues: if not self.ansi: return str return '\x1B[%d;40m%s\x1B[m' % (color, str) - def writeDatafileHeader(self, filename, fwdata=[]): + def writeDatafileHeader(self, filename, testdata): fp = self.openlog(filename, 'w') fp.write('%s\n%s\n# command | %s\n' % (self.teststamp, self.sysstamp, self.cmdline)) - if(self.suspendmode == 'mem' or self.suspendmode == 'command'): - for fw in fwdata: + for test in testdata: + if 'fw' in test: + fw = test['fw'] if(fw): fp.write('# fwsuspend %u fwresume %u\n' % (fw[0], fw[1])) + if 'bat' in test: + (a1, c1), (a2, c2) = test['bat'] + fp.write('# battery %s %d %s %d\n' % (a1, c1, a2, c2)) + if test['error'] or len(testdata) > 1: + fp.write('# enter_sleep_error %s\n' % test['error']) return fp - def sudouser(self, dir): - if os.path.exists(dir) and os.getuid() == 0 and \ - 'SUDO_USER' in os.environ: + def sudoUserchown(self, dir): + if os.path.exists(dir) and self.sudouser: cmd = 'chown -R {0}:{0} {1} > /dev/null 2>&1' - call(cmd.format(os.environ['SUDO_USER'], dir), shell=True) + call(cmd.format(self.sudouser, dir), shell=True) def outputResult(self, testdata, num=0): if not self.result: return @@ -762,7 +807,7 @@ class SystemValues: if 'bugurl' in testdata: fp.write('url%s: %s\n' % (n, testdata['bugurl'])) fp.close() - self.sudouser(self.result) + self.sudoUserchown(self.result) def configFile(self, file): dir = os.path.dirname(os.path.realpath(__file__)) if os.path.exists(file): @@ -800,15 +845,16 @@ suspendmodename = { # Simple class which holds property values collected # for all the devices used in the timeline. class DevProps: - syspath = '' - altname = '' - async = True - xtraclass = '' - xtrainfo = '' + def __init__(self): + self.syspath = '' + self.altname = '' + self.async = True + self.xtraclass = '' + self.xtrainfo = '' def out(self, dev): return '%s,%s,%d;' % (dev, self.altname, self.async) def debug(self, dev): - print '%s:\n\taltname = %s\n\t async = %s' % (dev, self.altname, self.async) + pprint('%s:\n\taltname = %s\n\t async = %s' % (dev, self.altname, self.async)) def altName(self, dev): if not self.altname or self.altname == dev: return dev @@ -831,9 +877,6 @@ class DevProps: # A container used to create a device hierachy, with a single root node # and a tree of child nodes. Used by Data.deviceTopology() class DeviceNode: - name = '' - children = 0 - depth = 0 def __init__(self, nodename, nodedepth): self.name = nodename self.children = [] @@ -861,71 +904,78 @@ class DeviceNode: # } # class Data: - dmesg = {} # root data structure - phases = [] # ordered list of phases - start = 0.0 # test start - end = 0.0 # test end - tSuspended = 0.0 # low-level suspend start - tResumed = 0.0 # low-level resume start - tKernSus = 0.0 # kernel level suspend start - tKernRes = 0.0 # kernel level resume end - tLow = 0.0 # time spent in low-level suspend (standby/freeze) - fwValid = False # is firmware data available - fwSuspend = 0 # time spent in firmware suspend - fwResume = 0 # time spent in firmware resume - dmesgtext = [] # dmesg text file in memory - pstl = 0 # process timeline - testnumber = 0 - idstr = '' - html_device_id = 0 - stamp = 0 - outfile = '' - devpids = [] - kerror = False + phasedef = { + 'suspend_prepare': {'order': 0, 'color': '#CCFFCC'}, + 'suspend': {'order': 1, 'color': '#88FF88'}, + 'suspend_late': {'order': 2, 'color': '#00AA00'}, + 'suspend_noirq': {'order': 3, 'color': '#008888'}, + 'suspend_machine': {'order': 4, 'color': '#0000FF'}, + 'resume_machine': {'order': 5, 'color': '#FF0000'}, + 'resume_noirq': {'order': 6, 'color': '#FF9900'}, + 'resume_early': {'order': 7, 'color': '#FFCC00'}, + 'resume': {'order': 8, 'color': '#FFFF88'}, + 'resume_complete': {'order': 9, 'color': '#FFFFCC'}, + } + errlist = { + 'HWERROR' : '.*\[ *Hardware Error *\].*', + 'FWBUG' : '.*\[ *Firmware Bug *\].*', + 'BUG' : '.*BUG.*', + 'ERROR' : '.*ERROR.*', + 'WARNING' : '.*WARNING.*', + 'IRQ' : '.*genirq: .*', + 'TASKFAIL': '.*Freezing of tasks failed.*', + } def __init__(self, num): idchar = 'abcdefghij' - self.pstl = dict() + self.start = 0.0 # test start + self.end = 0.0 # test end + self.tSuspended = 0.0 # low-level suspend start + self.tResumed = 0.0 # low-level resume start + self.tKernSus = 0.0 # kernel level suspend start + self.tKernRes = 0.0 # kernel level resume end + self.fwValid = False # is firmware data available + self.fwSuspend = 0 # time spent in firmware suspend + self.fwResume = 0 # time spent in firmware resume + self.html_device_id = 0 + self.stamp = 0 + self.outfile = '' + self.kerror = False + self.battery = 0 + self.enterfail = '' + self.currphase = '' + self.pstl = dict() # process timeline self.testnumber = num self.idstr = idchar[num] - self.dmesgtext = [] - self.phases = [] - self.dmesg = { # fixed list of 10 phases - 'suspend_prepare': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#CCFFCC', 'order': 0}, - 'suspend': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#88FF88', 'order': 1}, - 'suspend_late': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#00AA00', 'order': 2}, - 'suspend_noirq': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#008888', 'order': 3}, - 'suspend_machine': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#0000FF', 'order': 4}, - 'resume_machine': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#FF0000', 'order': 5}, - 'resume_noirq': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#FF9900', 'order': 6}, - 'resume_early': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#FFCC00', 'order': 7}, - 'resume': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#FFFF88', 'order': 8}, - 'resume_complete': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#FFFFCC', 'order': 9} - } - self.phases = self.sortedPhases() + self.dmesgtext = [] # dmesg text file in memory + self.dmesg = dict() # root data structure + self.errorinfo = {'suspend':[],'resume':[]} + self.tLow = [] # time spent in low-level suspends (standby/freeze) + self.devpids = [] + self.devicegroups = 0 + def sortedPhases(self): + return sorted(self.dmesg, key=lambda k:self.dmesg[k]['order']) + def initDevicegroups(self): + # called when phases are all finished being added + for phase in self.dmesg.keys(): + if '*' in phase: + p = phase.split('*') + pnew = '%s%d' % (p[0], len(p)) + self.dmesg[pnew] = self.dmesg.pop(phase) self.devicegroups = [] - for phase in self.phases: + for phase in self.sortedPhases(): self.devicegroups.append([phase]) - self.errorinfo = {'suspend':[],'resume':[]} + def nextPhase(self, phase, offset): + order = self.dmesg[phase]['order'] + offset + for p in self.dmesg: + if self.dmesg[p]['order'] == order: + return p + return '' + def lastPhase(self): + plist = self.sortedPhases() + if len(plist) < 1: + return '' + return plist[-1] def extractErrorInfo(self): - elist = { - 'HWERROR' : '.*\[ *Hardware Error *\].*', - 'FWBUG' : '.*\[ *Firmware Bug *\].*', - 'BUG' : '.*BUG.*', - 'ERROR' : '.*ERROR.*', - 'WARNING' : '.*WARNING.*', - 'IRQ' : '.*genirq: .*', - 'TASKFAIL': '.*Freezing of tasks failed.*', - } lf = sysvals.openlog(sysvals.dmesgfile, 'r') i = 0 list = [] @@ -939,8 +989,8 @@ class Data: continue dir = 'suspend' if t < self.tSuspended else 'resume' msg = m.group('msg') - for err in elist: - if re.match(elist[err], msg): + for err in self.errlist: + if re.match(self.errlist[err], msg): list.append((err, dir, t, i, i)) self.kerror = True break @@ -956,7 +1006,7 @@ class Data: def setEnd(self, time): self.end = time def isTraceEventOutsideDeviceCalls(self, pid, time): - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] for dev in list: d = list[dev] @@ -964,16 +1014,10 @@ class Data: time < d['end']): return False return True - def phaseCollision(self, phase, isbegin, line): - key = 'end' - if isbegin: - key = 'start' - if self.dmesg[phase][key] >= 0: - sysvals.vprint('IGNORE: %s' % line.strip()) - return True - return False def sourcePhase(self, start): - for phase in self.phases: + for phase in self.sortedPhases(): + if 'machine' in phase: + continue pend = self.dmesg[phase]['end'] if start <= pend: return phase @@ -1004,14 +1048,15 @@ class Data: return tgtdev def addDeviceFunctionCall(self, displayname, kprobename, proc, pid, start, end, cdata, rdata): # try to place the call in a device - tgtdev = self.sourceDevice(self.phases, start, end, pid, 'device') + phases = self.sortedPhases() + tgtdev = self.sourceDevice(phases, start, end, pid, 'device') # calls with device pids that occur outside device bounds are dropped # TODO: include these somehow if not tgtdev and pid in self.devpids: return False # try to place the call in a thread if not tgtdev: - tgtdev = self.sourceDevice(self.phases, start, end, pid, 'thread') + tgtdev = self.sourceDevice(phases, start, end, pid, 'thread') # create new thread blocks, expand as new calls are found if not tgtdev: if proc == '<...>': @@ -1053,7 +1098,7 @@ class Data: def overflowDevices(self): # get a list of devices that extend beyond the end of this test run devlist = [] - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] for devname in list: dev = list[devname] @@ -1064,7 +1109,7 @@ class Data: # merge any devices that overlap devlist for dev in devlist: devname = dev['name'] - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] if devname not in list: continue @@ -1079,7 +1124,7 @@ class Data: del list[devname] def usurpTouchingThread(self, name, dev): # the caller test has priority of this thread, give it to him - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] if name in list: tdev = list[name] @@ -1093,7 +1138,7 @@ class Data: break def stitchTouchingThreads(self, testlist): # merge any threads between tests that touch - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] for devname in list: dev = list[devname] @@ -1103,7 +1148,7 @@ class Data: data.usurpTouchingThread(devname, dev) def optimizeDevSrc(self): # merge any src call loops to reduce timeline size - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] for dev in list: if 'src' not in list[dev]: @@ -1141,7 +1186,7 @@ class Data: self.tKernSus = self.trimTimeVal(self.tKernSus, t0, dT, left) self.tKernRes = self.trimTimeVal(self.tKernRes, t0, dT, left) self.end = self.trimTimeVal(self.end, t0, dT, left) - for phase in self.phases: + for phase in self.sortedPhases(): p = self.dmesg[phase] p['start'] = self.trimTimeVal(p['start'], t0, dT, left) p['end'] = self.trimTimeVal(p['end'], t0, dT, left) @@ -1150,6 +1195,7 @@ class Data: d = list[name] d['start'] = self.trimTimeVal(d['start'], t0, dT, left) d['end'] = self.trimTimeVal(d['end'], t0, dT, left) + d['length'] = d['end'] - d['start'] if('ftrace' in d): cg = d['ftrace'] cg.start = self.trimTimeVal(cg.start, t0, dT, left) @@ -1166,30 +1212,51 @@ class Data: tm = self.trimTimeVal(tm, t0, dT, left) list.append((type, tm, idx1, idx2)) self.errorinfo[dir] = list - def normalizeTime(self, tZero): + def trimFreezeTime(self, tZero): # trim out any standby or freeze clock time - if(self.tSuspended != self.tResumed): - if(self.tResumed > tZero): - self.trimTime(self.tSuspended, \ - self.tResumed-self.tSuspended, True) - else: - self.trimTime(self.tSuspended, \ - self.tResumed-self.tSuspended, False) + lp = '' + for phase in self.sortedPhases(): + if 'resume_machine' in phase and 'suspend_machine' in lp: + tS, tR = self.dmesg[lp]['end'], self.dmesg[phase]['start'] + tL = tR - tS + if tL > 0: + left = True if tR > tZero else False + self.trimTime(tS, tL, left) + self.tLow.append('%.0f'%(tL*1000)) + lp = phase def getTimeValues(self): - sktime = (self.dmesg['suspend_machine']['end'] - \ - self.tKernSus) * 1000 - rktime = (self.dmesg['resume_complete']['end'] - \ - self.dmesg['resume_machine']['start']) * 1000 + sktime = (self.tSuspended - self.tKernSus) * 1000 + rktime = (self.tKernRes - self.tResumed) * 1000 return (sktime, rktime) - def setPhase(self, phase, ktime, isbegin): + def setPhase(self, phase, ktime, isbegin, order=-1): if(isbegin): + # phase start over current phase + if self.currphase: + if 'resume_machine' not in self.currphase: + sysvals.vprint('WARNING: phase %s failed to end' % self.currphase) + self.dmesg[self.currphase]['end'] = ktime + phases = self.dmesg.keys() + color = self.phasedef[phase]['color'] + count = len(phases) if order < 0 else order + # create unique name for every new phase + while phase in phases: + phase += '*' + self.dmesg[phase] = {'list': dict(), 'start': -1.0, 'end': -1.0, + 'row': 0, 'color': color, 'order': count} self.dmesg[phase]['start'] = ktime + self.currphase = phase else: + # phase end without a start + if phase not in self.currphase: + if self.currphase: + sysvals.vprint('WARNING: %s ended instead of %s, ftrace corruption?' % (phase, self.currphase)) + else: + sysvals.vprint('WARNING: %s ended without a start, ftrace corruption?' % phase) + return phase + phase = self.currphase self.dmesg[phase]['end'] = ktime - def dmesgSortVal(self, phase): - return self.dmesg[phase]['order'] - def sortedPhases(self): - return sorted(self.dmesg, key=self.dmesgSortVal) + self.currphase = '' + return phase def sortedDevices(self, phase): list = self.dmesg[phase]['list'] slist = [] @@ -1208,13 +1275,13 @@ class Data: for devname in phaselist: dev = phaselist[devname] if(dev['end'] < 0): - for p in self.phases: + for p in self.sortedPhases(): if self.dmesg[p]['end'] > dev['start']: dev['end'] = self.dmesg[p]['end'] break sysvals.vprint('%s (%s): callback didnt return' % (devname, phase)) def deviceFilter(self, devicefilter): - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] rmlist = [] for name in list: @@ -1229,7 +1296,7 @@ class Data: del list[name] def fixupInitcallsThatDidntReturn(self): # if any calls never returned, clip them at system resume end - for phase in self.phases: + for phase in self.sortedPhases(): self.fixupInitcalls(phase) def phaseOverlap(self, phases): rmgroups = [] @@ -1248,17 +1315,18 @@ class Data: self.devicegroups.append(newgroup) def newActionGlobal(self, name, start, end, pid=-1, color=''): # which phase is this device callback or action in + phases = self.sortedPhases() targetphase = 'none' htmlclass = '' overlap = 0.0 - phases = [] - for phase in self.phases: + myphases = [] + for phase in phases: pstart = self.dmesg[phase]['start'] pend = self.dmesg[phase]['end'] # see if the action overlaps this phase o = max(0, min(end, pend) - max(start, pstart)) if o > 0: - phases.append(phase) + myphases.append(phase) # set the target phase to the one that overlaps most if o > overlap: if overlap > 0 and phase == 'post_resume': @@ -1267,19 +1335,19 @@ class Data: overlap = o # if no target phase was found, pin it to the edge if targetphase == 'none': - p0start = self.dmesg[self.phases[0]]['start'] + p0start = self.dmesg[phases[0]]['start'] if start <= p0start: - targetphase = self.phases[0] + targetphase = phases[0] else: - targetphase = self.phases[-1] + targetphase = phases[-1] if pid == -2: htmlclass = ' bg' elif pid == -3: htmlclass = ' ps' - if len(phases) > 1: + if len(myphases) > 1: htmlclass = ' bg' - self.phaseOverlap(phases) - if targetphase in self.phases: + self.phaseOverlap(myphases) + if targetphase in phases: newname = self.newAction(targetphase, name, pid, '', start, end, '', htmlclass, color) return (targetphase, newname) return False @@ -1311,19 +1379,43 @@ class Data: if(list[child]['par'] == devname): devlist.append(child) return devlist + def maxDeviceNameSize(self, phase): + size = 0 + for name in self.dmesg[phase]['list']: + if len(name) > size: + size = len(name) + return size def printDetails(self): sysvals.vprint('Timeline Details:') sysvals.vprint(' test start: %f' % self.start) sysvals.vprint('kernel suspend start: %f' % self.tKernSus) - for phase in self.phases: - dc = len(self.dmesg[phase]['list']) - sysvals.vprint(' %16s: %f - %f (%d devices)' % (phase, \ - self.dmesg[phase]['start'], self.dmesg[phase]['end'], dc)) + tS = tR = False + for phase in self.sortedPhases(): + devlist = self.dmesg[phase]['list'] + dc, ps, pe = len(devlist), self.dmesg[phase]['start'], self.dmesg[phase]['end'] + if not tS and ps >= self.tSuspended: + sysvals.vprint(' machine suspended: %f' % self.tSuspended) + tS = True + if not tR and ps >= self.tResumed: + sysvals.vprint(' machine resumed: %f' % self.tResumed) + tR = True + sysvals.vprint('%20s: %f - %f (%d devices)' % (phase, ps, pe, dc)) + if sysvals.devdump: + sysvals.vprint(''.join('-' for i in range(80))) + maxname = '%d' % self.maxDeviceNameSize(phase) + fmt = '%3d) %'+maxname+'s - %f - %f' + c = 1 + for name in devlist: + s = devlist[name]['start'] + e = devlist[name]['end'] + sysvals.vprint(fmt % (c, name, s, e)) + c += 1 + sysvals.vprint(''.join('-' for i in range(80))) sysvals.vprint(' kernel resume end: %f' % self.tKernRes) sysvals.vprint(' test end: %f' % self.end) def deviceChildrenAllPhases(self, devname): devlist = [] - for phase in self.phases: + for phase in self.sortedPhases(): list = self.deviceChildren(devname, phase) for dev in list: if dev not in devlist: @@ -1344,7 +1436,7 @@ class Data: if node.name: info = '' drv = '' - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] if node.name in list: s = list[node.name]['start'] @@ -1478,8 +1570,29 @@ class Data: c = self.addProcessUsageEvent(ps, tres) if c > 0: sysvals.vprint('%25s (res): %d' % (ps, c)) + def handleEndMarker(self, time): + dm = self.dmesg + self.setEnd(time) + self.initDevicegroups() + # give suspend_prepare an end if needed + if 'suspend_prepare' in dm and dm['suspend_prepare']['end'] < 0: + dm['suspend_prepare']['end'] = time + # assume resume machine ends at next phase start + if 'resume_machine' in dm and dm['resume_machine']['end'] < 0: + np = self.nextPhase('resume_machine', 1) + if np: + dm['resume_machine']['end'] = dm[np]['start'] + # if kernel resume end not found, assume its the end marker + if self.tKernRes == 0.0: + self.tKernRes = time + # if kernel suspend start not found, assume its the end marker + if self.tKernSus == 0.0: + self.tKernSus = time + # set resume complete to end at end marker + if 'resume_complete' in dm: + dm['resume_complete']['end'] = time def debugPrint(self): - for p in self.phases: + for p in self.sortedPhases(): list = self.dmesg[p]['list'] for devname in list: dev = list[devname] @@ -1490,9 +1603,9 @@ class Data: # Description: # A container for kprobe function data we want in the dev timeline class DevFunction: - row = 0 - count = 1 def __init__(self, name, args, caller, ret, start, end, u, proc, pid, color): + self.row = 0 + self.count = 1 self.name = name self.args = args self.caller = caller @@ -1546,16 +1659,15 @@ class DevFunction: # suspend_resume: phase or custom exec block data # device_pm_callback: device callback info class FTraceLine: - time = 0.0 - length = 0.0 - fcall = False - freturn = False - fevent = False - fkprobe = False - depth = 0 - name = '' - type = '' def __init__(self, t, m='', d=''): + self.length = 0.0 + self.fcall = False + self.freturn = False + self.fevent = False + self.fkprobe = False + self.depth = 0 + self.name = '' + self.type = '' self.time = float(t) if not m and not d: return @@ -1633,13 +1745,13 @@ class FTraceLine: return len(str)/2 def debugPrint(self, info=''): if self.isLeaf(): - print(' -- %12.6f (depth=%02d): %s(); (%.3f us) %s' % (self.time, \ + pprint(' -- %12.6f (depth=%02d): %s(); (%.3f us) %s' % (self.time, \ self.depth, self.name, self.length*1000000, info)) elif self.freturn: - print(' -- %12.6f (depth=%02d): %s} (%.3f us) %s' % (self.time, \ + pprint(' -- %12.6f (depth=%02d): %s} (%.3f us) %s' % (self.time, \ self.depth, self.name, self.length*1000000, info)) else: - print(' -- %12.6f (depth=%02d): %s() { (%.3f us) %s' % (self.time, \ + pprint(' -- %12.6f (depth=%02d): %s() { (%.3f us) %s' % (self.time, \ self.depth, self.name, self.length*1000000, info)) def startMarker(self): # Is this the starting line of a suspend? @@ -1675,19 +1787,13 @@ class FTraceLine: # Each instance is tied to a single device in a single phase, and is # comprised of an ordered list of FTraceLine objects class FTraceCallGraph: - id = '' - start = -1.0 - end = -1.0 - list = [] - invalid = False - depth = 0 - pid = 0 - name = '' - partial = False vfname = 'missing_function_name' - ignore = False - sv = 0 def __init__(self, pid, sv): + self.id = '' + self.invalid = False + self.name = '' + self.partial = False + self.ignore = False self.start = -1.0 self.end = -1.0 self.list = [] @@ -1786,7 +1892,7 @@ class FTraceCallGraph: if warning and ('[make leaf]', line) not in info: info.append(('', line)) if warning: - print 'WARNING: ftrace data missing, corrections made:' + pprint('WARNING: ftrace data missing, corrections made:') for i in info: t, obj = i if obj: @@ -1846,10 +1952,10 @@ class FTraceCallGraph: id = 'task %s' % (self.pid) window = '(%f - %f)' % (self.start, line.time) if(self.depth < 0): - print('Data misalignment for '+id+\ + pprint('Data misalignment for '+id+\ ' (buffer overflow), ignoring this callback') else: - print('Too much data for '+id+\ + pprint('Too much data for '+id+\ ' '+window+', ignoring this callback') def slice(self, dev): minicg = FTraceCallGraph(dev['pid'], self.sv) @@ -1902,7 +2008,7 @@ class FTraceCallGraph: elif l.isReturn(): if(l.depth not in stack): if self.sv.verbose: - print 'Post Process Error: Depth missing' + pprint('Post Process Error: Depth missing') l.debugPrint() return False # calculate call length from call/return lines @@ -1919,7 +2025,7 @@ class FTraceCallGraph: return True elif(cnt < 0): if self.sv.verbose: - print 'Post Process Error: Depth is less than 0' + pprint('Post Process Error: Depth is less than 0') return False # trace ended before call tree finished return self.repair(cnt) @@ -1943,7 +2049,7 @@ class FTraceCallGraph: dev['ftrace'] = cg found = devname return found - for p in data.phases: + for p in data.sortedPhases(): if(data.dmesg[p]['start'] <= self.start and self.start <= data.dmesg[p]['end']): list = data.dmesg[p]['list'] @@ -1966,7 +2072,7 @@ class FTraceCallGraph: if fs < data.start or fe > data.end: return phase = '' - for p in data.phases: + for p in data.sortedPhases(): if(data.dmesg[p]['start'] <= self.start and self.start < data.dmesg[p]['end']): phase = p @@ -1978,20 +2084,20 @@ class FTraceCallGraph: phase, myname = out data.dmesg[phase]['list'][myname]['ftrace'] = self def debugPrint(self, info=''): - print('%s pid=%d [%f - %f] %.3f us') % \ + pprint('%s pid=%d [%f - %f] %.3f us' % \ (self.name, self.pid, self.start, self.end, - (self.end - self.start)*1000000) + (self.end - self.start)*1000000)) for l in self.list: if l.isLeaf(): - print('%f (%02d): %s(); (%.3f us)%s' % (l.time, \ + pprint('%f (%02d): %s(); (%.3f us)%s' % (l.time, \ l.depth, l.name, l.length*1000000, info)) elif l.freturn: - print('%f (%02d): %s} (%.3f us)%s' % (l.time, \ + pprint('%f (%02d): %s} (%.3f us)%s' % (l.time, \ l.depth, l.name, l.length*1000000, info)) else: - print('%f (%02d): %s() { (%.3f us)%s' % (l.time, \ + pprint('%f (%02d): %s() { (%.3f us)%s' % (l.time, \ l.depth, l.name, l.length*1000000, info)) - print(' ') + pprint(' ') class DevItem: def __init__(self, test, phase, dev): @@ -2008,23 +2114,20 @@ class DevItem: # A container for a device timeline which calculates # all the html properties to display it correctly class Timeline: - html = '' - height = 0 # total timeline height - scaleH = 20 # timescale (top) row height - rowH = 30 # device row height - bodyH = 0 # body height - rows = 0 # total timeline rows - rowlines = dict() - rowheight = dict() html_tblock = '

    \n' html_device = '
    {6}
    \n' html_phase = '
    {5}
    \n' html_phaselet = '
    \n' html_legend = '
     {2}
    \n' def __init__(self, rowheight, scaleheight): - self.rowH = rowheight - self.scaleH = scaleheight self.html = '' + self.height = 0 # total timeline height + self.scaleH = scaleheight # timescale (top) row height + self.rowH = rowheight # device row height + self.bodyH = 0 # body height + self.rows = 0 # total timeline rows + self.rowlines = dict() + self.rowheight = dict() def createHeader(self, sv, stamp): if(not stamp['time']): return @@ -2251,18 +2354,18 @@ class Timeline: # Description: # A list of values describing the properties of these test runs class TestProps: - stamp = '' - sysinfo = '' - cmdline = '' - kparams = '' - S0i3 = False - fwdata = [] stampfmt = '# [a-z]*-(?P[0-9]{2})(?P[0-9]{2})(?P[0-9]{2})-'+\ '(?P[0-9]{2})(?P[0-9]{2})(?P[0-9]{2})'+\ ' (?P.*) (?P.*) (?P.*)$' + batteryfmt = '^# battery (?P\w*) (?P\d*) (?P\w*) (?P\d*)' + testerrfmt = '^# enter_sleep_error (?P.*)' sysinfofmt = '^# sysinfo .*' cmdlinefmt = '^# command \| (?P.*)' kparamsfmt = '^# kparams \| (?P.*)' + devpropfmt = '# Device Properties: .*' + tracertypefmt = '# tracer: (?P.*)' + firmwarefmt = '# fwsuspend (?P[0-9]*) fwresume (?P[0-9]*)$' + procexecfmt = 'ps - (?P.*)$' ftrace_line_fmt_fg = \ '^ *(?P