Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 14 Aug 2014 23:28:08 +0000 (17:28 -0600)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 14 Aug 2014 23:28:08 +0000 (17:28 -0600)
Pull Sparc fixes from David Miller:
 "Hook up the memfd syscall, and properly claim all PCI resources
  discovered when building the PCI device tree"

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc:
  sparc: Hook up memfd_create system call.
  sparc64: Properly claim resources as each PCI bus is probed.
  sparc64: Skip bogus PCI bridge ranges.
  sparc64: Expand PCI bridge probing debug logging.

319 files changed:
.gitignore
Documentation/device-mapper/switch.txt
Documentation/devicetree/bindings/mmc/exynos-dw-mshc.txt
Documentation/devicetree/bindings/mmc/k3-dw-mshc.txt
Documentation/devicetree/bindings/mmc/mmc.txt
Documentation/devicetree/bindings/mmc/renesas,mmcif.txt [new file with mode: 0644]
Documentation/devicetree/bindings/mmc/sdhci-msm.txt
Documentation/devicetree/bindings/mmc/sdhci-st.txt [new file with mode: 0644]
Documentation/devicetree/bindings/mmc/synopsys-dw-mshc.txt
Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt
Documentation/devicetree/bindings/mmc/tmio_mmc.txt
Documentation/devicetree/changesets.txt [new file with mode: 0644]
Documentation/devicetree/todo.txt [new file with mode: 0644]
Documentation/infiniband/user_mad.txt
Documentation/kbuild/00-INDEX
Documentation/kbuild/headers_install.txt [moved from Documentation/make/headers_install.txt with 100% similarity]
Documentation/kbuild/makefiles.txt
Makefile
arch/arm/boot/dts/versatile-ab.dts
arch/arm/boot/dts/versatile-pb.dts
arch/arm/xen/grant-table.c
arch/ia64/Makefile
arch/powerpc/boot/gunzip_util.c
arch/powerpc/include/asm/cputable.h
arch/powerpc/include/asm/machdep.h
arch/powerpc/include/asm/opal.h
arch/powerpc/include/asm/pgtable-ppc64.h
arch/powerpc/include/asm/pte-hash64-64k.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/asm/spinlock.h
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/head_44x.S
arch/powerpc/kernel/iommu.c
arch/powerpc/kernel/prom.c
arch/powerpc/kernel/smp.c
arch/powerpc/lib/locks.c
arch/powerpc/mm/hash_native_64.c
arch/powerpc/mm/hugepage-hash64.c
arch/powerpc/mm/numa.c
arch/powerpc/mm/pgtable_64.c
arch/powerpc/mm/tlb_hash64.c
arch/powerpc/mm/tlb_nohash.c
arch/powerpc/perf/hv-24x7.c
arch/powerpc/platforms/powermac/feature.c
arch/powerpc/platforms/powermac/pci.c
arch/powerpc/platforms/powermac/smp.c
arch/powerpc/platforms/powermac/udbg_adb.c
arch/powerpc/platforms/powernv/opal-wrappers.S
arch/powerpc/platforms/powernv/opal.c
arch/powerpc/platforms/powernv/pci-ioda.c
arch/powerpc/platforms/pseries/hotplug-memory.c
arch/powerpc/platforms/pseries/hvcserver.c
arch/powerpc/platforms/pseries/iommu.c
arch/powerpc/platforms/pseries/lpar.c
arch/powerpc/platforms/pseries/setup.c
arch/powerpc/xmon/xmon.c
arch/x86/xen/grant-table.c
arch/x86/xen/time.c
block/bio-integrity.c
block/bio.c
block/blk-core.c
block/blk-mq.c
block/blk-mq.h
block/blk-sysfs.c
block/compat_ioctl.c
block/ioctl.c
block/partitions/aix.c
block/partitions/amiga.c
block/partitions/efi.c
block/partitions/msdos.c
block/scsi_ioctl.c
drivers/block/drbd/Makefile
drivers/block/drbd/drbd_actlog.c
drivers/block/drbd/drbd_bitmap.c
drivers/block/drbd/drbd_debugfs.c [new file with mode: 0644]
drivers/block/drbd/drbd_debugfs.h [new file with mode: 0644]
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_interval.h
drivers/block/drbd/drbd_main.c
drivers/block/drbd/drbd_nl.c
drivers/block/drbd/drbd_proc.c
drivers/block/drbd/drbd_receiver.c
drivers/block/drbd/drbd_req.c
drivers/block/drbd/drbd_req.h
drivers/block/drbd/drbd_state.c
drivers/block/drbd/drbd_worker.c
drivers/block/virtio_blk.c
drivers/cpufreq/pmac64-cpufreq.c
drivers/crypto/nx/nx-842.c
drivers/edac/cell_edac.c
drivers/hwmon/adm1025.c
drivers/hwmon/adm1026.c
drivers/hwmon/ads1015.c
drivers/hwmon/asb100.c
drivers/hwmon/dme1737.c
drivers/hwmon/emc6w201.c
drivers/hwmon/hih6130.c
drivers/hwmon/lm87.c
drivers/hwmon/lm92.c
drivers/hwmon/pc87360.c
drivers/hwmon/tmp103.c
drivers/hwmon/vt1211.c
drivers/hwmon/w83627hf.c
drivers/hwmon/w83791d.c
drivers/hwmon/w83793.c
drivers/hwspinlock/Kconfig
drivers/hwspinlock/omap_hwspinlock.c
drivers/infiniband/core/agent.c
drivers/infiniband/core/cm.c
drivers/infiniband/core/iwcm.c
drivers/infiniband/core/mad.c
drivers/infiniband/core/mad_priv.h
drivers/infiniband/core/sa_query.c
drivers/infiniband/core/user_mad.c
drivers/infiniband/core/uverbs.h
drivers/infiniband/core/uverbs_cmd.c
drivers/infiniband/core/uverbs_main.c
drivers/infiniband/hw/amso1100/c2_cq.c
drivers/infiniband/hw/cxgb4/ev.c
drivers/infiniband/hw/cxgb4/qp.c
drivers/infiniband/hw/cxgb4/t4.h
drivers/infiniband/hw/ipath/ipath_mad.c
drivers/infiniband/hw/mlx4/mad.c
drivers/infiniband/hw/mlx4/main.c
drivers/infiniband/hw/mlx4/mlx4_ib.h
drivers/infiniband/hw/mlx4/mr.c
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/hw/mthca/mthca_mad.c
drivers/infiniband/hw/ocrdma/ocrdma.h
drivers/infiniband/hw/ocrdma/ocrdma_ah.c
drivers/infiniband/hw/ocrdma/ocrdma_hw.c
drivers/infiniband/hw/ocrdma/ocrdma_hw.h
drivers/infiniband/hw/ocrdma/ocrdma_main.c
drivers/infiniband/hw/ocrdma/ocrdma_sli.h
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
drivers/infiniband/hw/qib/qib_mad.c
drivers/infiniband/ulp/ipoib/ipoib.h
drivers/infiniband/ulp/ipoib/ipoib_fs.c
drivers/infiniband/ulp/ipoib/ipoib_ib.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/iser/iscsi_iser.c
drivers/infiniband/ulp/iser/iscsi_iser.h
drivers/infiniband/ulp/iser/iser_verbs.c
drivers/infiniband/ulp/srp/ib_srp.c
drivers/infiniband/ulp/srpt/ib_srpt.c
drivers/md/bcache/alloc.c
drivers/md/bcache/bcache.h
drivers/md/bcache/bset.c
drivers/md/bcache/bset.h
drivers/md/bcache/btree.c
drivers/md/bcache/btree.h
drivers/md/bcache/extents.c
drivers/md/bcache/extents.h
drivers/md/bcache/journal.c
drivers/md/bcache/request.c
drivers/md/bcache/super.c
drivers/md/bcache/util.h
drivers/md/bcache/writeback.c
drivers/md/bcache/writeback.h
drivers/md/dm-cache-metadata.c
drivers/md/dm-cache-metadata.h
drivers/md/dm-cache-target.c
drivers/md/dm-crypt.c
drivers/md/dm-io.c
drivers/md/dm-mpath.c
drivers/md/dm-switch.c
drivers/md/dm-table.c
drivers/md/dm-thin.c
drivers/md/dm.h
drivers/mfd/rtsx_usb.c
drivers/mmc/card/block.c
drivers/mmc/core/bus.c
drivers/mmc/core/core.c
drivers/mmc/core/mmc.c
drivers/mmc/core/quirks.c
drivers/mmc/core/sd_ops.c
drivers/mmc/host/Kconfig
drivers/mmc/host/Makefile
drivers/mmc/host/dw_mmc.c
drivers/mmc/host/dw_mmc.h
drivers/mmc/host/mmci.c
drivers/mmc/host/mmci.h
drivers/mmc/host/moxart-mmc.c
drivers/mmc/host/mxs-mmc.c
drivers/mmc/host/omap_hsmmc.c
drivers/mmc/host/s3cmci.c
drivers/mmc/host/s3cmci.h
drivers/mmc/host/sdhci-acpi.c
drivers/mmc/host/sdhci-msm.c
drivers/mmc/host/sdhci-pci.c
drivers/mmc/host/sdhci-pci.h
drivers/mmc/host/sdhci-pxav3.c
drivers/mmc/host/sdhci-st.c [new file with mode: 0644]
drivers/mmc/host/sdhci-tegra.c
drivers/mmc/host/sdhci.c
drivers/mmc/host/sh_mmcif.c
drivers/mmc/host/tmio_mmc_dma.c
drivers/mmc/host/wmt-sdmmc.c
drivers/net/ethernet/apm/xgene/xgene_enet_main.c
drivers/net/ethernet/broadcom/tg3.c
drivers/net/ethernet/emulex/benet/be.h
drivers/net/ethernet/emulex/benet/be_main.c
drivers/net/ethernet/emulex/benet/be_roce.c
drivers/net/ethernet/emulex/benet/be_roce.h
drivers/net/ethernet/ibm/ehea/Makefile
drivers/net/ethernet/intel/e1000e/manage.c
drivers/net/ethernet/intel/i40e/i40e_fcoe.c
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40e/i40e_nvm.c
drivers/net/ethernet/mellanox/mlx4/cmd.c
drivers/net/ethernet/mellanox/mlx4/fw.c
drivers/net/ethernet/mellanox/mlx4/main.c
drivers/net/ethernet/mellanox/mlx4/mlx4.h
drivers/net/ethernet/mellanox/mlx4/mr.c
drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
drivers/net/ethernet/myricom/myri10ge/myri10ge.c
drivers/net/ethernet/sun/sunvnet.c
drivers/net/ethernet/sun/sunvnet.h
drivers/net/ethernet/xilinx/ll_temac_main.c
drivers/net/ethernet/xilinx/xilinx_axienet_main.c
drivers/net/ethernet/xilinx/xilinx_emaclite.c
drivers/net/irda/donauboe.c
drivers/net/macvlan.c
drivers/net/wireless/ath/carl9170/carl9170.h
drivers/net/wireless/ath/carl9170/usb.c
drivers/net/wireless/brcm80211/brcmfmac/msgbuf.c
drivers/net/wireless/brcm80211/brcmfmac/pcie.c
drivers/net/wireless/ipw2x00/ipw2200.c
drivers/net/wireless/iwlwifi/mvm/mac80211.c
drivers/net/xen-netback/common.h
drivers/net/xen-netback/interface.c
drivers/net/xen-netback/netback.c
drivers/net/xen-netback/xenbus.c
drivers/of/Kconfig
drivers/of/Makefile
drivers/of/base.c
drivers/of/device.c
drivers/of/dynamic.c [new file with mode: 0644]
drivers/of/fdt.c
drivers/of/of_private.h
drivers/of/of_reserved_mem.c
drivers/of/platform.c
drivers/of/selftest.c
drivers/of/testcase-data/testcases.dts [new file with mode: 0644]
drivers/of/testcase-data/testcases.dtsi [deleted file]
drivers/pci/hotplug/rpaphp_core.c
drivers/scsi/cxgbi/cxgb3i/Kconfig
drivers/scsi/cxgbi/cxgb4i/Kconfig
drivers/scsi/scsi_transport_srp.c
drivers/tty/ehv_bytechan.c
drivers/tty/hvc/hvc_opal.c
drivers/tty/hvc/hvc_vio.c
drivers/tty/serial/pmac_zilog.c
drivers/tty/serial/serial_core.c
drivers/vfio/Kconfig
drivers/vfio/Makefile
drivers/vfio/pci/vfio_pci.c
drivers/vfio/pci/vfio_pci_private.h
drivers/vfio/vfio_spapr_eeh.c
include/linux/bio.h
include/linux/blkdev.h
include/linux/drbd.h
include/linux/drbd_genl.h
include/linux/drbd_limits.h
include/linux/mlx4/cmd.h
include/linux/mlx4/device.h
include/linux/mmc/dw_mmc.h
include/linux/mmc/sdhci.h
include/linux/of.h
include/linux/of_platform.h
include/linux/of_reserved_mem.h
include/linux/platform_data/mmc-omap.h
include/linux/printk.h
include/linux/rhashtable.h
include/linux/vfio.h
include/net/inet_connection_sock.h
include/net/sock.h
include/net/tcp.h
include/rdma/ib_mad.h
include/rdma/ib_verbs.h
include/scsi/sg.h
include/trace/events/bcache.h
include/trace/events/thp.h [new file with mode: 0644]
include/uapi/linux/bsg.h
include/uapi/linux/virtio_blk.h
include/uapi/rdma/ib_user_mad.h
include/uapi/rdma/ib_user_verbs.h
include/uapi/rdma/rdma_user_cm.h
init/Kconfig
kernel/fork.c
kernel/printk/printk.c
kernel/seccomp.c
kernel/time/timekeeping.c
lib/Kconfig.debug
lib/lru_cache.c
lib/rhashtable.c
mm/hugetlb_cgroup.c
net/atm/lec.c
net/atm/svc.c
net/ipv4/tcp.c
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_metrics.c
net/ipv4/tcp_output.c
net/ipv6/sit.c
net/ipv6/tcp_ipv6.c
net/irda/irlap_frame.c
net/netlink/af_netlink.c
net/openvswitch/vport.c
scripts/Kbuild.include
scripts/Makefile.clean
scripts/Makefile.extrawarn
scripts/Makefile.host
scripts/coccinelle/api/alloc/alloc_cast.cocci [new file with mode: 0644]
scripts/coccinelle/misc/array_size.cocci [new file with mode: 0644]
scripts/coccinelle/misc/badty.cocci [new file with mode: 0644]
scripts/coccinelle/misc/bugon.cocci [moved from scripts/coccinelle/api/alloc/drop_kmalloc_cast.cocci with 50% similarity]
scripts/coccinelle/null/badzero.cocci
sound/ppc/pmac.c

index f4c0b091dcf4e6413cbe70f0ec345a5894e0885e..e213b27f3921a88768201638d6d358f0f0c80419 100644 (file)
@@ -34,6 +34,7 @@
 *.gcno
 modules.builtin
 Module.symvers
+*.dwo
 
 #
 # Top-level generic files
index 2fa749387be807ade453ba645fbcf8250afffb14..8897d04948384289b3fca54801be9676c15ce0e5 100644 (file)
@@ -106,6 +106,11 @@ which paths.
     The path number in the range 0 ... (<num_paths> - 1).
     Expressed in hexadecimal (WITHOUT any prefix like 0x).
 
+R<n>,<m>
+    This parameter allows repetitive patterns to be loaded quickly. <n> and <m>
+    are hexadecimal numbers. The last <n> mappings are repeated in the next <m>
+    slots.
+
 Status
 ======
 
@@ -124,3 +129,10 @@ Create a switch device with 64kB region size:
 Set mappings for the first 7 entries to point to devices switch0, switch1,
 switch2, switch0, switch1, switch2, switch1:
     dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1
+
+Set repetitive mapping. This command:
+    dmsetup message switch 0 set_region_mappings 1000:1 :2 R2,10
+is equivalent to:
+    dmsetup message switch 0 set_region_mappings 1000:1 :2 :1 :2 :1 :2 :1 :2 \
+       :1 :2 :1 :2 :1 :2 :1 :2 :1 :2
+
index 532b1d440abc15d1f1d1e61791b274a6ec8dafe0..6cd3525d0e09514acedaf89c1c1abeeffad7cf95 100644 (file)
@@ -46,13 +46,14 @@ Required Properties:
       - if CIU clock divider value is 0 (that is divide by 1), both tx and rx
         phase shift clocks should be 0.
 
-Required properties for a slot:
+Required properties for a slot (Deprecated - Recommend to use one slot per host):
 
 * gpios: specifies a list of gpios used for command, clock and data bus. The
   first gpio is the command line and the second gpio is the clock line. The
   rest of the gpios (depending on the bus-width property) are the data lines in
   no particular order. The format of the gpio specifier depends on the gpio
   controller.
+(Deprecated - Refer to Documentation/devicetree/bindings/pinctrl/samsung-pinctrl.txt)
 
 Example:
 
@@ -69,21 +70,13 @@ Example:
 
        dwmmc0@12200000 {
                num-slots = <1>;
-               supports-highspeed;
+               cap-mmc-highspeed;
+               cap-sd-highspeed;
                broken-cd;
                fifo-depth = <0x80>;
                card-detect-delay = <200>;
                samsung,dw-mshc-ciu-div = <3>;
                samsung,dw-mshc-sdr-timing = <2 3>;
                samsung,dw-mshc-ddr-timing = <1 2>;
-
-               slot@0 {
-                       reg = <0>;
-                       bus-width = <8>;
-                       gpios = <&gpc0 0 2 0 3>, <&gpc0 1 2 0 3>,
-                               <&gpc1 0 2 3 3>, <&gpc1 1 2 3 3>,
-                               <&gpc1 2 2 3 3>, <&gpc1 3 2 3 3>,
-                               <&gpc0 3 2 3 3>, <&gpc0 4 2 3 3>,
-                               <&gpc0 5 2 3 3>, <&gpc0 6 2 3 3>;
-               };
+               bus-width = <8>;
        };
index e5bc49f764d10a982efe9a8a92f9327b19f8e575..3b3544931437accded12ada0ec38c786441d1a02 100644 (file)
@@ -34,13 +34,11 @@ Example:
                num-slots = <1>;
                vmmc-supply = <&ldo12>;
                fifo-depth = <0x100>;
-               supports-highspeed;
                pinctrl-names = "default";
                pinctrl-0 = <&sd_pmx_pins &sd_cfg_func1 &sd_cfg_func2>;
-               slot@0 {
-                       reg = <0>;
-                       bus-width = <4>;
-                       disable-wp;
-                       cd-gpios = <&gpio10 3 0>;
-               };
+               bus-width = <4>;
+               disable-wp;
+               cd-gpios = <&gpio10 3 0>;
+               cap-mmc-highspeed;
+               cap-sd-highspeed;
        };
index 3c18001dfd5d75fe91038926fd6da8f98637fe88..431716e37a3964638245b2905badff16128b9495 100644 (file)
@@ -34,8 +34,8 @@ Optional properties:
 - cap-power-off-card: powering off the card is safe
 - cap-sdio-irq: enable SDIO IRQ signalling on this interface
 - full-pwr-cycle: full power cycle of the card is supported
-- mmc-highspeed-ddr-1_8v: eMMC high-speed DDR mode(1.8V I/O) is supported
-- mmc-highspeed-ddr-1_2v: eMMC high-speed DDR mode(1.2V I/O) is supported
+- mmc-ddr-1_8v: eMMC high-speed DDR mode(1.8V I/O) is supported
+- mmc-ddr-1_2v: eMMC high-speed DDR mode(1.2V I/O) is supported
 - mmc-hs200-1_8v: eMMC HS200 mode(1.8V I/O) is supported
 - mmc-hs200-1_2v: eMMC HS200 mode(1.2V I/O) is supported
 - mmc-hs400-1_8v: eMMC HS400 mode(1.8V I/O) is supported
diff --git a/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt b/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt
new file mode 100644 (file)
index 0000000..299081f
--- /dev/null
@@ -0,0 +1,32 @@
+* Renesas Multi Media Card Interface (MMCIF) Controller
+
+This file documents differences between the core properties in mmc.txt
+and the properties used by the MMCIF device.
+
+
+Required properties:
+
+- compatible: must contain one of the following
+       - "renesas,mmcif-r8a7740" for the MMCIF found in r8a7740 SoCs
+       - "renesas,mmcif-r8a7790" for the MMCIF found in r8a7790 SoCs
+       - "renesas,mmcif-r8a7791" for the MMCIF found in r8a7791 SoCs
+       - "renesas,sh-mmcif" for the generic MMCIF
+
+- clocks: reference to the functional clock
+
+- dmas: reference to the DMA channels, one per channel name listed in the
+  dma-names property.
+- dma-names: must contain "tx" for the transmit DMA channel and "rx" for the
+  receive DMA channel.
+
+
+Example: R8A7790 (R-Car H2) MMCIF0
+
+       mmcif0: mmc@ee200000 {
+               compatible = "renesas,mmcif-r8a7790", "renesas,sh-mmcif";
+               reg = <0 0xee200000 0 0x80>;
+               interrupts = <0 169 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&mstp3_clks R8A7790_CLK_MMCIF0>;
+               dmas = <&dmac0 0xd1>, <&dmac0 0xd2>;
+               dma-names = "tx", "rx";
+       };
index 81b33b5b20fc07fa1ffeaf3e61e48dfb8263f555..485483a63d8ce3f44a82d30d56c283c8f1034dff 100644 (file)
@@ -27,8 +27,8 @@ Example:
                bus-width = <8>;
                non-removable;
 
-               vmmc = <&pm8941_l20>;
-               vqmmc = <&pm8941_s3>;
+               vmmc-supply = <&pm8941_l20>;
+               vqmmc-supply = <&pm8941_s3>;
 
                pinctrl-names = "default";
                pinctrl-0 = <&sdc1_clk &sdc1_cmd &sdc1_data>;
@@ -44,8 +44,8 @@ Example:
                bus-width = <4>;
                cd-gpios = <&msmgpio 62 0x1>;
 
-               vmmc = <&pm8941_l21>;
-               vqmmc = <&pm8941_l13>;
+               vmmc-supply = <&pm8941_l21>;
+               vqmmc-supply = <&pm8941_l13>;
 
                pinctrl-names = "default";
                pinctrl-0 = <&sdc2_clk &sdc2_cmd &sdc2_data>;
diff --git a/Documentation/devicetree/bindings/mmc/sdhci-st.txt b/Documentation/devicetree/bindings/mmc/sdhci-st.txt
new file mode 100644 (file)
index 0000000..7527db4
--- /dev/null
@@ -0,0 +1,33 @@
+* STMicroelectronics sdhci-st MMC/SD controller
+
+This file documents the differences between the core properties in
+Documentation/devicetree/bindings/mmc/mmc.txt and the properties
+used by the sdhci-st driver.
+
+Required properties:
+- compatible :  Must be "st,sdhci"
+- clock-names : Should be "mmc"
+                See: Documentation/devicetree/bindings/resource-names.txt
+- clocks :      Phandle of the clock used by the sdhci controller
+                See: Documentation/devicetree/bindings/clock/clock-bindings.txt
+
+Optional properties:
+- non-removable: non-removable slot
+                 See: Documentation/devicetree/bindings/mmc/mmc.txt
+- bus-width: Number of data lines
+                 See: Documentation/devicetree/bindings/mmc/mmc.txt
+
+Example:
+
+mmc0: sdhci@fe81e000 {
+       compatible      = "st,sdhci";
+       status          = "disabled";
+       reg             = <0xfe81e000 0x1000>;
+       interrupts      = <GIC_SPI 127 IRQ_TYPE_NONE>;
+       interrupt-names = "mmcirq";
+       pinctrl-names   = "default";
+       pinctrl-0       = <&pinctrl_mmc0>;
+       clock-names     = "mmc";
+       clocks          = <&clk_s_a1_ls 1>;
+       bus-width       = <8>;
+};
index 2d4a7258a10db9d2c30dc808bb7b5ad5e74fa994..346c6095a6155138ad01f2d39374a8ed97224d12 100644 (file)
@@ -67,7 +67,8 @@ Optional properties:
 * card-detect-delay: Delay in milli-seconds before detecting card after card
   insert event. The default value is 0.
 
-* supports-highspeed: Enables support for high speed cards (up to 50MHz)
+* supports-highspeed (DEPRECATED): Enables support for high speed cards (up to 50MHz)
+                          (use "cap-mmc-highspeed" or "cap-sd-highspeed" instead)
 
 * broken-cd: as documented in mmc core bindings.
 
@@ -98,14 +99,11 @@ board specific portions as listed below.
                clock-frequency = <400000000>;
                clock-freq-min-max = <400000 200000000>;
                num-slots = <1>;
-               supports-highspeed;
                broken-cd;
                fifo-depth = <0x80>;
                card-detect-delay = <200>;
                vmmc-supply = <&buck8>;
-
-               slot@0 {
-                       reg = <0>;
-                       bus-width = <8>;
-               };
+               bus-width = <8>;
+               cap-mmc-highspeed;
+               cap-sd-highspeed;
        };
index ce8056116fb0bd9c281ec4a2d60844e9eb551953..76bf087bc8898fc82f9b7d48c94cce498a85be50 100644 (file)
@@ -12,6 +12,7 @@ Required properties:
  Should be "ti,omap3-hsmmc", for OMAP3 controllers
  Should be "ti,omap3-pre-es3-hsmmc" for OMAP3 controllers pre ES3.0
  Should be "ti,omap4-hsmmc", for OMAP4 controllers
+ Should be "ti,am33xx-hsmmc", for AM335x controllers
 - ti,hwmods: Must be "mmc<n>", n is controller instance starting 1
 
 Optional properties:
@@ -56,3 +57,56 @@ Examples:
                        &edma 25>;
                dma-names = "tx", "rx";
        };
+
+[workaround for missing swakeup on am33xx]
+
+This SOC is missing the swakeup line, it will not detect SDIO irq
+while in suspend.
+
+                             ------
+                             | PRCM |
+                              ------
+                               ^ |
+                       swakeup | | fclk
+                               | v
+       ------                -------               -----
+      | card | -- CIRQ -->  | hsmmc | -- IRQ -->  | CPU |
+       ------                -------               -----
+
+In suspend the fclk is off and the module is dysfunctional. Even register reads
+will fail. A small logic in the host will request fclk restore, when an
+external event is detected. Once the clock is restored, the host detects the
+event normally. Since am33xx doesn't have this line it never wakes from
+suspend.
+
+The workaround is to reconfigure the dat1 line as a GPIO upon suspend. To make
+this work, we need to set the named pinctrl states "default" and "idle".
+Prepare idle to remux dat1 as a gpio, and default to remux it back as sdio
+dat1. The MMC driver will then toggle between idle and default state during
+runtime.
+
+In summary:
+1. select matching 'compatible' section, see example below.
+2. specify pinctrl states "default" and "idle", "sleep" is optional.
+3. specify the gpio irq used for detecting sdio irq in suspend
+
+If configuration is incomplete, a warning message is emitted "falling back to
+polling". Also check the "sdio irq mode" in /sys/kernel/debug/mmc0/regs. Note
+that not every application needs SDIO irq, e.g. MMC cards.
+
+       mmc1: mmc@48060100 {
+               compatible = "ti,am33xx-hsmmc";
+               ...
+               pinctrl-names = "default", "idle", "sleep";
+               pinctrl-0 = <&mmc1_pins>;
+               pinctrl-1 = <&mmc1_idle>;
+               pinctrl-2 = <&mmc1_sleep>;
+               ...
+               interrupts-extended = <&intc 64 &gpio2 28 0>;
+       };
+
+       mmc1_idle : pinmux_cirq_pin {
+               pinctrl-single,pins = <
+                       0x0f8 0x3f      /* GPIO2_28 */
+               >;
+       };
index 6a2a1160a70defdbac92be8850152f1c4448cbde..fa0f327cde01417339f8e3618f075e6ef51db12e 100644 (file)
@@ -18,6 +18,7 @@ Required properties:
                "renesas,sdhi-r8a7778" - SDHI IP on R8A7778 SoC
                "renesas,sdhi-r8a7779" - SDHI IP on R8A7779 SoC
                "renesas,sdhi-r8a7790" - SDHI IP on R8A7790 SoC
+               "renesas,sdhi-r8a7791" - SDHI IP on R8A7791 SoC
 
 Optional properties:
 - toshiba,mmc-wrprotect-disable: write-protect detection is unavailable
diff --git a/Documentation/devicetree/changesets.txt b/Documentation/devicetree/changesets.txt
new file mode 100644 (file)
index 0000000..935ba5a
--- /dev/null
@@ -0,0 +1,40 @@
+A DT changeset is a method which allows one to apply changes
+in the live tree in such a way that either the full set of changes
+will be applied, or none of them will be. If an error occurs partway
+through applying the changeset, then the tree will be rolled back to the
+previous state. A changeset can also be removed after it has been
+applied.
+
+When a changeset is applied, all of the changes get applied to the tree
+at once before emitting OF_RECONFIG notifiers. This is so that the
+receiver sees a complete and consistent state of the tree when it
+receives the notifier.
+
+The sequence of a changeset is as follows.
+
+1. of_changeset_init() - initializes a changeset
+
+2. A number of DT tree change calls, of_changeset_attach_node(),
+of_changeset_detach_node(), of_changeset_add_property(),
+of_changeset_remove_property(), of_changeset_update_property() to prepare
+a set of changes. No changes to the active tree are made at this point.
+All the change operations are recorded in the of_changeset 'entries'
+list.
+
+3. mutex_lock(of_mutex) - starts a changeset; The global of_mutex
+ensures there can only be one editor at a time.
+
+4. of_changeset_apply() - Apply the changes to the tree. Either the
+entire changeset will get applied, or if there is an error the tree will
+be restored to the previous state.
+
+5. mutex_unlock(of_mutex) - All operations complete, release the mutex
+
+If a successfully applied changeset needs to be removed, it can be done
+with the following sequence.
+
+1. mutex_lock(of_mutex)
+
+2. of_changeset_revert()
+
+3. mutex_unlock(of_mutex)
diff --git a/Documentation/devicetree/todo.txt b/Documentation/devicetree/todo.txt
new file mode 100644 (file)
index 0000000..c3cf065
--- /dev/null
@@ -0,0 +1,11 @@
+Todo list for devicetree:
+
+=== General structure ===
+- Switch from custom lists to (h)list_head for nodes and properties structure
+- Remove of_allnodes list and iterate using list of child nodes alone
+
+=== CONFIG_OF_DYNAMIC ===
+- Switch to RCU for tree updates and get rid of global spinlock
+- Document node lifecycle for CONFIG_OF_DYNAMIC
+- Always set ->full_name at of_attach_node() time
+- pseries: Get rid of open-coded tree modification from arch/powerpc/platforms/pseries/dlpar.c
index 8a366959f5cc2e01a190692593b6962e830bf8bd..7aca13a54a3a2beadfe53589b74cdf91af2d5cb5 100644 (file)
@@ -26,6 +26,11 @@ Creating MAD agents
   ioctl.  Also, all agents registered through a file descriptor will
   be unregistered when the descriptor is closed.
 
+  2014 -- a new registration ioctl is now provided which allows additional
+       fields to be provided during registration.
+       Users of this registration call are implicitly setting the use of
+       pkey_index (see below).
+
 Receiving MADs
 
   MADs are received using read().  The receive side now supports
@@ -104,10 +109,10 @@ P_Key Index Handling
   The old ib_umad interface did not allow setting the P_Key index for
   MADs that are sent and did not provide a way for obtaining the P_Key
   index of received MADs.  A new layout for struct ib_user_mad_hdr
-  with a pkey_index member has been defined; however, to preserve
-  binary compatibility with older applications, this new layout will
-  not be used unless the IB_USER_MAD_ENABLE_PKEY ioctl is called
-  before a file descriptor is used for anything else.
+  with a pkey_index member has been defined; however, to preserve binary
+  compatibility with older applications, this new layout will not be used
+  unless one of the IB_USER_MAD_ENABLE_PKEY or IB_USER_MAD_REGISTER_AGENT2
+  ioctls is called before a file descriptor is used for anything else.
 
   In September 2008, the IB_USER_MAD_ABI_VERSION will be incremented
   to 6, the new layout of struct ib_user_mad_hdr will be used by
index e8d2b6d83a3d9045eec563d7bc2a4043ec358be4..8c5e6aa78004ab9fad62c42c2ef037457ed7c2b2 100644 (file)
@@ -1,5 +1,7 @@
 00-INDEX
        - this file: info on the kernel build process
+headers_install.txt
+       - how to export Linux headers for use by userspace
 kbuild.txt
        - developer information on kbuild
 kconfig.txt
index c600e2f44a623857f86f88eb8a021e68bda30456..764f5991a3fc74c32d9f218be85b5c977ea4efde 100644 (file)
@@ -23,11 +23,10 @@ This document describes the Linux kernel Makefiles.
        === 4 Host Program support
           --- 4.1 Simple Host Program
           --- 4.2 Composite Host Programs
-          --- 4.3 Defining shared libraries
-          --- 4.4 Using C++ for host programs
-          --- 4.5 Controlling compiler options for host programs
-          --- 4.6 When host programs are actually built
-          --- 4.7 Using hostprogs-$(CONFIG_FOO)
+          --- 4.3 Using C++ for host programs
+          --- 4.4 Controlling compiler options for host programs
+          --- 4.5 When host programs are actually built
+          --- 4.6 Using hostprogs-$(CONFIG_FOO)
 
        === 5 Kbuild clean infrastructure
 
@@ -643,29 +642,7 @@ Both possibilities are described in the following.
        Finally, the two .o files are linked to the executable, lxdialog.
        Note: The syntax <executable>-y is not permitted for host-programs.
 
---- 4.3 Defining shared libraries
-
-       Objects with extension .so are considered shared libraries, and
-       will be compiled as position independent objects.
-       Kbuild provides support for shared libraries, but the usage
-       shall be restricted.
-       In the following example the libkconfig.so shared library is used
-       to link the executable conf.
-
-       Example:
-               #scripts/kconfig/Makefile
-               hostprogs-y     := conf
-               conf-objs       := conf.o libkconfig.so
-               libkconfig-objs := expr.o type.o
-
-       Shared libraries always require a corresponding -objs line, and
-       in the example above the shared library libkconfig is composed by
-       the two objects expr.o and type.o.
-       expr.o and type.o will be built as position independent code and
-       linked as a shared library libkconfig.so. C++ is not supported for
-       shared libraries.
-
---- 4.4 Using C++ for host programs
+--- 4.3 Using C++ for host programs
 
        kbuild offers support for host programs written in C++. This was
        introduced solely to support kconfig, and is not recommended
@@ -688,7 +665,7 @@ Both possibilities are described in the following.
                qconf-cxxobjs := qconf.o
                qconf-objs    := check.o
 
---- 4.5 Controlling compiler options for host programs
+--- 4.4 Controlling compiler options for host programs
 
        When compiling host programs, it is possible to set specific flags.
        The programs will always be compiled utilising $(HOSTCC) passed
@@ -716,7 +693,7 @@ Both possibilities are described in the following.
        When linking qconf, it will be passed the extra option
        "-L$(QTDIR)/lib".
 
---- 4.6 When host programs are actually built
+--- 4.5 When host programs are actually built
 
        Kbuild will only build host-programs when they are referenced
        as a prerequisite.
@@ -747,7 +724,7 @@ Both possibilities are described in the following.
        This will tell kbuild to build lxdialog even if not referenced in
        any rule.
 
---- 4.7 Using hostprogs-$(CONFIG_FOO)
+--- 4.6 Using hostprogs-$(CONFIG_FOO)
 
        A typical pattern in a Kbuild file looks like this:
 
index a897c50db515d7adf8dec34f7c3298c9988be953..6aace6750567ba113388cdf43312d3a46d011c1b 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -372,6 +372,7 @@ GENKSYMS    = scripts/genksyms/genksyms
 INSTALLKERNEL  := installkernel
 DEPMOD         = /sbin/depmod
 PERL           = perl
+PYTHON         = python
 CHECK          = sparse
 
 CHECKFLAGS     := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \
@@ -422,7 +423,7 @@ KERNELVERSION = $(VERSION)$(if $(PATCHLEVEL),.$(PATCHLEVEL)$(if $(SUBLEVEL),.$(S
 export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION
 export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC
 export CPP AR NM STRIP OBJCOPY OBJDUMP
-export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE
+export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE
 export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
 
 export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
@@ -687,6 +688,7 @@ KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
 # source of a reference will be _MergedGlobals and not on of the whitelisted names.
 # See modpost pattern 2
 KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
+KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
 else
 
 # This warning generated too much noise in a regular build.
@@ -710,9 +712,16 @@ endif
 KBUILD_CFLAGS   += $(call cc-option, -fno-var-tracking-assignments)
 
 ifdef CONFIG_DEBUG_INFO
+ifdef CONFIG_DEBUG_INFO_SPLIT
+KBUILD_CFLAGS   += $(call cc-option, -gsplit-dwarf, -g)
+else
 KBUILD_CFLAGS  += -g
+endif
 KBUILD_AFLAGS  += -Wa,-gdwarf-2
 endif
+ifdef CONFIG_DEBUG_INFO_DWARF4
+KBUILD_CFLAGS  += $(call cc-option, -gdwarf-4,)
+endif
 
 ifdef CONFIG_DEBUG_INFO_REDUCED
 KBUILD_CFLAGS  += $(call cc-option, -femit-struct-debug-baseonly) \
@@ -1054,6 +1063,13 @@ headers_check: headers_install
        $(Q)$(MAKE) $(hdr-inst)=include/uapi HDRCHECK=1
        $(Q)$(MAKE) $(hdr-inst)=arch/$(hdr-arch)/include/uapi/asm $(hdr-dst) HDRCHECK=1
 
+# ---------------------------------------------------------------------------
+# Kernel selftest
+
+PHONY += kselftest
+kselftest:
+       $(Q)$(MAKE) -C tools/testing/selftests run_tests
+
 # ---------------------------------------------------------------------------
 # Modules
 
@@ -1241,9 +1257,9 @@ help:
        @echo  '  tags/TAGS       - Generate tags file for editors'
        @echo  '  cscope          - Generate cscope index'
        @echo  '  gtags           - Generate GNU GLOBAL index'
-       @echo  '  kernelrelease   - Output the release version string'
-       @echo  '  kernelversion   - Output the version stored in Makefile'
-       @echo  '  image_name      - Output the image name'
+       @echo  '  kernelrelease   - Output the release version string (use with make -s)'
+       @echo  '  kernelversion   - Output the version stored in Makefile (use with make -s)'
+       @echo  '  image_name      - Output the image name (use with make -s)'
        @echo  '  headers_install - Install sanitised kernel headers to INSTALL_HDR_PATH'; \
         echo  '                    (default: $(INSTALL_HDR_PATH))'; \
         echo  ''
@@ -1257,6 +1273,11 @@ help:
        @echo  '  headerdep       - Detect inclusion cycles in headers'
        @$(MAKE) -f $(srctree)/scripts/Makefile.help checker-help
        @echo  ''
+       @echo  'Kernel selftest'
+       @echo  '  kselftest       - Build and run kernel selftest (run as root)'
+       @echo  '                    Build, install, and boot kernel before'
+       @echo  '                    running kselftest on it'
+       @echo  ''
        @echo  'Kernel packaging:'
        @$(MAKE) $(build)=$(package-dir) help
        @echo  ''
@@ -1398,6 +1419,7 @@ clean: $(clean-dirs)
        @find $(if $(KBUILD_EXTMOD), $(KBUILD_EXTMOD), .) $(RCS_FIND_IGNORE) \
                \( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \
                -o -name '*.ko.*' \
+               -o -name '*.dwo'  \
                -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \
                -o -name '*.symtypes' -o -name 'modules.order' \
                -o -name modules.builtin -o -name '.tmp_*.o.*' \
index 36c771a2d765de67cb5ccf80c1fcd629db96fd71..27d0d9c8adf3da724d9e64ba2e5229731a0b38eb 100644 (file)
                i2c0 = &i2c0;
        };
 
+       chosen {
+               stdout-path = &uart0;
+       };
+
        memory {
                reg = <0x0 0x08000000>;
        };
index d025048119d3078ee1730531071a3e3ca5189d37..e36c1e82fea74d62f7efb7cc3c3585597abd78c2 100644 (file)
@@ -56,5 +56,3 @@
                };
        };
 };
-
-#include <testcases.dtsi>
index 2c4041c9bac5e18e320d25b0577d034c52514b43..e43791829aceb2a725d45b419b36c21b54ec0300 100644 (file)
@@ -49,8 +49,3 @@ int arch_gnttab_init(unsigned long nr_shared)
 {
        return 0;
 }
-
-int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status)
-{
-       return 0;
-}
index f37238f45bcd0879c8c32d6f5ca42363ece9e468..5441b14994fccf46ebb6d6926186800b5ae11272 100644 (file)
@@ -76,7 +76,7 @@ vmlinux.gz: vmlinux
        $(Q)$(MAKE) $(build)=$(boot) $@
 
 unwcheck: vmlinux
-       -$(Q)READELF=$(READELF) python $(srctree)/arch/ia64/scripts/unwcheck.py $<
+       -$(Q)READELF=$(READELF) $(PYTHON) $(srctree)/arch/ia64/scripts/unwcheck.py $<
 
 archclean:
        $(Q)$(MAKE) $(clean)=$(boot)
index ef2aed0f63ca8c6eb9523495513b0fc008fa96fa..9dc52501de83adab2062049773d62cd451aac361 100644 (file)
@@ -112,10 +112,10 @@ int gunzip_partial(struct gunzip_state *state, void *dst, int dstlen)
                r = zlib_inflate(&state->s, Z_FULL_FLUSH);
                if (r != Z_OK && r != Z_STREAM_END)
                        fatal("inflate returned %d msg: %s\n\r", r, state->s.msg);
-               len = state->s.next_out - (unsigned char *)dst;
+               len = state->s.next_out - (Byte *)dst;
        } else {
                /* uncompressed image */
-               len = min(state->s.avail_in, (unsigned)dstlen);
+               len = min(state->s.avail_in, (uLong)dstlen);
                memcpy(dst, state->s.next_in, len);
                state->s.next_in += len;
                state->s.avail_in -= len;
index 642e436d45954df70fee8a19a2306dec2bcf804b..daa5af91163c9bf01861eb44324630328fd4351c 100644 (file)
@@ -459,7 +459,8 @@ extern const char *powerpc_base_platform;
 #define CPU_FTRS_POSSIBLE      \
            (CPU_FTRS_POWER4 | CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \
             CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \
-            CPU_FTRS_POWER8 | CPU_FTRS_CELL | CPU_FTRS_PA6T | CPU_FTR_VSX)
+            CPU_FTRS_POWER8 | CPU_FTRS_POWER8_DD1 | CPU_FTRS_CELL | \
+            CPU_FTRS_PA6T | CPU_FTR_VSX)
 #endif
 #else
 enum {
@@ -509,7 +510,8 @@ enum {
 #define CPU_FTRS_ALWAYS                \
            (CPU_FTRS_POWER4 & CPU_FTRS_PPC970 & CPU_FTRS_POWER5 & \
             CPU_FTRS_POWER6 & CPU_FTRS_POWER7 & CPU_FTRS_CELL & \
-            CPU_FTRS_PA6T & CPU_FTRS_POSSIBLE)
+            CPU_FTRS_PA6T & CPU_FTRS_POWER8 & CPU_FTRS_POWER8E & \
+            CPU_FTRS_POWER8_DD1 & CPU_FTRS_POSSIBLE)
 #endif
 #else
 enum {
index 44e90516519beedc607331528b3996f7f97bc39d..b125ceab149c0a8e428263093334d752ead4bdfa 100644 (file)
@@ -57,10 +57,10 @@ struct machdep_calls {
        void            (*hpte_removebolted)(unsigned long ea,
                                             int psize, int ssize);
        void            (*flush_hash_range)(unsigned long number, int local);
-       void            (*hugepage_invalidate)(struct mm_struct *mm,
+       void            (*hugepage_invalidate)(unsigned long vsid,
+                                              unsigned long addr,
                                               unsigned char *hpte_slot_array,
-                                              unsigned long addr, int psize);
-
+                                              int psize, int ssize);
        /* special for kexec, to be called in real mode, linear mapping is
         * destroyed as well */
        void            (*hpte_clear_all)(void);
index b2f8ce1fd0d742fc0b2f500feff4f965b255a09c..86055e598269ebc9c4ba2c52ea3b6f59a27599e2 100644 (file)
@@ -149,6 +149,8 @@ struct opal_sg_list {
 #define OPAL_DUMP_INFO2                                94
 #define OPAL_PCI_EEH_FREEZE_SET                        97
 #define OPAL_HANDLE_HMI                                98
+#define OPAL_REGISTER_DUMP_REGION              101
+#define OPAL_UNREGISTER_DUMP_REGION            102
 
 #ifndef __ASSEMBLY__
 
@@ -920,6 +922,8 @@ int64_t opal_set_param(uint64_t token, uint32_t param_id, uint64_t buffer,
                uint64_t length);
 int64_t opal_sensor_read(uint32_t sensor_hndl, int token, __be32 *sensor_data);
 int64_t opal_handle_hmi(void);
+int64_t opal_register_dump_region(uint32_t id, uint64_t start, uint64_t end);
+int64_t opal_unregister_dump_region(uint32_t id);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
@@ -974,6 +978,13 @@ struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
                                             unsigned long vmalloc_size);
 void opal_free_sg_list(struct opal_sg_list *sg);
 
+/*
+ * Dump region ID range usable by the OS
+ */
+#define OPAL_DUMP_REGION_HOST_START            0x80
+#define OPAL_DUMP_REGION_LOG_BUF               0x80
+#define OPAL_DUMP_REGION_HOST_END              0xFF
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __OPAL_H */
index eb9261024f5192386dec97bff1ee10e62da4b7c3..7b3d54fae46f92a80dcaddf301a2ea2687beafba 100644 (file)
@@ -413,7 +413,7 @@ static inline char *get_hpte_slot_array(pmd_t *pmdp)
 }
 
 extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
-                                  pmd_t *pmdp);
+                                  pmd_t *pmdp, unsigned long old_pmd);
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
 extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
index b6d2d42f84b568d981a226f97e15d802b3dbb789..4f4ec2ab45c97b90c1de34d9403bd251e743d4e8 100644 (file)
  * in order to deal with 64K made of 4K HW pages. Thus we override the
  * generic accessors and iterators here
  */
-#define __real_pte(e,p)        ((real_pte_t) { \
-                       (e), (pte_val(e) & _PAGE_COMBO) ? \
-                               (pte_val(*((p) + PTRS_PER_PTE))) : 0 })
-#define __rpte_to_hidx(r,index)        ((pte_val((r).pte) & _PAGE_COMBO) ? \
-        (((r).hidx >> ((index)<<2)) & 0xf) : ((pte_val((r).pte) >> 12) & 0xf))
+#define __real_pte __real_pte
+static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
+{
+       real_pte_t rpte;
+
+       rpte.pte = pte;
+       rpte.hidx = 0;
+       if (pte_val(pte) & _PAGE_COMBO) {
+               /*
+                * Make sure we order the hidx load against the _PAGE_COMBO
+                * check. The store side ordering is done in __hash_page_4K
+                */
+               smp_rmb();
+               rpte.hidx = pte_val(*((ptep) + PTRS_PER_PTE));
+       }
+       return rpte;
+}
+
+static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
+{
+       if ((pte_val(rpte.pte) & _PAGE_COMBO))
+               return (rpte.hidx >> (index<<2)) & 0xf;
+       return (pte_val(rpte.pte) >> 12) & 0xf;
+}
+
 #define __rpte_to_pte(r)       ((r).pte)
 #define __rpte_sub_valid(rpte, index) \
        (pte_val(rpte.pte) & (_PAGE_HPTE_SUB0 >> (index)))
index 1c987bf794ef9454e359095a8db012443f913a45..0c0505956a296bd83a185042a04e4e096880a04c 100644 (file)
 #define SPRN_ACOP      0x1F    /* Available Coprocessor Register */
 #define SPRN_TFIAR     0x81    /* Transaction Failure Inst Addr   */
 #define SPRN_TEXASR    0x82    /* Transaction EXception & Summary */
-#define   TEXASR_FS    __MASK(63-36)   /* Transaction Failure Summary */
 #define SPRN_TEXASRU   0x83    /* ''      ''      ''    Upper 32  */
-#define   TEXASR_FS     __MASK(63-36) /* TEXASR Failure Summary */
+#define   TEXASR_FS    __MASK(63-36) /* TEXASR Failure Summary */
 #define SPRN_TFHAR     0x80    /* Transaction Failure Handler Addr */
 #define SPRN_CTRLF     0x088
 #define SPRN_CTRLT     0x098
index 35aa339410bdaef7d50decc76bda487677323e78..4dbe072eecbefea4482d65af5291d2cf8c74e255 100644 (file)
@@ -61,6 +61,7 @@ static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 
 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
+       smp_mb();
        return !arch_spin_value_unlocked(*lock);
 }
 
index 6144d5a6bfe75c365282792468002e156d8f3add..050f79a4a168cd16a1c4f22be4046cd06a5e6a85 100644 (file)
@@ -592,61 +592,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
        MASKABLE_EXCEPTION_HV_OOL(0xe62, hmi_exception)
        KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe62)
 
-       .globl hmi_exception_early
-hmi_exception_early:
-       EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60)
-       mr      r10,r1                  /* Save r1                      */
-       ld      r1,PACAEMERGSP(r13)     /* Use emergency stack          */
-       subi    r1,r1,INT_FRAME_SIZE    /* alloc stack frame            */
-       std     r9,_CCR(r1)             /* save CR in stackframe        */
-       mfspr   r11,SPRN_HSRR0          /* Save HSRR0 */
-       std     r11,_NIP(r1)            /* save HSRR0 in stackframe     */
-       mfspr   r12,SPRN_HSRR1          /* Save SRR1 */
-       std     r12,_MSR(r1)            /* save SRR1 in stackframe      */
-       std     r10,0(r1)               /* make stack chain pointer     */
-       std     r0,GPR0(r1)             /* save r0 in stackframe        */
-       std     r10,GPR1(r1)            /* save r1 in stackframe        */
-       EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
-       EXCEPTION_PROLOG_COMMON_3(0xe60)
-       addi    r3,r1,STACK_FRAME_OVERHEAD
-       bl      hmi_exception_realmode
-       /* Windup the stack. */
-       /* Clear MSR_RI before setting SRR0 and SRR1. */
-       li      r0,MSR_RI
-       mfmsr   r9                      /* get MSR value */
-       andc    r9,r9,r0
-       mtmsrd  r9,1                    /* Clear MSR_RI */
-       /* Move original HSRR0 and HSRR1 into the respective regs */
-       ld      r9,_MSR(r1)
-       mtspr   SPRN_HSRR1,r9
-       ld      r3,_NIP(r1)
-       mtspr   SPRN_HSRR0,r3
-       ld      r9,_CTR(r1)
-       mtctr   r9
-       ld      r9,_XER(r1)
-       mtxer   r9
-       ld      r9,_LINK(r1)
-       mtlr    r9
-       REST_GPR(0, r1)
-       REST_8GPRS(2, r1)
-       REST_GPR(10, r1)
-       ld      r11,_CCR(r1)
-       mtcr    r11
-       REST_GPR(11, r1)
-       REST_2GPRS(12, r1)
-       /* restore original r1. */
-       ld      r1,GPR1(r1)
-
-       /*
-        * Go to virtual mode and pull the HMI event information from
-        * firmware.
-        */
-       .globl hmi_exception_after_realmode
-hmi_exception_after_realmode:
-       SET_SCRATCH0(r13)
-       EXCEPTION_PROLOG_0(PACA_EXGEN)
-       b       hmi_exception_hv
-
        MASKABLE_EXCEPTION_HV_OOL(0xe82, h_doorbell)
        KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe82)
 
@@ -1306,6 +1251,61 @@ fwnmi_data_area:
        . = 0x8000
 #endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
 
+       .globl hmi_exception_early
+hmi_exception_early:
+       EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60)
+       mr      r10,r1                  /* Save r1                      */
+       ld      r1,PACAEMERGSP(r13)     /* Use emergency stack          */
+       subi    r1,r1,INT_FRAME_SIZE    /* alloc stack frame            */
+       std     r9,_CCR(r1)             /* save CR in stackframe        */
+       mfspr   r11,SPRN_HSRR0          /* Save HSRR0 */
+       std     r11,_NIP(r1)            /* save HSRR0 in stackframe     */
+       mfspr   r12,SPRN_HSRR1          /* Save HSRR1 */
+       std     r12,_MSR(r1)            /* save HSRR1 in stackframe     */
+       std     r10,0(r1)               /* make stack chain pointer     */
+       std     r0,GPR0(r1)             /* save r0 in stackframe        */
+       std     r10,GPR1(r1)            /* save r1 in stackframe        */
+       EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
+       EXCEPTION_PROLOG_COMMON_3(0xe60)
+       addi    r3,r1,STACK_FRAME_OVERHEAD
+       bl      hmi_exception_realmode
+       /* Windup the stack. */
+       /* Clear MSR_RI before setting HSRR0 and HSRR1. */
+       li      r0,MSR_RI
+       mfmsr   r9                      /* get MSR value */
+       andc    r9,r9,r0
+       mtmsrd  r9,1                    /* Clear MSR_RI */
+       /* Move original HSRR0 and HSRR1 into the respective regs */
+       ld      r9,_MSR(r1)
+       mtspr   SPRN_HSRR1,r9
+       ld      r3,_NIP(r1)
+       mtspr   SPRN_HSRR0,r3
+       ld      r9,_CTR(r1)
+       mtctr   r9
+       ld      r9,_XER(r1)
+       mtxer   r9
+       ld      r9,_LINK(r1)
+       mtlr    r9
+       REST_GPR(0, r1)
+       REST_8GPRS(2, r1)
+       REST_GPR(10, r1)
+       ld      r11,_CCR(r1)
+       mtcr    r11
+       REST_GPR(11, r1)
+       REST_2GPRS(12, r1)
+       /* restore original r1. */
+       ld      r1,GPR1(r1)
+
+       /*
+        * Go to virtual mode and pull the HMI event information from
+        * firmware.
+        */
+       .globl hmi_exception_after_realmode
+hmi_exception_after_realmode:
+       SET_SCRATCH0(r13)
+       EXCEPTION_PROLOG_0(PACA_EXGEN)
+       b       hmi_exception_hv
+
 #ifdef CONFIG_PPC_POWERNV
 _GLOBAL(opal_mc_secondary_handler)
        HMT_MEDIUM_PPR_DISCARD
index c334f53453f708ea64c7d9c1322cc92418a5b687..b5061abbd2e0c1df269f424905f56c5e429167b1 100644 (file)
@@ -1210,10 +1210,12 @@ clear_utlb_entry:
 
        /* We configure icbi to invalidate 128 bytes at a time since the
         * current 32-bit kernel code isn't too happy with icache != dcache
-        * block size
+        * block size. We also disable the BTAC as this can cause errors
+        * in some circumstances (see IBM Erratum 47).
         */
        mfspr   r3,SPRN_CCR0
        oris    r3,r3,0x0020
+       ori     r3,r3,0x0040
        mtspr   SPRN_CCR0,r3
        isync
 
index f84f799babb1271b566de19dc23ffeb0c251ddfe..a10642a0d861cd6a5a80cbe796b36e97d37f9df9 100644 (file)
@@ -1120,37 +1120,41 @@ EXPORT_SYMBOL_GPL(iommu_release_ownership);
 int iommu_add_device(struct device *dev)
 {
        struct iommu_table *tbl;
-       int ret = 0;
 
-       if (WARN_ON(dev->iommu_group)) {
-               pr_warn("iommu_tce: device %s is already in iommu group %d, skipping\n",
-                               dev_name(dev),
-                               iommu_group_id(dev->iommu_group));
+       /*
+        * The sysfs entries should be populated before
+        * binding the IOMMU group. If the sysfs entries
+        * aren't ready, we simply bail.
+        */
+       if (!device_is_registered(dev))
+               return -ENOENT;
+
+       if (dev->iommu_group) {
+               pr_debug("%s: Skipping device %s with iommu group %d\n",
+                        __func__, dev_name(dev),
+                        iommu_group_id(dev->iommu_group));
                return -EBUSY;
        }
 
        tbl = get_iommu_table_base(dev);
        if (!tbl || !tbl->it_group) {
-               pr_debug("iommu_tce: skipping device %s with no tbl\n",
-                               dev_name(dev));
+               pr_debug("%s: Skipping device %s with no tbl\n",
+                        __func__, dev_name(dev));
                return 0;
        }
 
-       pr_debug("iommu_tce: adding %s to iommu group %d\n",
-                       dev_name(dev), iommu_group_id(tbl->it_group));
+       pr_debug("%s: Adding %s to iommu group %d\n",
+                __func__, dev_name(dev),
+                iommu_group_id(tbl->it_group));
 
        if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
-               pr_err("iommu_tce: unsupported iommu page size.");
-               pr_err("%s has not been added\n", dev_name(dev));
+               pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n",
+                      __func__, IOMMU_PAGE_SIZE(tbl),
+                      PAGE_SIZE, dev_name(dev));
                return -EINVAL;
        }
 
-       ret = iommu_group_add_device(tbl->it_group, dev);
-       if (ret < 0)
-               pr_err("iommu_tce: %s has not been added, ret=%d\n",
-                               dev_name(dev), ret);
-
-       return ret;
+       return iommu_group_add_device(tbl->it_group, dev);
 }
 EXPORT_SYMBOL_GPL(iommu_add_device);
 
index 1a3b1055f5ebaf423327401c71612b38c184f12d..4e139f8a69effa0a403a2e6d75b0fe7d7e268e3d 100644 (file)
@@ -818,76 +818,6 @@ int cpu_to_chip_id(int cpu)
 }
 EXPORT_SYMBOL(cpu_to_chip_id);
 
-#ifdef CONFIG_PPC_PSERIES
-/*
- * Fix up the uninitialized fields in a new device node:
- * name, type and pci-specific fields
- */
-
-static int of_finish_dynamic_node(struct device_node *node)
-{
-       struct device_node *parent = of_get_parent(node);
-       int err = 0;
-       const phandle *ibm_phandle;
-
-       node->name = of_get_property(node, "name", NULL);
-       node->type = of_get_property(node, "device_type", NULL);
-
-       if (!node->name)
-               node->name = "<NULL>";
-       if (!node->type)
-               node->type = "<NULL>";
-
-       if (!parent) {
-               err = -ENODEV;
-               goto out;
-       }
-
-       /* We don't support that function on PowerMac, at least
-        * not yet
-        */
-       if (machine_is(powermac))
-               return -ENODEV;
-
-       /* fix up new node's phandle field */
-       if ((ibm_phandle = of_get_property(node, "ibm,phandle", NULL)))
-               node->phandle = *ibm_phandle;
-
-out:
-       of_node_put(parent);
-       return err;
-}
-
-static int prom_reconfig_notifier(struct notifier_block *nb,
-                                 unsigned long action, void *node)
-{
-       int err;
-
-       switch (action) {
-       case OF_RECONFIG_ATTACH_NODE:
-               err = of_finish_dynamic_node(node);
-               if (err < 0)
-                       printk(KERN_ERR "finish_node returned %d\n", err);
-               break;
-       default:
-               err = 0;
-               break;
-       }
-       return notifier_from_errno(err);
-}
-
-static struct notifier_block prom_reconfig_nb = {
-       .notifier_call = prom_reconfig_notifier,
-       .priority = 10, /* This one needs to run first */
-};
-
-static int __init prom_reconfig_setup(void)
-{
-       return of_reconfig_notifier_register(&prom_reconfig_nb);
-}
-__initcall(prom_reconfig_setup);
-#endif
-
 bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
 {
        return (int)phys_id == get_hard_smp_processor_id(cpu);
index 1007fb802e6b0436ac16a2595720fa505fb3d415..a0738af4aba6b80b3d356935d3bcf5c3b61a4f48 100644 (file)
@@ -376,6 +376,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
                                        GFP_KERNEL, cpu_to_node(cpu));
                zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
                                        GFP_KERNEL, cpu_to_node(cpu));
+               /*
+                * numa_node_id() works after this.
+                */
+               set_cpu_numa_node(cpu, numa_cpu_lookup_table[cpu]);
+               set_cpu_numa_mem(cpu, local_memory_node(numa_cpu_lookup_table[cpu]));
        }
 
        cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid));
@@ -723,12 +728,6 @@ void start_secondary(void *unused)
        }
        traverse_core_siblings(cpu, true);
 
-       /*
-        * numa_node_id() works after this.
-        */
-       set_numa_node(numa_cpu_lookup_table[cpu]);
-       set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
-
        smp_wmb();
        notify_cpu_starting(cpu);
        set_cpu_online(cpu, true);
index 0c9c8d7d07345cd2f5def8c2801b466c8330a48b..170a0346f7561ff345ec9faa57fe4e55af9d9d46 100644 (file)
@@ -70,12 +70,16 @@ void __rw_yield(arch_rwlock_t *rw)
 
 void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
+       smp_mb();
+
        while (lock->slock) {
                HMT_low();
                if (SHARED_PROCESSOR)
                        __spin_yield(lock);
        }
        HMT_medium();
+
+       smp_mb();
 }
 
 EXPORT_SYMBOL(arch_spin_unlock_wait);
index cf1d325eae8be814953650cf6b94fd349c0fdd12..afc0a8295f84c7097217855fae59f62b1ed6149e 100644 (file)
@@ -412,18 +412,18 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
        local_irq_restore(flags);
 }
 
-static void native_hugepage_invalidate(struct mm_struct *mm,
+static void native_hugepage_invalidate(unsigned long vsid,
+                                      unsigned long addr,
                                       unsigned char *hpte_slot_array,
-                                      unsigned long addr, int psize)
+                                      int psize, int ssize)
 {
-       int ssize = 0, i;
-       int lock_tlbie;
+       int i;
        struct hash_pte *hptep;
        int actual_psize = MMU_PAGE_16M;
        unsigned int max_hpte_count, valid;
        unsigned long flags, s_addr = addr;
        unsigned long hpte_v, want_v, shift;
-       unsigned long hidx, vpn = 0, vsid, hash, slot;
+       unsigned long hidx, vpn = 0, hash, slot;
 
        shift = mmu_psize_defs[psize].shift;
        max_hpte_count = 1U << (PMD_SHIFT - shift);
@@ -437,15 +437,6 @@ static void native_hugepage_invalidate(struct mm_struct *mm,
 
                /* get the vpn */
                addr = s_addr + (i * (1ul << shift));
-               if (!is_kernel_addr(addr)) {
-                       ssize = user_segment_size(addr);
-                       vsid = get_vsid(mm->context.id, addr, ssize);
-                       WARN_ON(vsid == 0);
-               } else {
-                       vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
-                       ssize = mmu_kernel_ssize;
-               }
-
                vpn = hpt_vpn(addr, vsid, ssize);
                hash = hpt_hash(vpn, shift, ssize);
                if (hidx & _PTEIDX_SECONDARY)
@@ -465,22 +456,13 @@ static void native_hugepage_invalidate(struct mm_struct *mm,
                else
                        /* Invalidate the hpte. NOTE: this also unlocks it */
                        hptep->v = 0;
+               /*
+                * We need to do a tlb invalidate for every address; the tlbie
+                * instruction compares entry_VA in the tlb with the VA
+                * specified here.
+                */
+               tlbie(vpn, psize, actual_psize, ssize, 0);
        }
-       /*
-        * Since this is a hugepage, we just need a single tlbie.
-        * use the last vpn.
-        */
-       lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-       if (lock_tlbie)
-               raw_spin_lock(&native_tlbie_lock);
-
-       asm volatile("ptesync":::"memory");
-       __tlbie(vpn, psize, actual_psize, ssize);
-       asm volatile("eieio; tlbsync; ptesync":::"memory");
-
-       if (lock_tlbie)
-               raw_spin_unlock(&native_tlbie_lock);
-
        local_irq_restore(flags);
 }
 
index 826893fcb3a78b0198d5fa4b0b047428d8852802..5f5e6328c21c10f5a158d84f737ace3250602d61 100644 (file)
 #include <linux/mm.h>
 #include <asm/machdep.h>
 
+static void invalidate_old_hpte(unsigned long vsid, unsigned long addr,
+                               pmd_t *pmdp, unsigned int psize, int ssize)
+{
+       int i, max_hpte_count, valid;
+       unsigned long s_addr;
+       unsigned char *hpte_slot_array;
+       unsigned long hidx, shift, vpn, hash, slot;
+
+       s_addr = addr & HPAGE_PMD_MASK;
+       hpte_slot_array = get_hpte_slot_array(pmdp);
+       /*
+        * If we try to do a huge PTE update after a withdraw is done,
+        * we will find hpte_slot_array NULL. This happens when we do
+        * split_huge_page_pmd.
+        */
+       if (!hpte_slot_array)
+               return;
+
+       if (ppc_md.hugepage_invalidate)
+               return ppc_md.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
+                                                 psize, ssize);
+       /*
+        * No bulk hpte removal support, invalidate each entry
+        */
+       shift = mmu_psize_defs[psize].shift;
+       max_hpte_count = HPAGE_PMD_SIZE >> shift;
+       for (i = 0; i < max_hpte_count; i++) {
+               /*
+                * 8 bits per hpte entry
+                * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
+                */
+               valid = hpte_valid(hpte_slot_array, i);
+               if (!valid)
+                       continue;
+               hidx =  hpte_hash_index(hpte_slot_array, i);
+
+               /* get the vpn */
+               addr = s_addr + (i * (1ul << shift));
+               vpn = hpt_vpn(addr, vsid, ssize);
+               hash = hpt_hash(vpn, shift, ssize);
+               if (hidx & _PTEIDX_SECONDARY)
+                       hash = ~hash;
+
+               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+               slot += hidx & _PTEIDX_GROUP_IX;
+               ppc_md.hpte_invalidate(slot, vpn, psize,
+                                      MMU_PAGE_16M, ssize, 0);
+       }
+}
+
+
 int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
                    pmd_t *pmdp, unsigned long trap, int local, int ssize,
                    unsigned int psize)
@@ -33,7 +84,9 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
         * atomically mark the linux large page PMD busy and dirty
         */
        do {
-               old_pmd = pmd_val(*pmdp);
+               pmd_t pmd = ACCESS_ONCE(*pmdp);
+
+               old_pmd = pmd_val(pmd);
                /* If PMD busy, retry the access */
                if (unlikely(old_pmd & _PAGE_BUSY))
                        return 0;
@@ -85,6 +138,15 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
        vpn = hpt_vpn(ea, vsid, ssize);
        hash = hpt_hash(vpn, shift, ssize);
        hpte_slot_array = get_hpte_slot_array(pmdp);
+       if (psize == MMU_PAGE_4K) {
+               /*
+                * invalidate the old hpte entry if we have that mapped via 64K
+                * base page size. This is because demote_segment won't flush
+                * hash page table entries.
+                */
+               if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO))
+                       invalidate_old_hpte(vsid, ea, pmdp, MMU_PAGE_64K, ssize);
+       }
 
        valid = hpte_valid(hpte_slot_array, index);
        if (valid) {
@@ -107,11 +169,8 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
                         * safely update this here.
                         */
                        valid = 0;
-                       new_pmd &= ~_PAGE_HPTEFLAGS;
                        hpte_slot_array[index] = 0;
-               } else
-                       /* clear the busy bits and set the hash pte bits */
-                       new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+               }
        }
 
        if (!valid) {
@@ -119,11 +178,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
 
                /* insert new entry */
                pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
-repeat:
-               hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
-
-               /* clear the busy bits and set the hash pte bits */
-               new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+               new_pmd |= _PAGE_HASHPTE;
 
                /* Add in WIMG bits */
                rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
@@ -132,6 +187,8 @@ repeat:
                 * enable the memory coherence always
                 */
                rflags |= HPTE_R_M;
+repeat:
+               hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
 
                /* Insert into the hash table, primary slot */
                slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
@@ -172,8 +229,17 @@ repeat:
                mark_hpte_slot_valid(hpte_slot_array, index, slot);
        }
        /*
-        * No need to use ldarx/stdcx here
+        * Mark the pte with _PAGE_COMBO, if we are trying to hash it with
+        * base page size 4k.
+        */
+       if (psize == MMU_PAGE_4K)
+               new_pmd |= _PAGE_COMBO;
+       /*
+        * The hpte valid is stored in the pgtable whose address is in the
+        * second half of the PMD. Order this against clearing of the busy bit in
+        * huge pmd.
         */
+       smp_wmb();
        *pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
        return 0;
 }
index d3e9a78eaed3e1b6f073a9c266b2e6c65b13073b..d7737a542fd7d5f5af82bde3ec417262575175af 100644 (file)
@@ -1049,7 +1049,7 @@ static void __init mark_reserved_regions_for_nid(int nid)
 
 void __init do_init_bootmem(void)
 {
-       int nid;
+       int nid, cpu;
 
        min_low_pfn = 0;
        max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
@@ -1122,8 +1122,15 @@ void __init do_init_bootmem(void)
 
        reset_numa_cpu_lookup_table();
        register_cpu_notifier(&ppc64_numa_nb);
-       cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
-                         (void *)(unsigned long)boot_cpuid);
+       /*
+        * We need the numa_cpu_lookup_table to be accurate for all CPUs,
+        * even before we online them, so that we can use cpu_to_{node,mem}
+        * early in boot, cf. smp_prepare_cpus().
+        */
+       for_each_possible_cpu(cpu) {
+               cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
+                                 (void *)(unsigned long)cpu);
+       }
 }
 
 void __init paging_init(void)
index 3b3c4d34c7a0ec48c08d037faa4260509541d6d6..c8d709ab489d0ab5714fc35f0e7be2d542e2e472 100644 (file)
@@ -54,6 +54,9 @@
 
 #include "mmu_decl.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/thp.h>
+
 /* Some sanity checking */
 #if TASK_SIZE_USER64 > PGTABLE_RANGE
 #error TASK_SIZE_USER64 exceeds pagetable range
@@ -537,8 +540,9 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
        old = pmd_val(*pmdp);
        *pmdp = __pmd((old & ~clr) | set);
 #endif
+       trace_hugepage_update(addr, old, clr, set);
        if (old & _PAGE_HASHPTE)
-               hpte_do_hugepage_flush(mm, addr, pmdp);
+               hpte_do_hugepage_flush(mm, addr, pmdp, old);
        return old;
 }
 
@@ -642,10 +646,11 @@ void pmdp_splitting_flush(struct vm_area_struct *vma,
         * If we didn't had the splitting flag set, go and flush the
         * HPTE entries.
         */
+       trace_hugepage_splitting(address, old);
        if (!(old & _PAGE_SPLITTING)) {
                /* We need to flush the hpte */
                if (old & _PAGE_HASHPTE)
-                       hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
+                       hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old);
        }
        /*
         * This ensures that generic code that rely on IRQ disabling
@@ -709,6 +714,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
        assert_spin_locked(&mm->page_table_lock);
        WARN_ON(!pmd_trans_huge(pmd));
 #endif
+       trace_hugepage_set_pmd(addr, pmd);
        return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
 }
 
@@ -723,7 +729,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
  * neesd to be flushed.
  */
 void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
-                           pmd_t *pmdp)
+                           pmd_t *pmdp, unsigned long old_pmd)
 {
        int ssize, i;
        unsigned long s_addr;
@@ -745,12 +751,29 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
        if (!hpte_slot_array)
                return;
 
-       /* get the base page size */
+       /* get the base page size,vsid and segment size */
+#ifdef CONFIG_DEBUG_VM
        psize = get_slice_psize(mm, s_addr);
+       BUG_ON(psize == MMU_PAGE_16M);
+#endif
+       if (old_pmd & _PAGE_COMBO)
+               psize = MMU_PAGE_4K;
+       else
+               psize = MMU_PAGE_64K;
+
+       if (!is_kernel_addr(s_addr)) {
+               ssize = user_segment_size(s_addr);
+               vsid = get_vsid(mm->context.id, s_addr, ssize);
+               WARN_ON(vsid == 0);
+       } else {
+               vsid = get_kernel_vsid(s_addr, mmu_kernel_ssize);
+               ssize = mmu_kernel_ssize;
+       }
 
        if (ppc_md.hugepage_invalidate)
-               return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
-                                                 s_addr, psize);
+               return ppc_md.hugepage_invalidate(vsid, s_addr,
+                                                 hpte_slot_array,
+                                                 psize, ssize);
        /*
         * No bluk hpte removal support, invalidate each entry
         */
@@ -768,15 +791,6 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 
                /* get the vpn */
                addr = s_addr + (i * (1ul << shift));
-               if (!is_kernel_addr(addr)) {
-                       ssize = user_segment_size(addr);
-                       vsid = get_vsid(mm->context.id, addr, ssize);
-                       WARN_ON(vsid == 0);
-               } else {
-                       vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
-                       ssize = mmu_kernel_ssize;
-               }
-
                vpn = hpt_vpn(addr, vsid, ssize);
                hash = hpt_hash(vpn, shift, ssize);
                if (hidx & _PTEIDX_SECONDARY)
index c99f6510a0b267220253249ed836e3dee4327ba7..d2a94b85dbc2453ef2d4aa525dff05e7c6e7237c 100644 (file)
@@ -30,6 +30,8 @@
 #include <asm/tlb.h>
 #include <asm/bug.h>
 
+#include <trace/events/thp.h>
+
 DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
 
 /*
@@ -213,10 +215,12 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
                if (ptep == NULL)
                        continue;
                pte = pte_val(*ptep);
+               if (hugepage_shift)
+                       trace_hugepage_invalidate(start, pte_val(pte));
                if (!(pte & _PAGE_HASHPTE))
                        continue;
                if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
-                       hpte_do_hugepage_flush(mm, start, (pmd_t *)pte);
+                       hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
                else
                        hpte_need_flush(mm, start, ptep, pte, 0);
        }
index 92cb18d52ea8d147ed1201d56eec2d776e28e978..f38ea4df6a8556f5f84f50ab1a37798624036b4f 100644 (file)
@@ -581,42 +581,10 @@ static void setup_mmu_htw(void)
 /*
  * Early initialization of the MMU TLB code
  */
-static void __early_init_mmu(int boot_cpu)
+static void early_init_this_mmu(void)
 {
        unsigned int mas4;
 
-       /* XXX This will have to be decided at runtime, but right
-        * now our boot and TLB miss code hard wires it. Ideally
-        * we should find out a suitable page size and patch the
-        * TLB miss code (either that or use the PACA to store
-        * the value we want)
-        */
-       mmu_linear_psize = MMU_PAGE_1G;
-
-       /* XXX This should be decided at runtime based on supported
-        * page sizes in the TLB, but for now let's assume 16M is
-        * always there and a good fit (which it probably is)
-        *
-        * Freescale booke only supports 4K pages in TLB0, so use that.
-        */
-       if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
-               mmu_vmemmap_psize = MMU_PAGE_4K;
-       else
-               mmu_vmemmap_psize = MMU_PAGE_16M;
-
-       /* XXX This code only checks for TLB 0 capabilities and doesn't
-        *     check what page size combos are supported by the HW. It
-        *     also doesn't handle the case where a separate array holds
-        *     the IND entries from the array loaded by the PT.
-        */
-       if (boot_cpu) {
-               /* Look for supported page sizes */
-               setup_page_sizes();
-
-               /* Look for HW tablewalk support */
-               setup_mmu_htw();
-       }
-
        /* Set MAS4 based on page table setting */
 
        mas4 = 0x4 << MAS4_WIMGED_SHIFT;
@@ -650,11 +618,6 @@ static void __early_init_mmu(int boot_cpu)
        }
        mtspr(SPRN_MAS4, mas4);
 
-       /* Set the global containing the top of the linear mapping
-        * for use by the TLB miss code
-        */
-       linear_map_top = memblock_end_of_DRAM();
-
 #ifdef CONFIG_PPC_FSL_BOOK3E
        if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
                unsigned int num_cams;
@@ -662,10 +625,49 @@ static void __early_init_mmu(int boot_cpu)
                /* use a quarter of the TLBCAM for bolted linear map */
                num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
                linear_map_top = map_mem_in_cams(linear_map_top, num_cams);
+       }
+#endif
 
-               /* limit memory so we dont have linear faults */
-               memblock_enforce_memory_limit(linear_map_top);
+       /* A sync won't hurt us after mucking around with
+        * the MMU configuration
+        */
+       mb();
+}
 
+static void __init early_init_mmu_global(void)
+{
+       /* XXX This will have to be decided at runtime, but right
+        * now our boot and TLB miss code hard wires it. Ideally
+        * we should find out a suitable page size and patch the
+        * TLB miss code (either that or use the PACA to store
+        * the value we want)
+        */
+       mmu_linear_psize = MMU_PAGE_1G;
+
+       /* XXX This should be decided at runtime based on supported
+        * page sizes in the TLB, but for now let's assume 16M is
+        * always there and a good fit (which it probably is)
+        *
+        * Freescale booke only supports 4K pages in TLB0, so use that.
+        */
+       if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
+               mmu_vmemmap_psize = MMU_PAGE_4K;
+       else
+               mmu_vmemmap_psize = MMU_PAGE_16M;
+
+       /* XXX This code only checks for TLB 0 capabilities and doesn't
+        *     check what page size combos are supported by the HW. It
+        *     also doesn't handle the case where a separate array holds
+        *     the IND entries from the array loaded by the PT.
+        */
+       /* Look for supported page sizes */
+       setup_page_sizes();
+
+       /* Look for HW tablewalk support */
+       setup_mmu_htw();
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+       if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
                if (book3e_htw_mode == PPC_HTW_NONE) {
                        extlb_level_exc = EX_TLB_SIZE;
                        patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
@@ -675,22 +677,41 @@ static void __early_init_mmu(int boot_cpu)
        }
 #endif
 
-       /* A sync won't hurt us after mucking around with
-        * the MMU configuration
+       /* Set the global containing the top of the linear mapping
+        * for use by the TLB miss code
         */
-       mb();
+       linear_map_top = memblock_end_of_DRAM();
+}
+
+static void __init early_mmu_set_memory_limit(void)
+{
+#ifdef CONFIG_PPC_FSL_BOOK3E
+       if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
+               /*
+                * Limit memory so we dont have linear faults.
+                * Unlike memblock_set_current_limit, which limits
+                * memory available during early boot, this permanently
+                * reduces the memory available to Linux.  We need to
+                * do this because highmem is not supported on 64-bit.
+                */
+               memblock_enforce_memory_limit(linear_map_top);
+       }
+#endif
 
        memblock_set_current_limit(linear_map_top);
 }
 
+/* boot cpu only */
 void __init early_init_mmu(void)
 {
-       __early_init_mmu(1);
+       early_init_mmu_global();
+       early_init_this_mmu();
+       early_mmu_set_memory_limit();
 }
 
 void early_init_mmu_secondary(void)
 {
-       __early_init_mmu(0);
+       early_init_this_mmu();
 }
 
 void setup_initial_memory_limit(phys_addr_t first_memblock_base,
index 66d0f179650f4834930c9f6417952a42143e1887..70d4f748b54bead98a8889ea3de6d344dbb3ca31 100644 (file)
@@ -223,7 +223,7 @@ e_free:
                pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:"
                       " rc=%ld\n",
                       catalog_version_num, page_offset, hret);
-       kfree(page);
+       kmem_cache_free(hv_page_cache, page);
 
        pr_devel("catalog_read: offset=%lld(%lld) count=%zu(%zu) catalog_len=%zu(%zu) => %zd\n",
                        offset, page_offset, count, page_count, catalog_len,
index 1413e72bc2e1489d38f819041fe476f9ad60c2ef..4882bfd90e27c820b42db5307a08d4c271f48ded 100644 (file)
@@ -2805,25 +2805,20 @@ set_initial_features(void)
                /* Enable GMAC for now for PCI probing. It will be disabled
                 * later on after PCI probe
                 */
-               np = of_find_node_by_name(NULL, "ethernet");
-               while(np) {
+               for_each_node_by_name(np, "ethernet")
                        if (of_device_is_compatible(np, "K2-GMAC"))
                                g5_gmac_enable(np, 0, 1);
-                       np = of_find_node_by_name(np, "ethernet");
-               }
 
                /* Enable FW before PCI probe. Will be disabled later on
                 * Note: We should have a batter way to check that we are
                 * dealing with uninorth internal cell and not a PCI cell
                 * on the external PCI. The code below works though.
                 */
-               np = of_find_node_by_name(NULL, "firewire");
-               while(np) {
+               for_each_node_by_name(np, "firewire") {
                        if (of_device_is_compatible(np, "pci106b,5811")) {
                                macio_chips[0].flags |= MACIO_FLAG_FW_SUPPORTED;
                                g5_fw_enable(np, 0, 1);
                        }
-                       np = of_find_node_by_name(np, "firewire");
                }
        }
 #else /* CONFIG_PPC64 */
@@ -2834,13 +2829,11 @@ set_initial_features(void)
                /* Enable GMAC for now for PCI probing. It will be disabled
                 * later on after PCI probe
                 */
-               np = of_find_node_by_name(NULL, "ethernet");
-               while(np) {
+               for_each_node_by_name(np, "ethernet") {
                        if (np->parent
                            && of_device_is_compatible(np->parent, "uni-north")
                            && of_device_is_compatible(np, "gmac"))
                                core99_gmac_enable(np, 0, 1);
-                       np = of_find_node_by_name(np, "ethernet");
                }
 
                /* Enable FW before PCI probe. Will be disabled later on
@@ -2848,8 +2841,7 @@ set_initial_features(void)
                 * dealing with uninorth internal cell and not a PCI cell
                 * on the external PCI. The code below works though.
                 */
-               np = of_find_node_by_name(NULL, "firewire");
-               while(np) {
+               for_each_node_by_name(np, "firewire") {
                        if (np->parent
                            && of_device_is_compatible(np->parent, "uni-north")
                            && (of_device_is_compatible(np, "pci106b,18") ||
@@ -2858,18 +2850,16 @@ set_initial_features(void)
                                macio_chips[0].flags |= MACIO_FLAG_FW_SUPPORTED;
                                core99_firewire_enable(np, 0, 1);
                        }
-                       np = of_find_node_by_name(np, "firewire");
                }
 
                /* Enable ATA-100 before PCI probe. */
                np = of_find_node_by_name(NULL, "ata-6");
-               while(np) {
+               for_each_node_by_name(np, "ata-6") {
                        if (np->parent
                            && of_device_is_compatible(np->parent, "uni-north")
                            && of_device_is_compatible(np, "kauai-ata")) {
                                core99_ata100_enable(np, 1);
                        }
-                       np = of_find_node_by_name(np, "ata-6");
                }
 
                /* Switch airport off */
index cf7009b8c7b6481b6c36339e3c0915809f2e0325..7e868ccf3b0d9451ad7efdf0cf0a6f01bb7b4f7a 100644 (file)
@@ -698,7 +698,7 @@ static void __init fixup_nec_usb2(void)
 {
        struct device_node *nec;
 
-       for (nec = NULL; (nec = of_find_node_by_name(nec, "usb")) != NULL;) {
+       for_each_node_by_name(nec, "usb") {
                struct pci_controller *hose;
                u32 data;
                const u32 *prop;
index 5cbd4d67d5c445af65be43099477df712193755b..af094ae03dbbc9dfddec65e69da279d3ab8a0b7f 100644 (file)
@@ -577,7 +577,7 @@ static void __init smp_core99_setup_i2c_hwsync(int ncpus)
        int ok;
 
        /* Look for the clock chip */
-       while ((cc = of_find_node_by_name(cc, "i2c-hwclock")) != NULL) {
+       for_each_node_by_name(cc, "i2c-hwclock") {
                p = of_get_parent(cc);
                ok = p && of_device_is_compatible(p, "uni-n-i2c");
                of_node_put(p);
index 44e0b55a2a028f4227ae0b22f31791f1ac913218..366bd221edecb089c9b887ba62b271c90694719f 100644 (file)
@@ -191,7 +191,7 @@ int __init udbg_adb_init(int force_btext)
         * of type "adb". If not, we return a failure, but we keep the
         * bext output set for now
         */
-       for (np = NULL; (np = of_find_node_by_name(np, "keyboard")) != NULL;) {
+       for_each_node_by_name(np, "keyboard") {
                struct device_node *parent = of_get_parent(np);
                int found = (parent && strcmp(parent->type, "adb") == 0);
                of_node_put(parent);
index a328be44880f850e39a699d3c11ab30d586bf448..2e6ce1b8dc8ffcdd0d21cd86034c9e86e745880b 100644 (file)
@@ -245,3 +245,5 @@ OPAL_CALL(opal_sensor_read,                 OPAL_SENSOR_READ);
 OPAL_CALL(opal_get_param,                      OPAL_GET_PARAM);
 OPAL_CALL(opal_set_param,                      OPAL_SET_PARAM);
 OPAL_CALL(opal_handle_hmi,                     OPAL_HANDLE_HMI);
+OPAL_CALL(opal_register_dump_region,           OPAL_REGISTER_DUMP_REGION);
+OPAL_CALL(opal_unregister_dump_region,         OPAL_UNREGISTER_DUMP_REGION);
index f0a01a46a57d12b33f3ec2f6fe853ec59b4a0b64..b44eec3e8dbd8dbfb43873af73f006d20081b314 100644 (file)
@@ -605,6 +605,24 @@ static int opal_sysfs_init(void)
        return 0;
 }
 
+static void __init opal_dump_region_init(void)
+{
+       void *addr;
+       uint64_t size;
+       int rc;
+
+       /* Register kernel log buffer */
+       addr = log_buf_addr_get();
+       size = log_buf_len_get();
+       rc = opal_register_dump_region(OPAL_DUMP_REGION_LOG_BUF,
+                                      __pa(addr), size);
+       /* Don't warn if this is just an older OPAL that doesn't
+        * know about that call
+        */
+       if (rc && rc != OPAL_UNSUPPORTED)
+               pr_warn("DUMP: Failed to register kernel log buffer. "
+                       "rc = %d\n", rc);
+}
 static int __init opal_init(void)
 {
        struct device_node *np, *consoles;
@@ -654,6 +672,8 @@ static int __init opal_init(void)
        /* Create "opal" kobject under /sys/firmware */
        rc = opal_sysfs_init();
        if (rc == 0) {
+               /* Setup dump region interface */
+               opal_dump_region_init();
                /* Setup error log interface */
                rc = opal_elog_init();
                /* Setup code update interface */
@@ -694,6 +714,9 @@ void opal_shutdown(void)
                else
                        mdelay(10);
        }
+
+       /* Unregister memory dump region */
+       opal_unregister_dump_region(OPAL_DUMP_REGION_LOG_BUF);
 }
 
 /* Export this so that test modules can use it */
index b136108ddc9910a23af7efff65223dbbf81da2b6..df241b11d4f7d346f4a85c0632668677a9f3e07a 100644 (file)
@@ -857,7 +857,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
 
        pe = &phb->ioda.pe_array[pdn->pe_number];
        WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
-       set_iommu_table_base(&pdev->dev, &pe->tce32_table);
+       set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table);
 }
 
 static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
index 7995135170a31a35a7824703f72fe26b93b145a7..c904583baf4b94d46e5a0ae3b4f5dca134a4bc87 100644 (file)
@@ -146,7 +146,7 @@ static inline int pseries_remove_memblock(unsigned long base,
 }
 static inline int pseries_remove_mem_node(struct device_node *np)
 {
-       return -EOPNOTSUPP;
+       return 0;
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
@@ -194,7 +194,7 @@ static int pseries_update_drconf_memory(struct of_prop_reconfig *pr)
        if (!memblock_size)
                return -EINVAL;
 
-       p = (u32 *)of_get_property(pr->dn, "ibm,dynamic-memory", NULL);
+       p = (u32 *) pr->old_prop->value;
        if (!p)
                return -EINVAL;
 
index 4557e91626c43bd58d95a1730456557eab3aee2f..eedb64594dc560ee585e13a7fb4e411132cd84b1 100644 (file)
@@ -163,8 +163,8 @@ int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head,
                        return retval;
                }
 
-               last_p_partition_ID = pi_buff[0];
-               last_p_unit_address = pi_buff[1];
+               last_p_partition_ID = be64_to_cpu(pi_buff[0]);
+               last_p_unit_address = be64_to_cpu(pi_buff[1]);
 
                /* This indicates that there are no further partners */
                if (last_p_partition_ID == ~0UL
index 33b552ffbe576d4b452f31aa22af5b6a16a9cd0c..4642d6a4d35641d5219a2378ce917409cea4c850 100644 (file)
@@ -721,13 +721,13 @@ static int __init disable_ddw_setup(char *str)
 
 early_param("disable_ddw", disable_ddw_setup);
 
-static void remove_ddw(struct device_node *np)
+static void remove_ddw(struct device_node *np, bool remove_prop)
 {
        struct dynamic_dma_window_prop *dwp;
        struct property *win64;
        const u32 *ddw_avail;
        u64 liobn;
-       int len, ret;
+       int len, ret = 0;
 
        ddw_avail = of_get_property(np, "ibm,ddw-applicable", &len);
        win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
@@ -761,7 +761,8 @@ static void remove_ddw(struct device_node *np)
                        np->full_name, ret, ddw_avail[2], liobn);
 
 delprop:
-       ret = of_remove_property(np, win64);
+       if (remove_prop)
+               ret = of_remove_property(np, win64);
        if (ret)
                pr_warning("%s: failed to remove direct window property: %d\n",
                        np->full_name, ret);
@@ -805,7 +806,7 @@ static int find_existing_ddw_windows(void)
                window = kzalloc(sizeof(*window), GFP_KERNEL);
                if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
                        kfree(window);
-                       remove_ddw(pdn);
+                       remove_ddw(pdn, true);
                        continue;
                }
 
@@ -1045,7 +1046,7 @@ out_free_window:
        kfree(window);
 
 out_clear_window:
-       remove_ddw(pdn);
+       remove_ddw(pdn, true);
 
 out_free_prop:
        kfree(win64->name);
@@ -1255,7 +1256,14 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
 
        switch (action) {
        case OF_RECONFIG_DETACH_NODE:
-               remove_ddw(np);
+               /*
+                * Removing the property will invoke the reconfig
+                * notifier again, which causes dead-lock on the
+                * read-write semaphore of the notifier chain. So
+                * we have to remove the property when releasing
+                * the device node.
+                */
+               remove_ddw(np, false);
                if (pci && pci->iommu_table)
                        iommu_free_table(pci->iommu_table, np->full_name);
 
index fbfcef514aa742e8378d2bd959bdfee9cb8d9d98..34e64237fff9a9ca4cf6d62751e99feff41a3f1e 100644 (file)
@@ -431,16 +431,17 @@ static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
                spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
 }
 
-static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm,
-                                      unsigned char *hpte_slot_array,
-                                      unsigned long addr, int psize)
+static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
+                                            unsigned long addr,
+                                            unsigned char *hpte_slot_array,
+                                            int psize, int ssize)
 {
-       int ssize = 0, i, index = 0;
+       int i, index = 0;
        unsigned long s_addr = addr;
        unsigned int max_hpte_count, valid;
        unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
        unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
-       unsigned long shift, hidx, vpn = 0, vsid, hash, slot;
+       unsigned long shift, hidx, vpn = 0, hash, slot;
 
        shift = mmu_psize_defs[psize].shift;
        max_hpte_count = 1U << (PMD_SHIFT - shift);
@@ -453,15 +454,6 @@ static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm,
 
                /* get the vpn */
                addr = s_addr + (i * (1ul << shift));
-               if (!is_kernel_addr(addr)) {
-                       ssize = user_segment_size(addr);
-                       vsid = get_vsid(mm->context.id, addr, ssize);
-                       WARN_ON(vsid == 0);
-               } else {
-                       vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
-                       ssize = mmu_kernel_ssize;
-               }
-
                vpn = hpt_vpn(addr, vsid, ssize);
                hash = hpt_hash(vpn, shift, ssize);
                if (hidx & _PTEIDX_SECONDARY)
index cfe8a6389a513a29b5b49fc720d9682430eebcac..e724d3186e739999cc6daf726d073fe035d26739 100644 (file)
@@ -232,8 +232,7 @@ static void __init pseries_discover_pic(void)
        struct device_node *np;
        const char *typep;
 
-       for (np = NULL; (np = of_find_node_by_name(np,
-                                                  "interrupt-controller"));) {
+       for_each_node_by_name(np, "interrupt-controller") {
                typep = of_get_property(np, "compatible", NULL);
                if (strstr(typep, "open-pic")) {
                        pSeries_mpic_node = of_node_get(np);
index 8d198b5e9e0a72813cc2beb702826e1a52072a4f..b988b5addf864a581ff8c36e177379c32ba92518 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/bug.h>
+#include <linux/nmi.h>
 
 #include <asm/ptrace.h>
 #include <asm/string.h>
@@ -374,6 +375,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 #endif
 
        local_irq_save(flags);
+       hard_irq_disable();
 
        bp = in_breakpoint_table(regs->nip, &offset);
        if (bp != NULL) {
@@ -558,6 +560,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 #endif
        insert_cpu_bpts();
 
+       touch_nmi_watchdog();
        local_irq_restore(flags);
 
        return cmd != 'X' && cmd != EOF;
index c0413046483ae9862c1d487348b3a701a2045e3a..1580e7a5a4cf7600d0f424ddd5a66d76efef6b2f 100644 (file)
@@ -118,6 +118,7 @@ static int __init xlated_setup_gnttab_pages(void)
 {
        struct page **pages;
        xen_pfn_t *pfns;
+       void *vaddr;
        int rc;
        unsigned int i;
        unsigned long nr_grant_frames = gnttab_max_grant_frames();
@@ -143,21 +144,20 @@ static int __init xlated_setup_gnttab_pages(void)
        for (i = 0; i < nr_grant_frames; i++)
                pfns[i] = page_to_pfn(pages[i]);
 
-       rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames,
-                                   &xen_auto_xlat_grant_frames.vaddr);
-
-       if (rc) {
+       vaddr = vmap(pages, nr_grant_frames, 0, PAGE_KERNEL);
+       if (!vaddr) {
                pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__,
                        nr_grant_frames, rc);
                free_xenballooned_pages(nr_grant_frames, pages);
                kfree(pages);
                kfree(pfns);
-               return rc;
+               return -ENOMEM;
        }
        kfree(pages);
 
        xen_auto_xlat_grant_frames.pfn = pfns;
        xen_auto_xlat_grant_frames.count = nr_grant_frames;
+       xen_auto_xlat_grant_frames.vaddr = vaddr;
 
        return 0;
 }
index 7b78f88c1707b994b34f20408bc1b6046447eb22..5718b0b58b60f663a845207af02d930962e96b8f 100644 (file)
@@ -444,7 +444,7 @@ void xen_setup_timer(int cpu)
 
        irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
                                      IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
-                                     IRQF_FORCE_RESUME,
+                                     IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
                                      name, NULL);
        (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
 
index 9e241063a616f2c4ad023a08c8b6b06d676adb56..bc423f7b02da856a6987089b6750322c9b7cbfe4 100644 (file)
@@ -70,8 +70,10 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
                                          bs->bvec_integrity_pool);
                if (!bip->bip_vec)
                        goto err;
+               bip->bip_max_vcnt = bvec_nr_vecs(idx);
        } else {
                bip->bip_vec = bip->bip_inline_vecs;
+               bip->bip_max_vcnt = inline_vecs;
        }
 
        bip->bip_slab = idx;
@@ -114,14 +116,6 @@ void bio_integrity_free(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_integrity_free);
 
-static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip)
-{
-       if (bip->bip_slab == BIO_POOL_NONE)
-               return BIP_INLINE_VECS;
-
-       return bvec_nr_vecs(bip->bip_slab);
-}
-
 /**
  * bio_integrity_add_page - Attach integrity metadata
  * @bio:       bio to update
@@ -137,7 +131,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
        struct bio_integrity_payload *bip = bio->bi_integrity;
        struct bio_vec *iv;
 
-       if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
+       if (bip->bip_vcnt >= bip->bip_max_vcnt) {
                printk(KERN_ERR "%s: bip_vec full\n", __func__);
                return 0;
        }
index 0ec61c9e536c2032778db165c7b31d98bdc5e735..3e6331d25d90c6aa507acd43bb90f2001c9e6641 100644 (file)
@@ -112,7 +112,8 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
        bslab = &bio_slabs[entry];
 
        snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
-       slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
+       slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN,
+                                SLAB_HWCACHE_ALIGN, NULL);
        if (!slab)
                goto out_unlock;
 
index 6f8dba161bfe1fbc50ee9d1091bd1ff6513e0aad..c359d72e9d76f24a44b7c4b6b8c36c6677436191 100644 (file)
@@ -438,14 +438,17 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all)
  */
 void blk_queue_bypass_start(struct request_queue *q)
 {
-       bool drain;
-
        spin_lock_irq(q->queue_lock);
-       drain = !q->bypass_depth++;
+       q->bypass_depth++;
        queue_flag_set(QUEUE_FLAG_BYPASS, q);
        spin_unlock_irq(q->queue_lock);
 
-       if (drain) {
+       /*
+        * Queues start drained.  Skip actual draining till init is
+        * complete.  This avoids lenghty delays during queue init which
+        * can happen many times during boot.
+        */
+       if (blk_queue_init_done(q)) {
                spin_lock_irq(q->queue_lock);
                __blk_drain_queue(q, false);
                spin_unlock_irq(q->queue_lock);
@@ -511,7 +514,7 @@ void blk_cleanup_queue(struct request_queue *q)
         * prevent that q->request_fn() gets invoked after draining finished.
         */
        if (q->mq_ops) {
-               blk_mq_drain_queue(q);
+               blk_mq_freeze_queue(q);
                spin_lock_irq(lock);
        } else {
                spin_lock_irq(lock);
index ad69ef657e850cc79c6379667c0c501f9400a551..5189cb1e478a6b283609006364c01f9f8e31d082 100644 (file)
@@ -78,68 +78,47 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 
 static int blk_mq_queue_enter(struct request_queue *q)
 {
-       int ret;
-
-       __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
-       smp_wmb();
-
-       /* we have problems freezing the queue if it's initializing */
-       if (!blk_queue_dying(q) &&
-           (!blk_queue_bypass(q) || !blk_queue_init_done(q)))
-               return 0;
-
-       __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
+       while (true) {
+               int ret;
 
-       spin_lock_irq(q->queue_lock);
-       ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
-               !blk_queue_bypass(q) || blk_queue_dying(q),
-               *q->queue_lock);
-       /* inc usage with lock hold to avoid freeze_queue runs here */
-       if (!ret && !blk_queue_dying(q))
-               __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
-       else if (blk_queue_dying(q))
-               ret = -ENODEV;
-       spin_unlock_irq(q->queue_lock);
+               if (percpu_ref_tryget_live(&q->mq_usage_counter))
+                       return 0;
 
-       return ret;
+               ret = wait_event_interruptible(q->mq_freeze_wq,
+                               !q->mq_freeze_depth || blk_queue_dying(q));
+               if (blk_queue_dying(q))
+                       return -ENODEV;
+               if (ret)
+                       return ret;
+       }
 }
 
 static void blk_mq_queue_exit(struct request_queue *q)
 {
-       __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
+       percpu_ref_put(&q->mq_usage_counter);
 }
 
-void blk_mq_drain_queue(struct request_queue *q)
+static void blk_mq_usage_counter_release(struct percpu_ref *ref)
 {
-       while (true) {
-               s64 count;
-
-               spin_lock_irq(q->queue_lock);
-               count = percpu_counter_sum(&q->mq_usage_counter);
-               spin_unlock_irq(q->queue_lock);
+       struct request_queue *q =
+               container_of(ref, struct request_queue, mq_usage_counter);
 
-               if (count == 0)
-                       break;
-               blk_mq_start_hw_queues(q);
-               msleep(10);
-       }
+       wake_up_all(&q->mq_freeze_wq);
 }
 
 /*
  * Guarantee no request is in use, so we can change any data structure of
  * the queue afterward.
  */
-static void blk_mq_freeze_queue(struct request_queue *q)
+void blk_mq_freeze_queue(struct request_queue *q)
 {
-       bool drain;
-
        spin_lock_irq(q->queue_lock);
-       drain = !q->bypass_depth++;
-       queue_flag_set(QUEUE_FLAG_BYPASS, q);
+       q->mq_freeze_depth++;
        spin_unlock_irq(q->queue_lock);
 
-       if (drain)
-               blk_mq_drain_queue(q);
+       percpu_ref_kill(&q->mq_usage_counter);
+       blk_mq_run_queues(q, false);
+       wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
 }
 
 static void blk_mq_unfreeze_queue(struct request_queue *q)
@@ -147,14 +126,13 @@ static void blk_mq_unfreeze_queue(struct request_queue *q)
        bool wake = false;
 
        spin_lock_irq(q->queue_lock);
-       if (!--q->bypass_depth) {
-               queue_flag_clear(QUEUE_FLAG_BYPASS, q);
-               wake = true;
-       }
-       WARN_ON_ONCE(q->bypass_depth < 0);
+       wake = !--q->mq_freeze_depth;
+       WARN_ON_ONCE(q->mq_freeze_depth < 0);
        spin_unlock_irq(q->queue_lock);
-       if (wake)
+       if (wake) {
+               percpu_ref_reinit(&q->mq_usage_counter);
                wake_up_all(&q->mq_freeze_wq);
+       }
 }
 
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
@@ -1798,7 +1776,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
        if (!q)
                goto err_hctxs;
 
-       if (percpu_counter_init(&q->mq_usage_counter, 0))
+       if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release))
                goto err_map;
 
        setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
@@ -1891,7 +1869,7 @@ void blk_mq_free_queue(struct request_queue *q)
        blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
        blk_mq_free_hw_queues(q, set);
 
-       percpu_counter_destroy(&q->mq_usage_counter);
+       percpu_ref_exit(&q->mq_usage_counter);
 
        free_percpu(q->queue_ctx);
        kfree(q->queue_hw_ctx);
@@ -2050,8 +2028,7 @@ static int __init blk_mq_init(void)
 {
        blk_mq_cpu_init();
 
-       /* Must be called after percpu_counter_hotcpu_callback() */
-       hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
+       hotcpu_notifier(blk_mq_queue_reinit_notify, 0);
 
        return 0;
 }
index 26460884c6cd835202e90b2d5985c827495ca626..ca4964a6295d48490f22e25f513c39165af190a4 100644 (file)
@@ -28,7 +28,7 @@ struct blk_mq_ctx {
 void __blk_mq_complete_request(struct request *rq);
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_init_flush(struct request_queue *q);
-void blk_mq_drain_queue(struct request_queue *q);
+void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 void blk_mq_clone_flush_request(struct request *flush_rq,
                struct request *orig_rq);
index 23321fbab29318ae5b550216c66eb9ae2d026c52..4db5abf96b9ec1595a822efebfb9df9776af6477 100644 (file)
@@ -554,8 +554,8 @@ int blk_register_queue(struct gendisk *disk)
         * Initialization must be complete by now.  Finish the initial
         * bypass from queue allocation.
         */
-       blk_queue_bypass_end(q);
        queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
+       blk_queue_bypass_end(q);
 
        ret = blk_trace_init_sysfs(dev);
        if (ret)
index a0926a6094b28a7e4e67b3a88afc993719294405..18b282ce361e12b20ac8952b1bfb1e7c178d1649 100644 (file)
@@ -663,6 +663,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        fmode_t mode = file->f_mode;
        struct backing_dev_info *bdi;
        loff_t size;
+       unsigned int max_sectors;
 
        /*
         * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
@@ -719,8 +720,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        case BLKSSZGET: /* get block device hardware sector size */
                return compat_put_int(arg, bdev_logical_block_size(bdev));
        case BLKSECTGET:
-               return compat_put_ushort(arg,
-                                        queue_max_sectors(bdev_get_queue(bdev)));
+               max_sectors = min_t(unsigned int, USHRT_MAX,
+                                   queue_max_sectors(bdev_get_queue(bdev)));
+               return compat_put_ushort(arg, max_sectors);
        case BLKROTATIONAL:
                return compat_put_ushort(arg,
                                         !blk_queue_nonrot(bdev_get_queue(bdev)));
index 7d5c3b20af451a111834efbabbdd83269085f123..d6cda8147c91ea828ea0dcd45f21d4ec6773abd6 100644 (file)
@@ -278,6 +278,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
        struct backing_dev_info *bdi;
        loff_t size;
        int ret, n;
+       unsigned int max_sectors;
 
        switch(cmd) {
        case BLKFLSBUF:
@@ -375,7 +376,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
        case BLKDISCARDZEROES:
                return put_uint(arg, bdev_discard_zeroes_data(bdev));
        case BLKSECTGET:
-               return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
+               max_sectors = min_t(unsigned int, USHRT_MAX,
+                                   queue_max_sectors(bdev_get_queue(bdev)));
+               return put_ushort(arg, max_sectors);
        case BLKROTATIONAL:
                return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev)));
        case BLKRASET:
index 43be471d9b1dd8e6771b31328be679f0b53821d9..f3ed7b2d89bf5eae8e4c128472c03fb174330e24 100644 (file)
@@ -215,7 +215,7 @@ int aix_partition(struct parsed_partitions *state)
                numlvs = be16_to_cpu(p->numlvs);
                put_dev_sector(sect);
        }
-       lvip = kzalloc(sizeof(struct lv_info) * state->limit, GFP_KERNEL);
+       lvip = kcalloc(state->limit, sizeof(struct lv_info), GFP_KERNEL);
        if (!lvip)
                return 0;
        if (numlvs && (d = read_part_sector(state, vgda_sector + 1, &sect))) {
@@ -253,7 +253,7 @@ int aix_partition(struct parsed_partitions *state)
                                continue;
                        }
                        lv_ix = be16_to_cpu(p->lv_ix) - 1;
-                       if (lv_ix > state->limit) {
+                       if (lv_ix >= state->limit) {
                                cur_lv_ix = -1;
                                continue;
                        }
index 70cbf44a1560d5f78a1f0cf0c8b340ec315e4899..2b13533d60a294e43f460fe1276256e34ef6b256 100644 (file)
@@ -7,6 +7,8 @@
  *  Re-organised Feb 1998 Russell King
  */
 
+#define pr_fmt(fmt) fmt
+
 #include <linux/types.h>
 #include <linux/affs_hardblocks.h>
 
@@ -40,7 +42,7 @@ int amiga_partition(struct parsed_partitions *state)
                data = read_part_sector(state, blk, &sect);
                if (!data) {
                        if (warn_no_part)
-                               printk("Dev %s: unable to read RDB block %d\n",
+                               pr_err("Dev %s: unable to read RDB block %d\n",
                                       bdevname(state->bdev, b), blk);
                        res = -1;
                        goto rdb_done;
@@ -57,12 +59,12 @@ int amiga_partition(struct parsed_partitions *state)
                *(__be32 *)(data+0xdc) = 0;
                if (checksum_block((__be32 *)data,
                                be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) {
-                       printk("Warning: Trashed word at 0xd0 in block %d "
-                               "ignored in checksum calculation\n",blk);
+                       pr_err("Trashed word at 0xd0 in block %d ignored in checksum calculation\n",
+                              blk);
                        break;
                }
 
-               printk("Dev %s: RDB in block %d has bad checksum\n",
+               pr_err("Dev %s: RDB in block %d has bad checksum\n",
                       bdevname(state->bdev, b), blk);
        }
 
@@ -83,7 +85,7 @@ int amiga_partition(struct parsed_partitions *state)
                data = read_part_sector(state, blk, &sect);
                if (!data) {
                        if (warn_no_part)
-                               printk("Dev %s: unable to read partition block %d\n",
+                               pr_err("Dev %s: unable to read partition block %d\n",
                                       bdevname(state->bdev, b), blk);
                        res = -1;
                        goto rdb_done;
index dc51f467a560558ab4812d339fe2bd24f83a00b2..56d08fd75b1a9511152eb120b6439f4afe01a00a 100644 (file)
@@ -121,7 +121,7 @@ __setup("gpt", force_gpt_fn);
 /**
  * efi_crc32() - EFI version of crc32 function
  * @buf: buffer to calculate crc32 of
- * @len - length of buf
+ * @len: length of buf
  *
  * Description: Returns EFI-style CRC32 value for @buf
  * 
@@ -240,10 +240,10 @@ done:
 
 /**
  * read_lba(): Read bytes from disk, starting at given LBA
- * @state
- * @lba
- * @buffer
- * @size_t
+ * @state: disk parsed partitions
+ * @lba: the Logical Block Address of the partition table
+ * @buffer: destination buffer
+ * @count: bytes to read
  *
  * Description: Reads @count bytes from @state->bdev into @buffer.
  * Returns number of bytes read on success, 0 on error.
@@ -277,8 +277,8 @@ static size_t read_lba(struct parsed_partitions *state,
 
 /**
  * alloc_read_gpt_entries(): reads partition entries from disk
- * @state
- * @gpt - GPT header
+ * @state: disk parsed partitions
+ * @gpt: GPT header
  * 
  * Description: Returns ptes on success,  NULL on error.
  * Allocates space for PTEs based on information found in @gpt.
@@ -312,8 +312,8 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
 
 /**
  * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
- * @state
- * @lba is the Logical Block Address of the partition table
+ * @state: disk parsed partitions
+ * @lba: the Logical Block Address of the partition table
  * 
  * Description: returns GPT header on success, NULL on error.   Allocates
  * and fills a GPT header starting at @ from @state->bdev.
@@ -340,10 +340,10 @@ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
 
 /**
  * is_gpt_valid() - tests one GPT header and PTEs for validity
- * @state
- * @lba is the logical block address of the GPT header to test
- * @gpt is a GPT header ptr, filled on return.
- * @ptes is a PTEs ptr, filled on return.
+ * @state: disk parsed partitions
+ * @lba: logical block address of the GPT header to test
+ * @gpt: GPT header ptr, filled on return.
+ * @ptes: PTEs ptr, filled on return.
  *
  * Description: returns 1 if valid,  0 on error.
  * If valid, returns pointers to newly allocated GPT header and PTEs.
@@ -461,8 +461,8 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
 
 /**
  * is_pte_valid() - tests one PTE for validity
- * @pte is the pte to check
- * @lastlba is last lba of the disk
+ * @pte: pte to check
+ * @lastlba: last lba of the disk
  *
  * Description: returns 1 if valid,  0 on error.
  */
@@ -478,9 +478,10 @@ is_pte_valid(const gpt_entry *pte, const u64 lastlba)
 
 /**
  * compare_gpts() - Search disk for valid GPT headers and PTEs
- * @pgpt is the primary GPT header
- * @agpt is the alternate GPT header
- * @lastlba is the last LBA number
+ * @pgpt: primary GPT header
+ * @agpt: alternate GPT header
+ * @lastlba: last LBA number
+ *
  * Description: Returns nothing.  Sanity checks pgpt and agpt fields
  * and prints warnings on discrepancies.
  * 
@@ -572,9 +573,10 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
 
 /**
  * find_valid_gpt() - Search disk for valid GPT headers and PTEs
- * @state
- * @gpt is a GPT header ptr, filled on return.
- * @ptes is a PTEs ptr, filled on return.
+ * @state: disk parsed partitions
+ * @gpt: GPT header ptr, filled on return.
+ * @ptes: PTEs ptr, filled on return.
+ *
  * Description: Returns 1 if valid, 0 on error.
  * If valid, returns pointers to newly allocated GPT header and PTEs.
  * Validity depends on PMBR being valid (or being overridden by the
@@ -663,7 +665,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
 
 /**
  * efi_partition(struct parsed_partitions *state)
- * @state
+ * @state: disk parsed partitions
  *
  * Description: called from check.c, if the disk contains GPT
  * partitions, sets up partition entries in the kernel.
index 9123f250b425170b325943c351f0c34fac3562ff..93e7c1b32eddd5aa27fc8c96f5f581f712541a53 100644 (file)
@@ -159,8 +159,9 @@ static void parse_extended(struct parsed_partitions *state,
                /*
                 * First process the data partition(s)
                 */
-               for (i=0; i<4; i++, p++) {
+               for (i = 0; i < 4; i++, p++) {
                        sector_t offs, size, next;
+
                        if (!nr_sects(p) || is_extended_partition(p))
                                continue;
 
@@ -194,7 +195,7 @@ static void parse_extended(struct parsed_partitions *state,
                 * It should be a link to the next logical partition.
                 */
                p -= 4;
-               for (i=0; i<4; i++, p++)
+               for (i = 0; i < 4; i++, p++)
                        if (nr_sects(p) && is_extended_partition(p))
                                break;
                if (i == 4)
@@ -243,8 +244,8 @@ static void parse_solaris_x86(struct parsed_partitions *state,
                return;
        }
        /* Ensure we can handle previous case of VTOC with 8 entries gracefully */
-       max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
-       for (i=0; i<max_nparts && state->next<state->limit; i++) {
+       max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
+       for (i = 0; i < max_nparts && state->next < state->limit; i++) {
                struct solaris_x86_slice *s = &v->v_slice[i];
                char tmp[3 + 10 + 1 + 1];
 
@@ -409,7 +410,7 @@ static void parse_minix(struct parsed_partitions *state,
        /* The first sector of a Minix partition can have either
         * a secondary MBR describing its subpartitions, or
         * the normal boot sector. */
-       if (msdos_magic_present (data + 510) &&
+       if (msdos_magic_present(data + 510) &&
            SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
                char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
 
@@ -527,6 +528,7 @@ int msdos_partition(struct parsed_partitions *state)
        for (slot = 1 ; slot <= 4 ; slot++, p++) {
                sector_t start = start_sect(p)*sector_size;
                sector_t size = nr_sects(p)*sector_size;
+
                if (!size)
                        continue;
                if (is_extended_partition(p)) {
@@ -537,6 +539,7 @@ int msdos_partition(struct parsed_partitions *state)
                         * sector, although it may not be enough/proper.
                         */
                        sector_t n = 2;
+
                        n = min(size, max(sector_size, n));
                        put_partition(state, slot, start, n);
 
index 14695c6221c821588592f65fb72507daf230a358..51bf5155ee756a4ac479e9c49fcf88824b0aeedc 100644 (file)
@@ -82,9 +82,18 @@ static int sg_set_timeout(struct request_queue *q, int __user *p)
        return err;
 }
 
+static int max_sectors_bytes(struct request_queue *q)
+{
+       unsigned int max_sectors = queue_max_sectors(q);
+
+       max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9);
+
+       return max_sectors << 9;
+}
+
 static int sg_get_reserved_size(struct request_queue *q, int __user *p)
 {
-       unsigned val = min(q->sg_reserved_size, queue_max_sectors(q) << 9);
+       int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q));
 
        return put_user(val, p);
 }
@@ -98,10 +107,8 @@ static int sg_set_reserved_size(struct request_queue *q, int __user *p)
 
        if (size < 0)
                return -EINVAL;
-       if (size > (queue_max_sectors(q) << 9))
-               size = queue_max_sectors(q) << 9;
 
-       q->sg_reserved_size = size;
+       q->sg_reserved_size = min(size, max_sectors_bytes(q));
        return 0;
 }
 
@@ -283,6 +290,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
        unsigned long start_time;
        ssize_t ret = 0;
        int writing = 0;
+       int at_head = 0;
        struct request *rq;
        char sense[SCSI_SENSE_BUFFERSIZE];
        struct bio *bio;
@@ -306,6 +314,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
                case SG_DXFER_FROM_DEV:
                        break;
                }
+       if (hdr->flags & SG_FLAG_Q_AT_HEAD)
+               at_head = 1;
 
        rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
        if (!rq)
@@ -362,7 +372,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
         * (if he doesn't check that is his problem).
         * N.B. a non-zero SCSI status is _not_ necessarily an error.
         */
-       blk_execute_rq(q, bd_disk, rq, 0);
+       blk_execute_rq(q, bd_disk, rq, at_head);
 
        hdr->duration = jiffies_to_msecs(jiffies - start_time);
 
index 8b450338075eca905b91c04e16ce3567e54f73d9..4464e353c1e81fd2faf3fe823bf244807fa0afd3 100644 (file)
@@ -3,5 +3,6 @@ drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
 drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
 drbd-y += drbd_interval.o drbd_state.o
 drbd-y += drbd_nla.o
+drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o
 
 obj-$(CONFIG_BLK_DEV_DRBD)     += drbd.o
index 05a1780ffa850483cdf4d73a89b3a9a78964ed4c..d26a3fa6368849ce95d59a7847c8853c11861026 100644 (file)
@@ -92,34 +92,26 @@ struct __packed al_transaction_on_disk {
        __be32  context[AL_CONTEXT_PER_TRANSACTION];
 };
 
-struct update_odbm_work {
-       struct drbd_work w;
-       struct drbd_device *device;
-       unsigned int enr;
-};
-
-struct update_al_work {
-       struct drbd_work w;
-       struct drbd_device *device;
-       struct completion event;
-       int err;
-};
-
-
-void *drbd_md_get_buffer(struct drbd_device *device)
+void *drbd_md_get_buffer(struct drbd_device *device, const char *intent)
 {
        int r;
 
        wait_event(device->misc_wait,
-                  (r = atomic_cmpxchg(&device->md_io_in_use, 0, 1)) == 0 ||
+                  (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 ||
                   device->state.disk <= D_FAILED);
 
-       return r ? NULL : page_address(device->md_io_page);
+       if (r)
+               return NULL;
+
+       device->md_io.current_use = intent;
+       device->md_io.start_jif = jiffies;
+       device->md_io.submit_jif = device->md_io.start_jif - 1;
+       return page_address(device->md_io.page);
 }
 
 void drbd_md_put_buffer(struct drbd_device *device)
 {
-       if (atomic_dec_and_test(&device->md_io_in_use))
+       if (atomic_dec_and_test(&device->md_io.in_use))
                wake_up(&device->misc_wait);
 }
 
@@ -145,10 +137,11 @@ void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_b
 
 static int _drbd_md_sync_page_io(struct drbd_device *device,
                                 struct drbd_backing_dev *bdev,
-                                struct page *page, sector_t sector,
-                                int rw, int size)
+                                sector_t sector, int rw)
 {
        struct bio *bio;
+       /* we do all our meta data IO in aligned 4k blocks. */
+       const int size = 4096;
        int err;
 
        device->md_io.done = 0;
@@ -156,15 +149,15 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 
        if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags))
                rw |= REQ_FUA | REQ_FLUSH;
-       rw |= REQ_SYNC;
+       rw |= REQ_SYNC | REQ_NOIDLE;
 
        bio = bio_alloc_drbd(GFP_NOIO);
        bio->bi_bdev = bdev->md_bdev;
        bio->bi_iter.bi_sector = sector;
        err = -EIO;
-       if (bio_add_page(bio, page, size, 0) != size)
+       if (bio_add_page(bio, device->md_io.page, size, 0) != size)
                goto out;
-       bio->bi_private = &device->md_io;
+       bio->bi_private = device;
        bio->bi_end_io = drbd_md_io_complete;
        bio->bi_rw = rw;
 
@@ -179,7 +172,8 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
        }
 
        bio_get(bio); /* one bio_put() is in the completion handler */
-       atomic_inc(&device->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
+       atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */
+       device->md_io.submit_jif = jiffies;
        if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
                bio_endio(bio, -EIO);
        else
@@ -197,9 +191,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
                         sector_t sector, int rw)
 {
        int err;
-       struct page *iop = device->md_io_page;
-
-       D_ASSERT(device, atomic_read(&device->md_io_in_use) == 1);
+       D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1);
 
        BUG_ON(!bdev->md_bdev);
 
@@ -214,8 +206,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
                     current->comm, current->pid, __func__,
                     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
 
-       /* we do all our meta data IO in aligned 4k blocks. */
-       err = _drbd_md_sync_page_io(device, bdev, iop, sector, rw, 4096);
+       err = _drbd_md_sync_page_io(device, bdev, sector, rw);
        if (err) {
                drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
                    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
@@ -297,26 +288,12 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *
        return need_transaction;
 }
 
-static int al_write_transaction(struct drbd_device *device, bool delegate);
-
-/* When called through generic_make_request(), we must delegate
- * activity log I/O to the worker thread: a further request
- * submitted via generic_make_request() within the same task
- * would be queued on current->bio_list, and would only start
- * after this function returns (see generic_make_request()).
- *
- * However, if we *are* the worker, we must not delegate to ourselves.
- */
+static int al_write_transaction(struct drbd_device *device);
 
-/*
- * @delegate:   delegate activity log I/O to the worker thread
- */
-void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
+void drbd_al_begin_io_commit(struct drbd_device *device)
 {
        bool locked = false;
 
-       BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);
-
        /* Serialize multiple transactions.
         * This uses test_and_set_bit, memory barrier is implicit.
         */
@@ -335,7 +312,7 @@ void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
                        rcu_read_unlock();
 
                        if (write_al_updates)
-                               al_write_transaction(device, delegate);
+                               al_write_transaction(device);
                        spin_lock_irq(&device->al_lock);
                        /* FIXME
                        if (err)
@@ -352,12 +329,10 @@ void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
 /*
  * @delegate:   delegate activity log I/O to the worker thread
  */
-void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate)
+void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i)
 {
-       BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);
-
        if (drbd_al_begin_io_prepare(device, i))
-               drbd_al_begin_io_commit(device, delegate);
+               drbd_al_begin_io_commit(device);
 }
 
 int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i)
@@ -380,8 +355,19 @@ int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *
        /* We want all necessary updates for a given request within the same transaction
         * We could first check how many updates are *actually* needed,
         * and use that instead of the worst-case nr_al_extents */
-       if (available_update_slots < nr_al_extents)
-               return -EWOULDBLOCK;
+       if (available_update_slots < nr_al_extents) {
+               /* Too many activity log extents are currently "hot".
+                *
+                * If we have accumulated pending changes already,
+                * we made progress.
+                *
+                * If we cannot get even a single pending change through,
+                * stop the fast path until we made some progress,
+                * or requests to "cold" extents could be starved. */
+               if (!al->pending_changes)
+                       __set_bit(__LC_STARVING, &device->act_log->flags);
+               return -ENOBUFS;
+       }
 
        /* Is resync active in this area? */
        for (enr = first; enr <= last; enr++) {
@@ -452,15 +438,6 @@ static unsigned int al_extent_to_bm_page(unsigned int al_enr)
                 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
 }
 
-static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
-{
-       return rs_enr >>
-               /* bit to page */
-               ((PAGE_SHIFT + 3) -
-               /* resync extent number to bit */
-                (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
-}
-
 static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
 {
        const unsigned int stripes = device->ldev->md.al_stripes;
@@ -479,8 +456,7 @@ static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
        return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
 }
 
-static int
-_al_write_transaction(struct drbd_device *device)
+int al_write_transaction(struct drbd_device *device)
 {
        struct al_transaction_on_disk *buffer;
        struct lc_element *e;
@@ -505,7 +481,8 @@ _al_write_transaction(struct drbd_device *device)
                return -EIO;
        }
 
-       buffer = drbd_md_get_buffer(device); /* protects md_io_buffer, al_tr_cycle, ... */
+       /* protects md_io_buffer, al_tr_cycle, ... */
+       buffer = drbd_md_get_buffer(device, __func__);
        if (!buffer) {
                drbd_err(device, "disk failed while waiting for md_io buffer\n");
                put_ldev(device);
@@ -590,38 +567,6 @@ _al_write_transaction(struct drbd_device *device)
        return err;
 }
 
-
-static int w_al_write_transaction(struct drbd_work *w, int unused)
-{
-       struct update_al_work *aw = container_of(w, struct update_al_work, w);
-       struct drbd_device *device = aw->device;
-       int err;
-
-       err = _al_write_transaction(device);
-       aw->err = err;
-       complete(&aw->event);
-
-       return err != -EIO ? err : 0;
-}
-
-/* Calls from worker context (see w_restart_disk_io()) need to write the
-   transaction directly. Others came through generic_make_request(),
-   those need to delegate it to the worker. */
-static int al_write_transaction(struct drbd_device *device, bool delegate)
-{
-       if (delegate) {
-               struct update_al_work al_work;
-               init_completion(&al_work.event);
-               al_work.w.cb = w_al_write_transaction;
-               al_work.device = device;
-               drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
-                                     &al_work.w);
-               wait_for_completion(&al_work.event);
-               return al_work.err;
-       } else
-               return _al_write_transaction(device);
-}
-
 static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
 {
        int rv;
@@ -682,72 +627,56 @@ int drbd_initialize_al(struct drbd_device *device, void *buffer)
        return 0;
 }
 
-static int w_update_odbm(struct drbd_work *w, int unused)
-{
-       struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
-       struct drbd_device *device = udw->device;
-       struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
-
-       if (!get_ldev(device)) {
-               if (__ratelimit(&drbd_ratelimit_state))
-                       drbd_warn(device, "Can not update on disk bitmap, local IO disabled.\n");
-               kfree(udw);
-               return 0;
-       }
-
-       drbd_bm_write_page(device, rs_extent_to_bm_page(udw->enr));
-       put_ldev(device);
-
-       kfree(udw);
-
-       if (drbd_bm_total_weight(device) <= device->rs_failed) {
-               switch (device->state.conn) {
-               case C_SYNC_SOURCE:  case C_SYNC_TARGET:
-               case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
-                       drbd_resync_finished(device);
-               default:
-                       /* nothing to do */
-                       break;
-               }
-       }
-       drbd_bcast_event(device, &sib);
-
-       return 0;
-}
-
+static const char *drbd_change_sync_fname[] = {
+       [RECORD_RS_FAILED] = "drbd_rs_failed_io",
+       [SET_IN_SYNC] = "drbd_set_in_sync",
+       [SET_OUT_OF_SYNC] = "drbd_set_out_of_sync"
+};
 
 /* ATTENTION. The AL's extents are 4MB each, while the extents in the
  * resync LRU-cache are 16MB each.
  * The caller of this function has to hold an get_ldev() reference.
  *
+ * Adjusts the caching members ->rs_left (SET_IN_SYNC, SET_OUT_OF_SYNC) or
+ * ->rs_failed (RECORD_RS_FAILED), potentially pulling this resync extent
+ * into the resync extent lru cache (and recounting the corresponding bits).
+ *
+ * Returns whether all bits have been cleared for this resync extent,
+ * precisely: (rs_left <= rs_failed)
+ *
  * TODO will be obsoleted once we have a caching lru of the on disk bitmap
  */
-static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t sector,
-                                     int count, int success)
+static bool update_rs_extent(struct drbd_device *device,
+               unsigned int enr, int count,
+               enum update_sync_bits_mode mode)
 {
        struct lc_element *e;
-       struct update_odbm_work *udw;
-
-       unsigned int enr;
 
        D_ASSERT(device, atomic_read(&device->local_cnt));
 
-       /* I simply assume that a sector/size pair never crosses
-        * a 16 MB extent border. (Currently this is true...) */
-       enr = BM_SECT_TO_EXT(sector);
-
-       e = lc_get(device->resync, enr);
+       /* When setting out-of-sync bits,
+        * we don't need it cached (lc_find).
+        * But if it is present in the cache,
+        * we should update the cached bit count.
+        * Otherwise, that extent should be in the resync extent lru cache
+        * already -- or we want to pull it in if necessary -- (lc_get),
+        * then update and check rs_left and rs_failed. */
+       if (mode == SET_OUT_OF_SYNC)
+               e = lc_find(device->resync, enr);
+       else
+               e = lc_get(device->resync, enr);
        if (e) {
                struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
                if (ext->lce.lc_number == enr) {
-                       if (success)
+                       if (mode == SET_IN_SYNC)
                                ext->rs_left -= count;
+                       else if (mode == SET_OUT_OF_SYNC)
+                               ext->rs_left += count;
                        else
                                ext->rs_failed += count;
                        if (ext->rs_left < ext->rs_failed) {
-                               drbd_warn(device, "BAD! sector=%llus enr=%u rs_left=%d "
+                               drbd_warn(device, "BAD! enr=%u rs_left=%d "
                                    "rs_failed=%d count=%d cstate=%s\n",
-                                    (unsigned long long)sector,
                                     ext->lce.lc_number, ext->rs_left,
                                     ext->rs_failed, count,
                                     drbd_conn_str(device->state.conn));
@@ -781,34 +710,27 @@ static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t secto
                                     ext->lce.lc_number, ext->rs_failed);
                        }
                        ext->rs_left = rs_left;
-                       ext->rs_failed = success ? 0 : count;
+                       ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0;
                        /* we don't keep a persistent log of the resync lru,
                         * we can commit any change right away. */
                        lc_committed(device->resync);
                }
-               lc_put(device->resync, &ext->lce);
+               if (mode != SET_OUT_OF_SYNC)
+                       lc_put(device->resync, &ext->lce);
                /* no race, we are within the al_lock! */
 
-               if (ext->rs_left == ext->rs_failed) {
+               if (ext->rs_left <= ext->rs_failed) {
                        ext->rs_failed = 0;
-
-                       udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
-                       if (udw) {
-                               udw->enr = ext->lce.lc_number;
-                               udw->w.cb = w_update_odbm;
-                               udw->device = device;
-                               drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
-                                                     &udw->w);
-                       } else {
-                               drbd_warn(device, "Could not kmalloc an udw\n");
-                       }
+                       return true;
                }
-       } else {
+       } else if (mode != SET_OUT_OF_SYNC) {
+               /* be quiet if lc_find() did not find it. */
                drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n",
                    device->resync_locked,
                    device->resync->nr_elements,
                    device->resync->flags);
        }
+       return false;
 }
 
 void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go)
@@ -827,105 +749,105 @@ void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go
        }
 }
 
-/* clear the bit corresponding to the piece of storage in question:
- * size byte of data starting from sector.  Only clear a bits of the affected
- * one ore more _aligned_ BM_BLOCK_SIZE blocks.
- *
- * called by worker on C_SYNC_TARGET and receiver on SyncSource.
- *
- */
-void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size,
-                      const char *file, const unsigned int line)
+/* It is called lazy update, so don't do write-out too often. */
+static bool lazy_bitmap_update_due(struct drbd_device *device)
 {
-       /* Is called from worker and receiver context _only_ */
-       unsigned long sbnr, ebnr, lbnr;
-       unsigned long count = 0;
-       sector_t esector, nr_sectors;
-       int wake_up = 0;
-       unsigned long flags;
+       return time_after(jiffies, device->rs_last_bcast + 2*HZ);
+}
 
-       if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
-               drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
-                               (unsigned long long)sector, size);
+static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done)
+{
+       if (rs_done)
+               set_bit(RS_DONE, &device->flags);
+               /* and also set RS_PROGRESS below */
+       else if (!lazy_bitmap_update_due(device))
                return;
-       }
-
-       if (!get_ldev(device))
-               return; /* no disk, no metadata, no bitmap to clear bits in */
-
-       nr_sectors = drbd_get_capacity(device->this_bdev);
-       esector = sector + (size >> 9) - 1;
-
-       if (!expect(sector < nr_sectors))
-               goto out;
-       if (!expect(esector < nr_sectors))
-               esector = nr_sectors - 1;
-
-       lbnr = BM_SECT_TO_BIT(nr_sectors-1);
-
-       /* we clear it (in sync).
-        * round up start sector, round down end sector.  we make sure we only
-        * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
-       if (unlikely(esector < BM_SECT_PER_BIT-1))
-               goto out;
-       if (unlikely(esector == (nr_sectors-1)))
-               ebnr = lbnr;
-       else
-               ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
-       sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
 
-       if (sbnr > ebnr)
-               goto out;
+       drbd_device_post_work(device, RS_PROGRESS);
+}
 
+static int update_sync_bits(struct drbd_device *device,
+               unsigned long sbnr, unsigned long ebnr,
+               enum update_sync_bits_mode mode)
+{
        /*
-        * ok, (capacity & 7) != 0 sometimes, but who cares...
-        * we count rs_{total,left} in bits, not sectors.
+        * We keep a count of set bits per resync-extent in the ->rs_left
+        * caching member, so we need to loop and work within the resync extent
+        * alignment. Typically this loop will execute exactly once.
         */
-       count = drbd_bm_clear_bits(device, sbnr, ebnr);
-       if (count) {
-               drbd_advance_rs_marks(device, drbd_bm_total_weight(device));
-               spin_lock_irqsave(&device->al_lock, flags);
-               drbd_try_clear_on_disk_bm(device, sector, count, true);
-               spin_unlock_irqrestore(&device->al_lock, flags);
-
-               /* just wake_up unconditional now, various lc_chaged(),
-                * lc_put() in drbd_try_clear_on_disk_bm(). */
-               wake_up = 1;
+       unsigned long flags;
+       unsigned long count = 0;
+       unsigned int cleared = 0;
+       while (sbnr <= ebnr) {
+               /* set temporary boundary bit number to last bit number within
+                * the resync extent of the current start bit number,
+                * but cap at provided end bit number */
+               unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK);
+               unsigned long c;
+
+               if (mode == RECORD_RS_FAILED)
+                       /* Only called from drbd_rs_failed_io(), bits
+                        * supposedly still set.  Recount, maybe some
+                        * of the bits have been successfully cleared
+                        * by application IO meanwhile.
+                        */
+                       c = drbd_bm_count_bits(device, sbnr, tbnr);
+               else if (mode == SET_IN_SYNC)
+                       c = drbd_bm_clear_bits(device, sbnr, tbnr);
+               else /* if (mode == SET_OUT_OF_SYNC) */
+                       c = drbd_bm_set_bits(device, sbnr, tbnr);
+
+               if (c) {
+                       spin_lock_irqsave(&device->al_lock, flags);
+                       cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode);
+                       spin_unlock_irqrestore(&device->al_lock, flags);
+                       count += c;
+               }
+               sbnr = tbnr + 1;
        }
-out:
-       put_ldev(device);
-       if (wake_up)
+       if (count) {
+               if (mode == SET_IN_SYNC) {
+                       unsigned long still_to_go = drbd_bm_total_weight(device);
+                       bool rs_is_done = (still_to_go <= device->rs_failed);
+                       drbd_advance_rs_marks(device, still_to_go);
+                       if (cleared || rs_is_done)
+                               maybe_schedule_on_disk_bitmap_update(device, rs_is_done);
+               } else if (mode == RECORD_RS_FAILED)
+                       device->rs_failed += count;
                wake_up(&device->al_wait);
+       }
+       return count;
 }
 
-/*
- * this is intended to set one request worth of data out of sync.
- * affects at least 1 bit,
- * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.
+/* Update the bitmap bits corresponding to the piece of storage in question:
+ * size bytes of data starting from sector.  For SET_IN_SYNC, only the bits of
+ * one or more fully covered, _aligned_ BM_BLOCK_SIZE blocks are cleared.
+ *
+ * called by worker on C_SYNC_TARGET and receiver on SyncSource.
  *
- * called by tl_clear and drbd_send_dblock (==drbd_make_request).
- * so this can be _any_ process.
  */
-int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size,
-                           const char *file, const unsigned int line)
+int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+               enum update_sync_bits_mode mode,
+               const char *file, const unsigned int line)
 {
-       unsigned long sbnr, ebnr, flags;
+       /* Is called from worker and receiver context _only_ */
+       unsigned long sbnr, ebnr, lbnr;
+       unsigned long count = 0;
        sector_t esector, nr_sectors;
-       unsigned int enr, count = 0;
-       struct lc_element *e;
 
-       /* this should be an empty REQ_FLUSH */
-       if (size == 0)
+       /* This would be an empty REQ_FLUSH, be silent. */
+       if ((mode == SET_OUT_OF_SYNC) && size == 0)
                return 0;
 
-       if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
-               drbd_err(device, "sector: %llus, size: %d\n",
-                       (unsigned long long)sector, size);
+       if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+               drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
+                               drbd_change_sync_fname[mode],
+                               (unsigned long long)sector, size);
                return 0;
        }
 
        if (!get_ldev(device))
-               return 0; /* no disk, no metadata, no bitmap to set bits in */
+               return 0; /* no disk, no metadata, no bitmap to manipulate bits in */
 
        nr_sectors = drbd_get_capacity(device->this_bdev);
        esector = sector + (size >> 9) - 1;
@@ -935,25 +857,28 @@ int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size
        if (!expect(esector < nr_sectors))
                esector = nr_sectors - 1;
 
-       /* we set it out of sync,
-        * we do not need to round anything here */
-       sbnr = BM_SECT_TO_BIT(sector);
-       ebnr = BM_SECT_TO_BIT(esector);
-
-       /* ok, (capacity & 7) != 0 sometimes, but who cares...
-        * we count rs_{total,left} in bits, not sectors.  */
-       spin_lock_irqsave(&device->al_lock, flags);
-       count = drbd_bm_set_bits(device, sbnr, ebnr);
+       lbnr = BM_SECT_TO_BIT(nr_sectors-1);
 
-       enr = BM_SECT_TO_EXT(sector);
-       e = lc_find(device->resync, enr);
-       if (e)
-               lc_entry(e, struct bm_extent, lce)->rs_left += count;
-       spin_unlock_irqrestore(&device->al_lock, flags);
+       if (mode == SET_IN_SYNC) {
+               /* Round up start sector, round down end sector.  We make sure
+                * we only clear full, aligned, BM_BLOCK_SIZE blocks. */
+               if (unlikely(esector < BM_SECT_PER_BIT-1))
+                       goto out;
+               if (unlikely(esector == (nr_sectors-1)))
+                       ebnr = lbnr;
+               else
+                       ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+               sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+       } else {
+               /* We set it out of sync, or record resync failure.
+                * Should not round anything here. */
+               sbnr = BM_SECT_TO_BIT(sector);
+               ebnr = BM_SECT_TO_BIT(esector);
+       }
 
+       count = update_sync_bits(device, sbnr, ebnr, mode);
 out:
        put_ldev(device);
-
        return count;
 }
 
@@ -1075,6 +1000,15 @@ int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
        struct lc_element *e;
        struct bm_extent *bm_ext;
        int i;
+       bool throttle = drbd_rs_should_slow_down(device, sector, true);
+
+       /* If we need to throttle, a half-locked extent (only marked
+        * BME_NO_WRITES, not yet BME_LOCKED) needs to be kicked out
+        * explicitly. There is at most one such half-locked extent,
+        * which is remembered in resync_wenr. */
+
+       if (throttle && device->resync_wenr != enr)
+               return -EAGAIN;
 
        spin_lock_irq(&device->al_lock);
        if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) {
@@ -1098,8 +1032,10 @@ int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
                        D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
                        clear_bit(BME_NO_WRITES, &bm_ext->flags);
                        device->resync_wenr = LC_FREE;
-                       if (lc_put(device->resync, &bm_ext->lce) == 0)
+                       if (lc_put(device->resync, &bm_ext->lce) == 0) {
+                               bm_ext->flags = 0;
                                device->resync_locked--;
+                       }
                        wake_up(&device->al_wait);
                } else {
                        drbd_alert(device, "LOGIC BUG\n");
@@ -1161,8 +1097,20 @@ proceed:
        return 0;
 
 try_again:
-       if (bm_ext)
-               device->resync_wenr = enr;
+       if (bm_ext) {
+               if (throttle) {
+                       D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+                       D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+                       clear_bit(BME_NO_WRITES, &bm_ext->flags);
+                       device->resync_wenr = LC_FREE;
+                       if (lc_put(device->resync, &bm_ext->lce) == 0) {
+                               bm_ext->flags = 0;
+                               device->resync_locked--;
+                       }
+                       wake_up(&device->al_wait);
+               } else
+                       device->resync_wenr = enr;
+       }
        spin_unlock_irq(&device->al_lock);
        return -EAGAIN;
 }
@@ -1270,69 +1218,3 @@ int drbd_rs_del_all(struct drbd_device *device)
 
        return 0;
 }
-
-/**
- * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
- * @device:    DRBD device.
- * @sector:    The sector number.
- * @size:      Size of failed IO operation, in byte.
- */
-void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size)
-{
-       /* Is called from worker and receiver context _only_ */
-       unsigned long sbnr, ebnr, lbnr;
-       unsigned long count;
-       sector_t esector, nr_sectors;
-       int wake_up = 0;
-
-       if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
-               drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
-                               (unsigned long long)sector, size);
-               return;
-       }
-       nr_sectors = drbd_get_capacity(device->this_bdev);
-       esector = sector + (size >> 9) - 1;
-
-       if (!expect(sector < nr_sectors))
-               return;
-       if (!expect(esector < nr_sectors))
-               esector = nr_sectors - 1;
-
-       lbnr = BM_SECT_TO_BIT(nr_sectors-1);
-
-       /*
-        * round up start sector, round down end sector.  we make sure we only
-        * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
-       if (unlikely(esector < BM_SECT_PER_BIT-1))
-               return;
-       if (unlikely(esector == (nr_sectors-1)))
-               ebnr = lbnr;
-       else
-               ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
-       sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
-
-       if (sbnr > ebnr)
-               return;
-
-       /*
-        * ok, (capacity & 7) != 0 sometimes, but who cares...
-        * we count rs_{total,left} in bits, not sectors.
-        */
-       spin_lock_irq(&device->al_lock);
-       count = drbd_bm_count_bits(device, sbnr, ebnr);
-       if (count) {
-               device->rs_failed += count;
-
-               if (get_ldev(device)) {
-                       drbd_try_clear_on_disk_bm(device, sector, count, false);
-                       put_ldev(device);
-               }
-
-               /* just wake_up unconditional now, various lc_chaged(),
-                * lc_put() in drbd_try_clear_on_disk_bm(). */
-               wake_up = 1;
-       }
-       spin_unlock_irq(&device->al_lock);
-       if (wake_up)
-               wake_up(&device->al_wait);
-}
index 1aa29f8fdfe1feb0f6d5d534cbdaf3fac90a530c..426c97aef9002193c6c7790040c95bdb2ec71eac 100644 (file)
@@ -22,6 +22,8 @@
    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#define pr_fmt(fmt)    KBUILD_MODNAME ": " fmt
+
 #include <linux/bitops.h>
 #include <linux/vmalloc.h>
 #include <linux/string.h>
@@ -353,9 +355,8 @@ static void bm_free_pages(struct page **pages, unsigned long number)
 
        for (i = 0; i < number; i++) {
                if (!pages[i]) {
-                       printk(KERN_ALERT "drbd: bm_free_pages tried to free "
-                                         "a NULL pointer; i=%lu n=%lu\n",
-                                         i, number);
+                       pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n",
+                                i, number);
                        continue;
                }
                __free_page(pages[i]);
@@ -592,7 +593,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
        end = offset + len;
 
        if (end > b->bm_words) {
-               printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
+               pr_alert("bm_memset end > bm_words\n");
                return;
        }
 
@@ -602,7 +603,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
                p_addr = bm_map_pidx(b, idx);
                bm = p_addr + MLPP(offset);
                if (bm+do_now > p_addr + LWPP) {
-                       printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
+                       pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
                               p_addr, bm, (int)do_now);
                } else
                        memset(bm, c, do_now * sizeof(long));
@@ -927,22 +928,14 @@ void drbd_bm_clear_all(struct drbd_device *device)
        spin_unlock_irq(&b->bm_lock);
 }
 
-struct bm_aio_ctx {
-       struct drbd_device *device;
-       atomic_t in_flight;
-       unsigned int done;
-       unsigned flags;
-#define BM_AIO_COPY_PAGES      1
-#define BM_AIO_WRITE_HINTED    2
-#define BM_WRITE_ALL_PAGES     4
-       int error;
-       struct kref kref;
-};
-
-static void bm_aio_ctx_destroy(struct kref *kref)
+static void drbd_bm_aio_ctx_destroy(struct kref *kref)
 {
-       struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref);
+       struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref);
+       unsigned long flags;
 
+       spin_lock_irqsave(&ctx->device->resource->req_lock, flags);
+       list_del(&ctx->list);
+       spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags);
        put_ldev(ctx->device);
        kfree(ctx);
 }
@@ -950,7 +943,7 @@ static void bm_aio_ctx_destroy(struct kref *kref)
 /* bv_page may be a copy, or may be the original */
 static void bm_async_io_complete(struct bio *bio, int error)
 {
-       struct bm_aio_ctx *ctx = bio->bi_private;
+       struct drbd_bm_aio_ctx *ctx = bio->bi_private;
        struct drbd_device *device = ctx->device;
        struct drbd_bitmap *b = device->bitmap;
        unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
@@ -993,17 +986,18 @@ static void bm_async_io_complete(struct bio *bio, int error)
        if (atomic_dec_and_test(&ctx->in_flight)) {
                ctx->done = 1;
                wake_up(&device->misc_wait);
-               kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+               kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
        }
 }
 
-static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
+static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
 {
        struct bio *bio = bio_alloc_drbd(GFP_NOIO);
        struct drbd_device *device = ctx->device;
        struct drbd_bitmap *b = device->bitmap;
        struct page *page;
        unsigned int len;
+       unsigned int rw = (ctx->flags & BM_AIO_READ) ? READ : WRITE;
 
        sector_t on_disk_sector =
                device->ldev->md.md_offset + device->ldev->md.bm_offset;
@@ -1049,9 +1043,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
 /*
  * bm_rw: read/write the whole bitmap from/to its on disk location.
  */
-static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
+static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
 {
-       struct bm_aio_ctx *ctx;
+       struct drbd_bm_aio_ctx *ctx;
        struct drbd_bitmap *b = device->bitmap;
        int num_pages, i, count = 0;
        unsigned long now;
@@ -1067,12 +1061,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
         * as we submit copies of pages anyways.
         */
 
-       ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+       ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO);
        if (!ctx)
                return -ENOMEM;
 
-       *ctx = (struct bm_aio_ctx) {
+       *ctx = (struct drbd_bm_aio_ctx) {
                .device = device,
+               .start_jif = jiffies,
                .in_flight = ATOMIC_INIT(1),
                .done = 0,
                .flags = flags,
@@ -1080,15 +1075,21 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
                .kref = { ATOMIC_INIT(2) },
        };
 
-       if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
+       if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in drbd_bm_aio_ctx_destroy() */
                drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
                kfree(ctx);
                return -ENODEV;
        }
+       /* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from
+          drbd_adm_attach(), after device->ldev was assigned. */
 
-       if (!ctx->flags)
+       if (0 == (ctx->flags & ~BM_AIO_READ))
                WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
 
+       spin_lock_irq(&device->resource->req_lock);
+       list_add_tail(&ctx->list, &device->pending_bitmap_io);
+       spin_unlock_irq(&device->resource->req_lock);
+
        num_pages = b->bm_number_of_pages;
 
        now = jiffies;
@@ -1098,13 +1099,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
                /* ignore completely unchanged pages */
                if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
                        break;
-               if (rw & WRITE) {
+               if (!(flags & BM_AIO_READ)) {
                        if ((flags & BM_AIO_WRITE_HINTED) &&
                            !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
                                    &page_private(b->bm_pages[i])))
                                continue;
 
-                       if (!(flags & BM_WRITE_ALL_PAGES) &&
+                       if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
                            bm_test_page_unchanged(b->bm_pages[i])) {
                                dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
                                continue;
@@ -1118,7 +1119,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
                        }
                }
                atomic_inc(&ctx->in_flight);
-               bm_page_io_async(ctx, i, rw);
+               bm_page_io_async(ctx, i);
                ++count;
                cond_resched();
        }
@@ -1134,12 +1135,12 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
        if (!atomic_dec_and_test(&ctx->in_flight))
                wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
        else
-               kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+               kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
 
        /* summary for global bitmap IO */
        if (flags == 0)
                drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n",
-                        rw == WRITE ? "WRITE" : "READ",
+                        (flags & BM_AIO_READ) ? "READ" : "WRITE",
                         count, jiffies - now);
 
        if (ctx->error) {
@@ -1152,20 +1153,18 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
                err = -EIO; /* Disk timeout/force-detach during IO... */
 
        now = jiffies;
-       if (rw == WRITE) {
-               drbd_md_flush(device);
-       } else /* rw == READ */ {
+       if (flags & BM_AIO_READ) {
                b->bm_set = bm_count_bits(b);
                drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
                     jiffies - now);
        }
        now = b->bm_set;
 
-       if (flags == 0)
+       if ((flags & ~BM_AIO_READ) == 0)
                drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
                     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
 
-       kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+       kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
        return err;
 }
 
@@ -1175,7 +1174,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
  */
 int drbd_bm_read(struct drbd_device *device) __must_hold(local)
 {
-       return bm_rw(device, READ, 0, 0);
+       return bm_rw(device, BM_AIO_READ, 0);
 }
 
 /**
@@ -1186,7 +1185,7 @@ int drbd_bm_read(struct drbd_device *device) __must_hold(local)
  */
 int drbd_bm_write(struct drbd_device *device) __must_hold(local)
 {
-       return bm_rw(device, WRITE, 0, 0);
+       return bm_rw(device, 0, 0);
 }
 
 /**
@@ -1197,7 +1196,17 @@ int drbd_bm_write(struct drbd_device *device) __must_hold(local)
  */
 int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
 {
-       return bm_rw(device, WRITE, BM_WRITE_ALL_PAGES, 0);
+       return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0);
+}
+
+/**
+ * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
+ * @device:    DRBD device.
+ * @upper_idx: 0: write all changed pages; positive: page index at which to stop scanning for changed pages
+ */
+int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local)
+{
+       return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx);
 }
 
 /**
@@ -1213,7 +1222,7 @@ int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
  */
 int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
 {
-       return bm_rw(device, WRITE, BM_AIO_COPY_PAGES, 0);
+       return bm_rw(device, BM_AIO_COPY_PAGES, 0);
 }
 
 /**
@@ -1222,62 +1231,7 @@ int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
  */
 int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
 {
-       return bm_rw(device, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
-}
-
-/**
- * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
- * @device:    DRBD device.
- * @idx:       bitmap page index
- *
- * We don't want to special case on logical_block_size of the backend device,
- * so we submit PAGE_SIZE aligned pieces.
- * Note that on "most" systems, PAGE_SIZE is 4k.
- *
- * In case this becomes an issue on systems with larger PAGE_SIZE,
- * we may want to change this again to write 4k aligned 4k pieces.
- */
-int drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local)
-{
-       struct bm_aio_ctx *ctx;
-       int err;
-
-       if (bm_test_page_unchanged(device->bitmap->bm_pages[idx])) {
-               dynamic_drbd_dbg(device, "skipped bm page write for idx %u\n", idx);
-               return 0;
-       }
-
-       ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
-       if (!ctx)
-               return -ENOMEM;
-
-       *ctx = (struct bm_aio_ctx) {
-               .device = device,
-               .in_flight = ATOMIC_INIT(1),
-               .done = 0,
-               .flags = BM_AIO_COPY_PAGES,
-               .error = 0,
-               .kref = { ATOMIC_INIT(2) },
-       };
-
-       if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
-               drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
-               kfree(ctx);
-               return -ENODEV;
-       }
-
-       bm_page_io_async(ctx, idx, WRITE_SYNC);
-       wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
-
-       if (ctx->error)
-               drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
-               /* that causes us to detach, so the in memory bitmap will be
-                * gone in a moment as well. */
-
-       device->bm_writ_cnt++;
-       err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;
-       kref_put(&ctx->kref, &bm_aio_ctx_destroy);
-       return err;
+       return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
 }
 
 /* NOTE
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
new file mode 100644 (file)
index 0000000..5c20b18
--- /dev/null
@@ -0,0 +1,958 @@
+#define pr_fmt(fmt) "drbd debugfs: " fmt
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+
+#include "drbd_int.h"
+#include "drbd_req.h"
+#include "drbd_debugfs.h"
+
+
+/**********************************************************************
+ * Whenever you change the file format, remember to bump the version. *
+ **********************************************************************/
+
+static struct dentry *drbd_debugfs_root;
+static struct dentry *drbd_debugfs_version;
+static struct dentry *drbd_debugfs_resources;
+static struct dentry *drbd_debugfs_minors;
+
+/* Emit one tab separated column: @dt (a time span in jiffies) converted to
+ * milliseconds if @valid, or a "-" placeholder otherwise. */
+static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long dt)
+{
+       /* jiffies_to_msecs() yields an unsigned int, so print it with %u */
+       if (valid)
+               seq_printf(m, "\t%u", jiffies_to_msecs(dt));
+       else
+               seq_puts(m, "\t-");
+}
+
+/* Print the name of one flag, preceded by the current separator *@sep.
+ * @set_name is used when @is_set is true, @unset_name when it is false;
+ * a NULL name means nothing is printed for that case.
+ * Whenever something was printed, *@sep is changed to '|'. */
+static void __seq_print_rq_state_bit(struct seq_file *m,
+       bool is_set, char *sep, const char *set_name, const char *unset_name)
+{
+       const char *name = is_set ? set_name : unset_name;
+
+       if (!name)
+               return;
+
+       seq_putc(m, *sep);
+       seq_puts(m, name);
+       *sep = '|';
+}
+
+/* Wrapper around __seq_print_rq_state_bit() for flags that are only
+ * reported when set: passes NULL as the name for the "not set" case. */
+static void seq_print_rq_state_bit(struct seq_file *m,
+       bool is_set, char *sep, const char *set_name)
+{
+       __seq_print_rq_state_bit(m, is_set, sep, set_name, NULL);
+}
+
+/* pretty print enum drbd_req_state_bits req->rq_state:
+ * the raw value in hex, whether the master bio is still pending, then the
+ * "local", "net" and expected-ack flag groups, terminated by a newline.
+ * If no flag was printed since the separator was last reset to ' ',
+ * " -" is printed instead. */
+static void seq_print_request_state(struct seq_file *m, struct drbd_request *req)
+{
+       const unsigned int state = req->rq_state;
+       char sep;
+
+       seq_printf(m, "\t0x%08x", state);
+       seq_printf(m, "\tmaster: %s", req->master_bio ? "pending" : "completed");
+
+       /* RQ_WRITE ignored, already reported */
+       seq_puts(m, "\tlocal:");
+       sep = ' ';
+       seq_print_rq_state_bit(m, state & RQ_IN_ACT_LOG, &sep, "in-AL");
+       seq_print_rq_state_bit(m, state & RQ_POSTPONED, &sep, "postponed");
+       seq_print_rq_state_bit(m, state & RQ_COMPLETION_SUSP, &sep, "suspended");
+
+       /* the local completion flags start over with a blank separator */
+       sep = ' ';
+       seq_print_rq_state_bit(m, state & RQ_LOCAL_PENDING, &sep, "pending");
+       seq_print_rq_state_bit(m, state & RQ_LOCAL_COMPLETED, &sep, "completed");
+       seq_print_rq_state_bit(m, state & RQ_LOCAL_ABORTED, &sep, "aborted");
+       seq_print_rq_state_bit(m, state & RQ_LOCAL_OK, &sep, "ok");
+       if (sep == ' ')
+               seq_puts(m, " -");
+
+       /* for_each_connection ... */
+       seq_puts(m, "\tnet:");
+       sep = ' ';
+       seq_print_rq_state_bit(m, state & RQ_NET_PENDING, &sep, "pending");
+       seq_print_rq_state_bit(m, state & RQ_NET_QUEUED, &sep, "queued");
+       seq_print_rq_state_bit(m, state & RQ_NET_SENT, &sep, "sent");
+       seq_print_rq_state_bit(m, state & RQ_NET_DONE, &sep, "done");
+       seq_print_rq_state_bit(m, state & RQ_NET_SIS, &sep, "sis");
+       seq_print_rq_state_bit(m, state & RQ_NET_OK, &sep, "ok");
+       if (sep == ' ')
+               seq_puts(m, " -");
+
+       /* which kind of acknowledgement this request expects */
+       seq_puts(m, " :");
+       sep = ' ';
+       seq_print_rq_state_bit(m, state & RQ_EXP_RECEIVE_ACK, &sep, "B");
+       seq_print_rq_state_bit(m, state & RQ_EXP_WRITE_ACK, &sep, "C");
+       seq_print_rq_state_bit(m, state & RQ_EXP_BARR_ACK, &sep, "barr");
+       if (sep == ' ')
+               seq_puts(m, " -");
+       seq_putc(m, '\n');
+}
+
+/* Print one request as a single tab separated line (the trailing newline is
+ * written by seq_print_request_state()).
+ * @now is the reference time in jiffies; all ages are printed in
+ * milliseconds relative to it.
+ * The RQ_HDR_* macros, defined next to the code emitting each column group,
+ * make up the matching header line RQ_HDR. */
+static void seq_print_one_request(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+       /* change anything here, fixup header below! */
+       unsigned int s = req->rq_state;
+
+#define RQ_HDR_1 "epoch\tsector\tsize\trw"
+       /* size is shifted right by 9 before printing */
+       seq_printf(m, "0x%x\t%llu\t%u\t%s",
+               req->epoch,
+               (unsigned long long)req->i.sector, req->i.size >> 9,
+               (s & RQ_WRITE) ? "W" : "R");
+
+#define RQ_HDR_2 "\tstart\tin AL\tsubmit"
+       /* jiffies_to_msecs() yields an unsigned int: print it with %u */
+       seq_printf(m, "\t%u", jiffies_to_msecs(now - req->start_jif));
+       seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif);
+       seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif);
+
+#define RQ_HDR_3 "\tsent\tacked\tdone"
+       seq_print_age_or_dash(m, s & RQ_NET_SENT, now - req->pre_send_jif);
+       /* "acked" is only shown once sent and no longer pending on the net */
+       seq_print_age_or_dash(m, (s & RQ_NET_SENT) && !(s & RQ_NET_PENDING), now - req->acked_jif);
+       seq_print_age_or_dash(m, s & RQ_NET_DONE, now - req->net_done_jif);
+
+#define RQ_HDR_4 "\tstate\n"
+       seq_print_request_state(m, req);
+}
+#define RQ_HDR RQ_HDR_1 RQ_HDR_2 RQ_HDR_3 RQ_HDR_4
+
+/* Like seq_print_one_request(), but prefix the line with the minor and
+ * volume number of the device the request belongs to. */
+static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+       seq_printf(m, "%u\t%u\t", req->device->minor, req->device->vnr);
+       seq_print_one_request(m, req, now);
+}
+
+/* For every device of @resource whose meta data IO buffer is marked in use,
+ * print minor, vnr, age since start_jif, age since submit_jif (or "-" if
+ * submit_jif is before start_jif) and the current_use string.
+ * Ages are in milliseconds relative to @now (jiffies). */
+static void seq_print_resource_pending_meta_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+       struct drbd_device *device;
+       unsigned int i;
+
+       seq_puts(m, "minor\tvnr\tstart\tsubmit\tintent\n");
+       rcu_read_lock();
+       idr_for_each_entry(&resource->devices, device, i) {
+               struct drbd_md_io tmp;
+               /* In theory this is racy,
+                * in the sense that there could have been a
+                * drbd_md_put_buffer(); drbd_md_get_buffer();
+                * between accessing these members here.  */
+               tmp = device->md_io;
+               if (atomic_read(&tmp.in_use)) {
+                       /* jiffies_to_msecs() yields an unsigned int: use %u */
+                       seq_printf(m, "%u\t%u\t%u\t",
+                               device->minor, device->vnr,
+                               jiffies_to_msecs(now - tmp.start_jif));
+                       if (time_before(tmp.submit_jif, tmp.start_jif))
+                               seq_puts(m, "-\t");
+                       else
+                               seq_printf(m, "%u\t", jiffies_to_msecs(now - tmp.submit_jif));
+                       seq_printf(m, "%s\n", tmp.current_use);
+               }
+       }
+       rcu_read_unlock();
+}
+
+/* For every device of @resource with a non-zero ap_actlog_cnt, print minor,
+ * vnr, the age in milliseconds of the first request on
+ * pending_master_completion[1] if that request does not have RQ_IN_ACT_LOG
+ * set ("-" otherwise), and the counter value itself. */
+static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+       struct drbd_device *device;
+       unsigned int i;
+
+       seq_puts(m, "minor\tvnr\tage\t#waiting\n");
+       rcu_read_lock();
+       idr_for_each_entry(&resource->devices, device, i) {
+               struct drbd_request *oldest;
+               unsigned long start_jif = 0;
+               int n = atomic_read(&device->ap_actlog_cnt);
+
+               /* nothing is waiting on this device */
+               if (!n)
+                       continue;
+
+               spin_lock_irq(&device->resource->req_lock);
+               oldest = list_first_entry_or_null(&device->pending_master_completion[1],
+                       struct drbd_request, req_pending_master_completion);
+               /* if the oldest request does not wait for the activity log
+                * it is not interesting for us here */
+               if (oldest && (oldest->rq_state & RQ_IN_ACT_LOG))
+                       oldest = NULL;
+               /* copy the timestamp while still holding the lock */
+               if (oldest)
+                       start_jif = oldest->start_jif;
+               spin_unlock_irq(&device->resource->req_lock);
+
+               seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+               if (oldest)
+                       seq_printf(m, "%u\t", jiffies_to_msecs(now - start_jif));
+               else
+                       seq_puts(m, "-\t");
+               seq_printf(m, "%u\n", n);
+       }
+       rcu_read_unlock();
+}
+
+/* Print one line describing the first entry of @device's pending_bitmap_io
+ * list, unless the list is empty or that entry is already marked done:
+ * minor, vnr, 'R' or 'W' (by BM_AIO_READ), age in ms, in-flight count. */
+static void seq_print_device_bitmap_io(struct seq_file *m, struct drbd_device *device, unsigned long now)
+{
+       struct drbd_bm_aio_ctx *ctx;
+       unsigned long start_jif = 0;
+       unsigned int in_flight = 0;
+       unsigned int flags = 0;
+       bool pending;
+
+       /* take a snapshot under the lock, print after dropping it */
+       spin_lock_irq(&device->resource->req_lock);
+       ctx = list_first_entry_or_null(&device->pending_bitmap_io, struct drbd_bm_aio_ctx, list);
+       pending = ctx && !ctx->done;
+       if (pending) {
+               start_jif = ctx->start_jif;
+               in_flight = atomic_read(&ctx->in_flight);
+               flags = ctx->flags;
+       }
+       spin_unlock_irq(&device->resource->req_lock);
+
+       if (!pending)
+               return;
+
+       seq_printf(m, "%u\t%u\t%c\t%u\t%u\n",
+               device->minor, device->vnr,
+               (flags & BM_AIO_READ) ? 'R' : 'W',
+               jiffies_to_msecs(now - start_jif),
+               in_flight);
+}
+
+/* Print the column header, then call seq_print_device_bitmap_io() for every
+ * device of @resource.  The device idr is walked under rcu_read_lock(). */
+static void seq_print_resource_pending_bitmap_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+       struct drbd_device *device;
+       unsigned int i;
+
+       seq_puts(m, "minor\tvnr\trw\tage\t#in-flight\n");
+       rcu_read_lock();
+       idr_for_each_entry(&resource->devices, device, i) {
+               seq_print_device_bitmap_io(m, device, now);
+       }
+       rcu_read_unlock();
+}
+
+/* pretty print enum peer_req->flags
+ * as a '|' separated list (the first entry is preceded by a blank),
+ * terminated by a newline. */
+static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_request *peer_req)
+{
+       const unsigned long flags = peer_req->flags;
+       char sep = ' ';
+
+       /* these two always print one of their two names */
+       __seq_print_rq_state_bit(m, flags & EE_SUBMITTED, &sep, "submitted", "preparing");
+       __seq_print_rq_state_bit(m, flags & EE_APPLICATION, &sep, "application", "internal");
+       /* the rest is only printed when set */
+       seq_print_rq_state_bit(m, flags & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL");
+       seq_print_rq_state_bit(m, flags & EE_SEND_WRITE_ACK, &sep, "C");
+       seq_print_rq_state_bit(m, flags & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
+
+       if (flags & EE_IS_TRIM) {
+               seq_putc(m, sep);
+               seq_puts(m, (flags & EE_IS_TRIM_USE_ZEROOUT) ? "zero-out" : "trim");
+       }
+       seq_putc(m, '\n');
+}
+
+/* Walk the peer request list @lh and print a summary of it:
+ * at most one request without EE_SUBMITTED (the first one encountered),
+ * and the first request with EE_SUBMITTED, after which the walk stops.
+ * If @device is not NULL, each line is prefixed with its minor and vnr. */
+static void seq_print_peer_request(struct seq_file *m,
+       struct drbd_device *device, struct list_head *lh,
+       unsigned long now)
+{
+       struct drbd_peer_request *peer_req;
+       bool reported_preparing = false;
+
+       list_for_each_entry(peer_req, lh, w.list) {
+               const bool submitted = peer_req->flags & EE_SUBMITTED;
+
+               /* one not yet submitted request is enough */
+               if (!submitted && reported_preparing)
+                       continue;
+
+               if (device)
+                       seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+
+               seq_printf(m, "%llu\t%u\t%c\t%u\t",
+                       (unsigned long long)peer_req->i.sector, peer_req->i.size >> 9,
+                       (peer_req->flags & EE_WRITE) ? 'W' : 'R',
+                       jiffies_to_msecs(now - peer_req->submit_jif));
+               seq_print_peer_request_flags(m, peer_req);
+
+               if (submitted)
+                       break;
+               reported_preparing = true;
+       }
+}
+
+/* Print the column header and a summary of @device's active_ee, read_ee and
+ * sync_ee peer request lists (walked under the resource's req_lock), plus
+ * one extra "flush" line if FLUSH_PENDING is set in the device flags. */
+static void seq_print_device_peer_requests(struct seq_file *m,
+       struct drbd_device *device, unsigned long now)
+{
+       seq_puts(m, "minor\tvnr\tsector\tsize\trw\tage\tflags\n");
+       spin_lock_irq(&device->resource->req_lock);
+       seq_print_peer_request(m, device, &device->active_ee, now);
+       seq_print_peer_request(m, device, &device->read_ee, now);
+       seq_print_peer_request(m, device, &device->sync_ee, now);
+       spin_unlock_irq(&device->resource->req_lock);
+       if (test_bit(FLUSH_PENDING, &device->flags)) {
+               /* sector and size columns are "-", rw is 'F', age is since flush_jif */
+               seq_printf(m, "%u\t%u\t-\t-\tF\t%u\tflush\n",
+                       device->minor, device->vnr,
+                       jiffies_to_msecs(now - device->flush_jif));
+       }
+}
+
+/* Call seq_print_device_peer_requests() for every device of @resource.
+ * The device idr is walked under rcu_read_lock(). */
+static void seq_print_resource_pending_peer_requests(struct seq_file *m,
+       struct drbd_resource *resource, unsigned long now)
+{
+       struct drbd_device *device;
+       unsigned int i;
+
+       rcu_read_lock();
+       idr_for_each_entry(&resource->devices, device, i) {
+               seq_print_device_peer_requests(m, device, now);
+       }
+       rcu_read_unlock();
+}
+
+/* Print a summary of @connection's transfer log.
+ * Each request is classified by a small bit mask (tmp) built from its
+ * master_bio and rq_state; a request is only printed if it contributes at
+ * least one bit not yet seen in show_state.  The walk stops early once all
+ * five bits (0x1f) have been seen.
+ * Each printed line starts with the position of the request in the log. */
+static void seq_print_resource_transfer_log_summary(struct seq_file *m,
+       struct drbd_resource *resource,
+       struct drbd_connection *connection,
+       unsigned long now)
+{
+       struct drbd_request *req;
+       unsigned int count = 0;
+       unsigned int show_state = 0;
+
+       seq_puts(m, "n\tdevice\tvnr\t" RQ_HDR);
+       spin_lock_irq(&resource->req_lock);
+       list_for_each_entry(req, &connection->transfer_log, tl_requests) {
+               unsigned int tmp = 0;
+               unsigned int s;
+               ++count;
+
+               /* don't disable irq "forever" */
+               if (!(count & 0x1ff)) {
+                       /* Every 512 entries: hold a reference on the current
+                        * request, drop the lock, give others a chance to run,
+                        * then re-take the lock.  If kref_put() reports that
+                        * our reference was the last one, continue with the
+                        * successor that was looked up before the put. */
+                       struct drbd_request *req_next;
+                       kref_get(&req->kref);
+                       spin_unlock_irq(&resource->req_lock);
+                       cond_resched();
+                       spin_lock_irq(&resource->req_lock);
+                       req_next = list_next_entry(req, tl_requests);
+                       if (kref_put(&req->kref, drbd_req_destroy))
+                               req = req_next;
+                       /* reached the list head: nothing left to look at */
+                       if (&req->tl_requests == &connection->transfer_log)
+                               break;
+               }
+
+               s = req->rq_state;
+
+               /* This is meant to summarize timing issues, to be able to tell
+                * local disk problems from network problems.
+                * Skip requests, if we have shown an even older request with
+                * similar aspects already.  */
+               if (req->master_bio == NULL)
+                       tmp |= 1;
+               if ((s & RQ_LOCAL_MASK) && (s & RQ_LOCAL_PENDING))
+                       tmp |= 2;
+               if (s & RQ_NET_MASK) {
+                       if (!(s & RQ_NET_SENT))
+                               tmp |= 4;
+                       if (s & RQ_NET_PENDING)
+                               tmp |= 8;
+                       if (!(s & RQ_NET_DONE))
+                               tmp |= 16;
+               }
+               /* all aspects of this request were already shown */
+               if ((tmp & show_state) == tmp)
+                       continue;
+               show_state |= tmp;
+               seq_printf(m, "%u\t", count);
+               seq_print_minor_vnr_req(m, req, now);
+               if (show_state == 0x1f)
+                       break;
+       }
+       spin_unlock_irq(&resource->req_lock);
+}
+
+/* TODO: transfer_log and friends should be moved to resource */
+/* seq_file show callback for the per resource "in_flight_summary" file.
+ * m->private is the struct drbd_resource.  Prints, in this order: format
+ * version, oldest bitmap IO, meta data IO, socket buffer stats, oldest peer
+ * requests, requests waiting for the activity log, and a transfer log
+ * summary.  A reference on the first connection is held for the duration.
+ * Returns 0, or -ESTALE if no connection reference could be obtained. */
+static int in_flight_summary_show(struct seq_file *m, void *pos)
+{
+       struct drbd_resource *resource = m->private;
+       struct drbd_connection *connection;
+       unsigned long jif = jiffies;
+
+       connection = first_connection(resource);
+       /* This does not happen, actually.
+        * But be robust and prepare for future code changes. */
+       if (!connection || !kref_get_unless_zero(&connection->kref))
+               return -ESTALE;
+
+       /* BUMP me if you change the file format/content/presentation */
+       seq_printf(m, "v: %u\n\n", 0);
+
+       seq_puts(m, "oldest bitmap IO\n");
+       seq_print_resource_pending_bitmap_io(m, resource, jif);
+       seq_putc(m, '\n');
+
+       seq_puts(m, "meta data IO\n");
+       seq_print_resource_pending_meta_io(m, resource, jif);
+       seq_putc(m, '\n');
+
+       seq_puts(m, "socket buffer stats\n");
+       /* for each connection ... once we have more than one */
+       rcu_read_lock();
+       if (connection->data.socket) {
+               /* open coded SIOCINQ, the "relevant" part */
+               struct tcp_sock *tp = tcp_sk(connection->data.socket->sk);
+               /* printed with %u below, so keep the difference unsigned */
+               unsigned int answ = tp->rcv_nxt - tp->copied_seq;
+               seq_printf(m, "unread receive buffer: %u Byte\n", answ);
+               /* open coded SIOCOUTQ, the "relevant" part */
+               answ = tp->write_seq - tp->snd_una;
+               seq_printf(m, "unacked send buffer: %u Byte\n", answ);
+       }
+       rcu_read_unlock();
+       seq_putc(m, '\n');
+
+       seq_puts(m, "oldest peer requests\n");
+       seq_print_resource_pending_peer_requests(m, resource, jif);
+       seq_putc(m, '\n');
+
+       seq_puts(m, "application requests waiting for activity log\n");
+       seq_print_waiting_for_AL(m, resource, jif);
+       seq_putc(m, '\n');
+
+       seq_puts(m, "oldest application requests\n");
+       seq_print_resource_transfer_log_summary(m, resource, connection, jif);
+       seq_putc(m, '\n');
+
+       /* report how long generating this output took, if measurable */
+       jif = jiffies - jif;
+       if (jif)
+               seq_printf(m, "generated in %u ms\n", jiffies_to_msecs(jif));
+       kref_put(&connection->kref, drbd_destroy_connection);
+       return 0;
+}
+
+/* simple_positive(file->f_dentry) respectively debugfs_positive(),
+ * but neither is "reachable" from here.
+ * So we have our own inline version of it right here.  :-(
+ * Returns non-zero if @dentry has an inode and is not unhashed. */
+static inline int debugfs_positive(struct dentry *dentry)
+{
+        return dentry->d_inode && !d_unhashed(dentry);
+}
+
+/* make sure at *open* time that the respective object won't go away.
+ *
+ * With the parent directory's i_mutex held, check that the dentry of @file
+ * is still positive and try to take a reference on @kref.  Only if both
+ * succeed is single_open() called with @show and @data; if single_open()
+ * fails, the reference is dropped again via @release.
+ * On success the reference stays held; the release handlers in this file
+ * drop it with kref_put().
+ * Returns 0 on success, -ESTALE if the object is gone or the dentry was
+ * removed, or the error returned by single_open(). */
+static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *),
+                               void *data, struct kref *kref,
+                               void (*release)(struct kref *))
+{
+       struct dentry *parent;
+       int ret = -ESTALE;
+
+       /* Are we still linked,
+        * or has debugfs_remove() already been called? */
+       parent = file->f_dentry->d_parent;
+       /* not sure if this can happen: */
+       if (!parent || !parent->d_inode)
+               goto out;
+       /* serialize with d_delete() */
+       mutex_lock(&parent->d_inode->i_mutex);
+       /* Make sure the object is still alive */
+       if (debugfs_positive(file->f_dentry)
+       && kref_get_unless_zero(kref))
+               ret = 0;
+       mutex_unlock(&parent->d_inode->i_mutex);
+       if (!ret) {
+               ret = single_open(file, show, data);
+               /* do not leak the reference if the seq_file setup failed */
+               if (ret)
+                       kref_put(kref, release);
+       }
+out:
+       return ret;
+}
+
+/* open handler: inode->i_private is the resource; drbd_single_open() takes
+ * a reference on resource->kref if the open succeeds. */
+static int in_flight_summary_open(struct inode *inode, struct file *file)
+{
+       struct drbd_resource *resource = inode->i_private;
+       return drbd_single_open(file, in_flight_summary_show, resource,
+                               &resource->kref, drbd_destroy_resource);
+}
+
+/* release handler: drop the resource reference taken at open time, then
+ * let single_release() clean up the seq_file. */
+static int in_flight_summary_release(struct inode *inode, struct file *file)
+{
+       struct drbd_resource *resource = inode->i_private;
+       kref_put(&resource->kref, drbd_destroy_resource);
+       return single_release(inode, file);
+}
+
+/* file operations of the per resource "in_flight_summary" debugfs file */
+static const struct file_operations in_flight_summary_fops = {
+       .owner          = THIS_MODULE,
+       .open           = in_flight_summary_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = in_flight_summary_release,
+};
+
+/* Create the debugfs entries of @resource below drbd_debugfs_resources:
+ * a directory named after the resource, containing the "volumes" and
+ * "connections" directories and the "in_flight_summary" file.
+ * Does nothing if drbd_debugfs_resources is not set.  If any entry cannot
+ * be created, everything created so far is removed again and an error is
+ * logged; no error is returned to the caller. */
+void drbd_debugfs_resource_add(struct drbd_resource *resource)
+{
+       struct dentry *dentry;
+       if (!drbd_debugfs_resources)
+               return;
+
+       dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       resource->debugfs_res = dentry;
+
+       dentry = debugfs_create_dir("volumes", resource->debugfs_res);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       resource->debugfs_res_volumes = dentry;
+
+       dentry = debugfs_create_dir("connections", resource->debugfs_res);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       resource->debugfs_res_connections = dentry;
+
+       /* readable by owner and group only */
+       dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP,
+                       resource->debugfs_res, resource,
+                       &in_flight_summary_fops);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       resource->debugfs_res_in_flight_summary = dentry;
+       return;
+
+fail:
+       drbd_debugfs_resource_cleanup(resource);
+       drbd_err(resource, "failed to create debugfs dentry\n");
+}
+
+/* Remove the debugfs entry *@dp and reset the pointer to NULL, so the
+ * stale dentry pointer cannot be used (or removed) a second time. */
+static void drbd_debugfs_remove(struct dentry **dp)
+{
+       debugfs_remove(*dp);
+       *dp = NULL;
+}
+
+/* Remove all debugfs entries of @resource, the file and sub directories
+ * first, the resource directory itself last.  Entries that were never
+ * created (NULL pointers) are fine. */
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource)
+{
+       /* it is ok to call debugfs_remove(NULL) */
+       drbd_debugfs_remove(&resource->debugfs_res_in_flight_summary);
+       drbd_debugfs_remove(&resource->debugfs_res_connections);
+       drbd_debugfs_remove(&resource->debugfs_res_volumes);
+       drbd_debugfs_remove(&resource->debugfs_res);
+}
+
+/* Print one callback timing record: sequence number, age in milliseconds
+ * relative to @now, call site (function:line) and the callback symbol.
+ * Records without a callback address are skipped. */
+static void seq_print_one_timing_detail(struct seq_file *m,
+       const struct drbd_thread_timing_details *tdp,
+       unsigned long now)
+{
+       struct drbd_thread_timing_details td;
+       /* No locking...
+        * use temporary assignment to get at consistent data. */
+       do {
+               td = *tdp;
+       } while (td.cb_nr != tdp->cb_nr);
+       if (!td.cb_addr)
+               return;
+       /* jiffies_to_msecs() yields an unsigned int: print it with %u */
+       seq_printf(m, "%u\t%u\t%s:%u\t%ps\n",
+                       td.cb_nr,
+                       jiffies_to_msecs(now - td.start_jif),
+                       td.caller_fn, td.line,
+                       td.cb_addr);
+}
+
+/* Print @title on its own line, followed by all DRBD_THREAD_DETAILS_HIST
+ * timing records of the array @tdp.  The array is treated as a ring:
+ * printing starts at slot @cb_nr % DRBD_THREAD_DETAILS_HIST and wraps
+ * around to the slot just before it. */
+static void seq_print_timing_details(struct seq_file *m,
+               const char *title,
+               unsigned int cb_nr, struct drbd_thread_timing_details *tdp, unsigned long now)
+{
+       const unsigned int first = cb_nr % DRBD_THREAD_DETAILS_HIST;
+       unsigned int n;
+
+       seq_printf(m, "%s\n", title);
+       /* If not much is going on, this will result in natural ordering.
+        * If it is very busy, we will possibly skip events, or even see wrap
+        * arounds, which could only be avoided with locking.
+        */
+       for (n = 0; n < DRBD_THREAD_DETAILS_HIST; n++) {
+               unsigned int slot = (first + n) % DRBD_THREAD_DETAILS_HIST;
+
+               seq_print_one_timing_detail(m, &tdp[slot], now);
+       }
+}
+
+/* seq_file show callback for the per connection "callback_history" file.
+ * m->private is the struct drbd_connection.  Prints the format version, a
+ * column header, and the timing records of the "worker" (w_*) and
+ * "receiver" (r_*) arrays.  Always returns 0. */
+static int callback_history_show(struct seq_file *m, void *ignored)
+{
+       struct drbd_connection *connection = m->private;
+       unsigned long jif = jiffies;
+
+       /* BUMP me if you change the file format/content/presentation */
+       seq_printf(m, "v: %u\n\n", 0);
+
+       seq_puts(m, "n\tage\tcallsite\tfn\n");
+       seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif);
+       seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif);
+       return 0;
+}
+
+/* open handler: inode->i_private is the connection; drbd_single_open()
+ * takes a reference on connection->kref if the open succeeds. */
+static int callback_history_open(struct inode *inode, struct file *file)
+{
+       struct drbd_connection *connection = inode->i_private;
+       return drbd_single_open(file, callback_history_show, connection,
+                               &connection->kref, drbd_destroy_connection);
+}
+
+/* release handler: drop the connection reference taken at open time, then
+ * let single_release() clean up the seq_file. */
+static int callback_history_release(struct inode *inode, struct file *file)
+{
+       struct drbd_connection *connection = inode->i_private;
+       kref_put(&connection->kref, drbd_destroy_connection);
+       return single_release(inode, file);
+}
+
+/* file operations of the per connection "callback_history" debugfs file */
+static const struct file_operations connection_callback_history_fops = {
+       .owner          = THIS_MODULE,
+       .open           = callback_history_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = callback_history_release,
+};
+
+/* seq_file show callback for the per connection "oldest_requests" file.
+ * m->private is the struct drbd_connection.  Under the resource's req_lock,
+ * prints the requests the connection's req_next, req_ack_pending and
+ * req_not_net_done pointers refer to.  A request is skipped if its pointer
+ * is NULL or equal to the one r1 currently holds (see below), to avoid
+ * printing the same request twice in a row.  Always returns 0. */
+static int connection_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+       struct drbd_connection *connection = m->private;
+       unsigned long now = jiffies;
+       struct drbd_request *r1, *r2;
+
+       /* BUMP me if you change the file format/content/presentation */
+       seq_printf(m, "v: %u\n\n", 0);
+
+       spin_lock_irq(&connection->resource->req_lock);
+       r1 = connection->req_next;
+       if (r1)
+               seq_print_minor_vnr_req(m, r1, now);
+       r2 = connection->req_ack_pending;
+       if (r2 && r2 != r1) {
+               /* remember it as the most recently printed request */
+               r1 = r2;
+               seq_print_minor_vnr_req(m, r1, now);
+       }
+       r2 = connection->req_not_net_done;
+       if (r2 && r2 != r1)
+               seq_print_minor_vnr_req(m, r2, now);
+       spin_unlock_irq(&connection->resource->req_lock);
+       return 0;
+}
+
+/* open handler: inode->i_private is the connection; drbd_single_open()
+ * takes a reference on connection->kref if the open succeeds. */
+static int connection_oldest_requests_open(struct inode *inode, struct file *file)
+{
+       struct drbd_connection *connection = inode->i_private;
+       return drbd_single_open(file, connection_oldest_requests_show, connection,
+                               &connection->kref, drbd_destroy_connection);
+}
+
+/* release handler: drop the connection reference taken at open time, then
+ * let single_release() clean up the seq_file. */
+static int connection_oldest_requests_release(struct inode *inode, struct file *file)
+{
+       struct drbd_connection *connection = inode->i_private;
+       kref_put(&connection->kref, drbd_destroy_connection);
+       return single_release(inode, file);
+}
+
+/* file operations of the per connection "oldest_requests" debugfs file */
+static const struct file_operations connection_oldest_requests_fops = {
+       .owner          = THIS_MODULE,
+       .open           = connection_oldest_requests_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = connection_oldest_requests_release,
+};
+
+/* Create the debugfs entries of @connection below the resource's
+ * "connections" directory: a "peer" directory holding the
+ * "callback_history" and "oldest_requests" files.
+ * Does nothing if the resource has no "connections" directory.  If any
+ * entry cannot be created, everything created so far is removed again and
+ * an error is logged; no error is returned to the caller. */
+void drbd_debugfs_connection_add(struct drbd_connection *connection)
+{
+       struct dentry *conns_dir = connection->resource->debugfs_res_connections;
+       struct dentry *dentry;
+       if (!conns_dir)
+               return;
+
+       /* Once we enable multiple peers,
+        * these connections will have descriptive names.
+        * For now, it is just the one connection to the (only) "peer". */
+       dentry = debugfs_create_dir("peer", conns_dir);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       connection->debugfs_conn = dentry;
+
+       dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP,
+                       connection->debugfs_conn, connection,
+                       &connection_callback_history_fops);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       connection->debugfs_conn_callback_history = dentry;
+
+       dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP,
+                       connection->debugfs_conn, connection,
+                       &connection_oldest_requests_fops);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       connection->debugfs_conn_oldest_requests = dentry;
+       return;
+
+fail:
+       drbd_debugfs_connection_cleanup(connection);
+       drbd_err(connection, "failed to create debugfs dentry\n");
+}
+
+/* Remove all debugfs entries of @connection: both files first, then the
+ * connection directory.  Each dentry pointer is reset to NULL. */
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection)
+{
+       drbd_debugfs_remove(&connection->debugfs_conn_callback_history);
+       drbd_debugfs_remove(&connection->debugfs_conn_oldest_requests);
+       drbd_debugfs_remove(&connection->debugfs_conn);
+}
+
+/* Detail callback handed to lc_seq_dump_details() for the resync LRU:
+ * print rs_left and the BME_NO_WRITES, BME_LOCKED and BME_PRIORITY flags of
+ * the bm_extent embedding @e; a flag that is not set is shown as dashes. */
+static void resync_dump_detail(struct seq_file *m, struct lc_element *e)
+{
+       struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
+       const char *no_writes = "---------";
+       const char *locked = "------";
+       const char *priority = "--------";
+
+       if (test_bit(BME_NO_WRITES, &bme->flags))
+               no_writes = "NO_WRITES";
+       if (test_bit(BME_LOCKED, &bme->flags))
+               locked = "LOCKED";
+       if (test_bit(BME_PRIORITY, &bme->flags))
+               priority = "PRIORITY";
+
+       seq_printf(m, "%5d %s %s %s\n", bme->rs_left,
+                 no_writes, locked, priority);
+}
+
+/* seq_file show callback for the device's resync extents.
+ * m->private is the struct drbd_device.  After the format version, the
+ * statistics and per element details of device->resync are printed, but
+ * only if get_ldev_if_state(device, D_FAILED) succeeds.  Always returns 0. */
+static int device_resync_extents_show(struct seq_file *m, void *ignored)
+{
+       struct drbd_device *device = m->private;
+
+       /* BUMP me if you change the file format/content/presentation */
+       seq_printf(m, "v: %u\n\n", 0);
+
+       /* without a local disk reference there is nothing more to show */
+       if (!get_ldev_if_state(device, D_FAILED))
+               return 0;
+
+       lc_seq_printf_stats(m, device->resync);
+       lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail);
+       put_ldev(device);
+       return 0;
+}
+
+/* seq_file show callback for the device's activity log extents.
+ * m->private is the struct drbd_device.  After the format version, the
+ * statistics and elements of device->act_log are printed (no per element
+ * detail callback), but only if get_ldev_if_state(device, D_FAILED)
+ * succeeds.  Always returns 0. */
+static int device_act_log_extents_show(struct seq_file *m, void *ignored)
+{
+       struct drbd_device *device = m->private;
+
+       /* BUMP me if you change the file format/content/presentation */
+       seq_printf(m, "v: %u\n\n", 0);
+
+       /* without a local disk reference there is nothing more to show */
+       if (!get_ldev_if_state(device, D_FAILED))
+               return 0;
+
+       lc_seq_printf_stats(m, device->act_log);
+       lc_seq_dump_details(m, device->act_log, "", NULL);
+       put_ldev(device);
+       return 0;
+}
+
+/* seq_file show callback for the per device "oldest_requests" file.
+ * m->private is the struct drbd_device.  Prints the format version and the
+ * RQ_HDR header, then, for list index 1 and then 0, the first entry of
+ * pending_master_completion[] and the first entry of pending_completion[]
+ * (the latter only if it is a different request).  The lists are read
+ * under the resource's req_lock.  Always returns 0. */
+static int device_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+       struct drbd_device *device = m->private;
+       struct drbd_resource *resource = device->resource;
+       unsigned long now = jiffies;
+       struct drbd_request *r1, *r2;
+       int i;
+
+       /* BUMP me if you change the file format/content/presentation */
+       seq_printf(m, "v: %u\n\n", 0);
+
+       seq_puts(m, RQ_HDR);
+       spin_lock_irq(&resource->req_lock);
+       /* WRITE, then READ */
+       for (i = 1; i >= 0; --i) {
+               r1 = list_first_entry_or_null(&device->pending_master_completion[i],
+                       struct drbd_request, req_pending_master_completion);
+               r2 = list_first_entry_or_null(&device->pending_completion[i],
+                       struct drbd_request, req_pending_local);
+               if (r1)
+                       seq_print_one_request(m, r1, now);
+               /* do not print the same request twice */
+               if (r2 && r2 != r1)
+                       seq_print_one_request(m, r2, now);
+       }
+       spin_unlock_irq(&resource->req_lock);
+       return 0;
+}
+
+/* seq_file show callback printing the device's UUID set.
+ * m->private is the struct drbd_device.  Prints md->uuid[UI_CURRENT] up to
+ * and including md->uuid[UI_HISTORY_END], one 64 bit hex value per line,
+ * read under md->uuid_lock.
+ * Returns 0, or -ENODEV if get_ldev_if_state(device, D_FAILED) fails. */
+static int device_data_gen_id_show(struct seq_file *m, void *ignored)
+{
+       struct drbd_device *device = m->private;
+       struct drbd_md *md;
+       enum drbd_uuid_index idx;
+
+       if (!get_ldev_if_state(device, D_FAILED))
+               return -ENODEV;
+
+       md = &device->ldev->md;
+       spin_lock_irq(&md->uuid_lock);
+       for (idx = UI_CURRENT; idx <= UI_HISTORY_END; idx++) {
+               seq_printf(m, "0x%016llX\n", md->uuid[idx]);
+       }
+       spin_unlock_irq(&md->uuid_lock);
+       /* matches the successful get_ldev_if_state() above */
+       put_ldev(device);
+       return 0;
+}
+
+/* Generate the file_operations plumbing for one per-device debugfs file.
+ * For a given name this expands to device_<name>_open(),
+ * device_<name>_release() and the device_<name>_fops table that
+ * drbd_debugfs_device_add() hands to debugfs_create_file().
+ * open passes device_<name>_show(), the device, &device->kref and
+ * drbd_destroy_device to drbd_single_open(); release drops one reference
+ * on device->kref before calling single_release() (presumably the
+ * reference taken in drbd_single_open() -- confirm there). */
+#define drbd_debugfs_device_attr(name)                                         \
+static int device_ ## name ## _open(struct inode *inode, struct file *file)    \
+{                                                                              \
+       struct drbd_device *device = inode->i_private;                          \
+       return drbd_single_open(file, device_ ## name ## _show, device,         \
+                               &device->kref, drbd_destroy_device);            \
+}                                                                              \
+static int device_ ## name ## _release(struct inode *inode, struct file *file) \
+{                                                                              \
+       struct drbd_device *device = inode->i_private;                          \
+       kref_put(&device->kref, drbd_destroy_device);                           \
+       return single_release(inode, file);                                     \
+}                                                                              \
+static const struct file_operations device_ ## name ## _fops = {               \
+       .owner          = THIS_MODULE,                                          \
+       .open           = device_ ## name ## _open,                             \
+       .read           = seq_read,                                             \
+       .llseek         = seq_lseek,                                            \
+       .release        = device_ ## name ## _release,                          \
+};
+
+drbd_debugfs_device_attr(oldest_requests)
+drbd_debugfs_device_attr(act_log_extents)
+drbd_debugfs_device_attr(resync_extents)
+drbd_debugfs_device_attr(data_gen_id)
+
+void drbd_debugfs_device_add(struct drbd_device *device)
+{
+       struct dentry *vols_dir = device->resource->debugfs_res_volumes;
+       struct dentry *dentry;
+       char *slink_name;
+       char minor_buf[8]; /* MINORMASK, MINORBITS == 20; */
+       char vnr_buf[8];   /* volume number vnr is even 16 bit only; */
+
+       /* nothing to attach to unless both parent directories exist */
+       if (!(vols_dir && drbd_debugfs_minors))
+               return;
+
+       /* per-volume directory, named after the volume number, below the
+        * resource's volumes directory */
+       snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr);
+       dentry = debugfs_create_dir(vnr_buf, vols_dir);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       device->debugfs_vol = dentry;
+
+       /* symlink named after the minor number in the minors directory;
+        * the target string is only needed while creating the link */
+       slink_name = kasprintf(GFP_KERNEL, "../resources/%s/volumes/%u",
+                       device->resource->name, device->vnr);
+       if (!slink_name)
+               goto fail;
+       snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor);
+       dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name);
+       kfree(slink_name);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       device->debugfs_minor = dentry;
+
+       /* create one attribute file in the volume directory and remember
+        * its dentry in device->debugfs_vol_<name> */
+#define VOL_FILE(name) do {                                    \
+       dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP,    \
+                       device->debugfs_vol, device,            \
+                       &device_ ## name ## _fops);             \
+       if (IS_ERR_OR_NULL(dentry))                             \
+               goto fail;                                      \
+       device->debugfs_vol_ ## name = dentry;                  \
+       } while (0)
+
+       VOL_FILE(oldest_requests);
+       VOL_FILE(act_log_extents);
+       VOL_FILE(resync_extents);
+       VOL_FILE(data_gen_id);
+#undef VOL_FILE
+       return;
+
+fail:
+       /* undo whatever was created so far, then report */
+       drbd_debugfs_device_cleanup(device);
+       drbd_err(device, "failed to create debugfs entries\n");
+}
+
+void drbd_debugfs_device_cleanup(struct drbd_device *device)
+{
+       /* Removal order: the minors symlink and the attribute files first,
+        * the volume directory that holds those files last. */
+       struct dentry **entries[] = {
+               &device->debugfs_minor,
+               &device->debugfs_vol_oldest_requests,
+               &device->debugfs_vol_act_log_extents,
+               &device->debugfs_vol_resync_extents,
+               &device->debugfs_vol_data_gen_id,
+               &device->debugfs_vol,
+       };
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(entries); i++)
+               drbd_debugfs_remove(entries[i]);
+}
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device)
+{
+       struct dentry *conn_dir = peer_device->connection->debugfs_conn;
+       struct dentry *vol_dir;
+       char name[8];
+
+       /* the connection has no debugfs directory: nothing to hook into */
+       if (!conn_dir)
+               return;
+
+       /* directory named after the volume number, inside the
+        * connection's directory */
+       snprintf(name, sizeof(name), "%u", peer_device->device->vnr);
+       vol_dir = debugfs_create_dir(name, conn_dir);
+       if (!IS_ERR_OR_NULL(vol_dir)) {
+               peer_device->debugfs_peer_dev = vol_dir;
+               return;
+       }
+
+       /* creation failed: clean up and report */
+       drbd_debugfs_peer_device_cleanup(peer_device);
+       drbd_err(peer_device, "failed to create debugfs entries\n");
+}
+
+/* Remove the debugfs entry stored in peer_device->debugfs_peer_dev,
+ * i.e. the directory created by drbd_debugfs_peer_device_add(). */
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device)
+{
+       drbd_debugfs_remove(&peer_device->debugfs_peer_dev);
+}
+
+static int drbd_version_show(struct seq_file *m, void *ignored)
+{
+       /* build tag as a '#' comment line, then one KEY=value line each for
+        * the release, API and protocol (min/max) versions */
+       seq_printf(m,
+                  "# %s\n"
+                  "VERSION=%s\n"
+                  "API_VERSION=%u\n"
+                  "PRO_VERSION_MIN=%u\n"
+                  "PRO_VERSION_MAX=%u\n",
+                  drbd_buildtag(),
+                  REL_VERSION,
+                  API_VERSION,
+                  PRO_VERSION_MIN,
+                  PRO_VERSION_MAX);
+       return 0;
+}
+
+/* open() of the "version" file: binds drbd_version_show() to the file
+ * through single_open(); no private data is passed (NULL). */
+static int drbd_version_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, drbd_version_show, NULL);
+}
+
+/* File operations of the "version" debugfs file: a seq_file served by
+ * drbd_version_show().  The table is never written at run time, so it is
+ * const, matching the per-device tables generated by
+ * drbd_debugfs_device_attr(). */
+static const struct file_operations drbd_version_fops = {
+       .owner = THIS_MODULE,
+       .open = drbd_version_open,
+       .llseek = seq_lseek,
+       .read = seq_read,
+       .release = single_release,
+};
+
+/* Remove the module-wide debugfs entries set up by drbd_debugfs_init():
+ * the "resources" and "minors" directories and the "version" file first,
+ * the root directory last.
+ * not __exit, may be indirectly called
+ * from the module-load-failure path as well. */
+void drbd_debugfs_cleanup(void)
+{
+       drbd_debugfs_remove(&drbd_debugfs_resources);
+       drbd_debugfs_remove(&drbd_debugfs_minors);
+       drbd_debugfs_remove(&drbd_debugfs_version);
+       drbd_debugfs_remove(&drbd_debugfs_root);
+}
+
+int __init drbd_debugfs_init(void)
+{
+       struct dentry *dentry;
+
+       /* top-level "drbd" directory; everything else hangs below it */
+       dentry = debugfs_create_dir("drbd", NULL);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       drbd_debugfs_root = dentry;
+
+       /* drbd/version */
+       dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       drbd_debugfs_version = dentry;
+
+       /* drbd/resources/ */
+       dentry = debugfs_create_dir("resources", drbd_debugfs_root);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       drbd_debugfs_resources = dentry;
+
+       /* drbd/minors/ */
+       dentry = debugfs_create_dir("minors", drbd_debugfs_root);
+       if (IS_ERR_OR_NULL(dentry))
+               goto fail;
+       drbd_debugfs_minors = dentry;
+
+       return 0;
+
+fail:
+       /* tear down what already exists; a NULL dentry carries no error
+        * code, so it is reported as -EINVAL */
+       drbd_debugfs_cleanup();
+       return dentry ? PTR_ERR(dentry) : -EINVAL;
+}
diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h
new file mode 100644 (file)
index 0000000..8bee213
--- /dev/null
@@ -0,0 +1,47 @@
+#ifndef DRBD_DEBUGFS_H
+#define DRBD_DEBUGFS_H
+
+/* Entry points for DRBD's debugfs files.  Guarded against multiple
+ * inclusion, which would otherwise redefine the inline stubs below. */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+
+#include "drbd_int.h"
+
+#ifdef CONFIG_DEBUG_FS
+int __init drbd_debugfs_init(void);
+void drbd_debugfs_cleanup(void);
+
+void drbd_debugfs_resource_add(struct drbd_resource *resource);
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource);
+
+void drbd_debugfs_connection_add(struct drbd_connection *connection);
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection);
+
+void drbd_debugfs_device_add(struct drbd_device *device);
+void drbd_debugfs_device_cleanup(struct drbd_device *device);
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device);
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device);
+#else /* !CONFIG_DEBUG_FS: empty stubs, init reports -ENODEV */
+
+static inline int __init drbd_debugfs_init(void) { return -ENODEV; }
+static inline void drbd_debugfs_cleanup(void) { }
+
+static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { }
+static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { }
+
+static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { }
+static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { }
+
+static inline void drbd_debugfs_device_add(struct drbd_device *device) { }
+static inline void drbd_debugfs_device_cleanup(struct drbd_device *device) { }
+
+static inline void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) { }
+static inline void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) { }
+
+#endif /* CONFIG_DEBUG_FS */
+
+#endif /* DRBD_DEBUGFS_H */
index a76ceb344d64e9411bbfd39c4db9189e519fac72..1a000016ccdfb8bfecf6769a09bb9c48a6aab215 100644 (file)
@@ -317,7 +317,63 @@ struct drbd_request {
 
        struct list_head tl_requests; /* ring list in the transfer log */
        struct bio *master_bio;       /* master bio pointer */
-       unsigned long start_time;
+
+       /* see struct drbd_device */
+       struct list_head req_pending_master_completion;
+       struct list_head req_pending_local;
+
+       /* for generic IO accounting */
+       unsigned long start_jif;
+
+       /* for DRBD internal statistics */
+
+       /* Minimal set of time stamps to determine if we wait for activity log
+        * transactions, local disk or peer.  32 bit "jiffies" are good enough,
+        * we don't expect a DRBD request to be stalled for several months.
+        */
+
+       /* before actual request processing */
+       unsigned long in_actlog_jif;
+
+       /* local disk */
+       unsigned long pre_submit_jif;
+
+       /* per connection */
+       unsigned long pre_send_jif;
+       unsigned long acked_jif;
+       unsigned long net_done_jif;
+
+       /* Possibly even more detail to track each phase:
+        *  master_completion_jif
+        *      how long did it take to complete the master bio
+        *      (application visible latency)
+        *  allocated_jif
+        *      how long the master bio was blocked until we finally allocated
+        *      a tracking struct
+        *  in_actlog_jif
+        *      how long did we wait for activity log transactions
+        *
+        *  net_queued_jif
+        *      when did we finally queue it for sending
+        *  pre_send_jif
+        *      when did we start sending it
+        *  post_send_jif
+        *      how long did we block in the network stack trying to send it
+        *  acked_jif
+        *      when did we receive (or fake, in protocol A) a remote ACK
+        *  net_done_jif
+        *      when did we receive final acknowledgement (P_BARRIER_ACK),
+        *      or decide, e.g. on connection loss, that we do no longer expect
+        *      anything from this peer for this request.
+        *
+        *  pre_submit_jif
+        *  post_sub_jif
+        *      when did we start submitting to the lower level device,
+        *      and how long did we block in that submit function
+        *  local_completion_jif
+        *      how long did it take the lower level device to complete this request
+        */
+
 
        /* once it hits 0, we may complete the master_bio */
        atomic_t completion_ref;
@@ -366,6 +422,7 @@ struct drbd_peer_request {
        struct drbd_interval i;
        /* see comments on ee flag bits below */
        unsigned long flags;
+       unsigned long submit_jif;
        union {
                u64 block_id;
                struct digest_info *digest;
@@ -408,6 +465,17 @@ enum {
 
        /* Is set when net_conf had two_primaries set while creating this peer_req */
        __EE_IN_INTERVAL_TREE,
+
+       /* for debugfs: */
+       /* has this been submitted, or does it still wait for something else? */
+       __EE_SUBMITTED,
+
+       /* this is/was a write request */
+       __EE_WRITE,
+
+       /* this originates from application on peer
+        * (not some resync or verify or other DRBD internal request) */
+       __EE_APPLICATION,
 };
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
@@ -419,6 +487,9 @@ enum {
 #define EE_RESTART_REQUESTS    (1<<__EE_RESTART_REQUESTS)
 #define EE_SEND_WRITE_ACK      (1<<__EE_SEND_WRITE_ACK)
 #define EE_IN_INTERVAL_TREE    (1<<__EE_IN_INTERVAL_TREE)
+#define EE_SUBMITTED           (1<<__EE_SUBMITTED)
+#define EE_WRITE               (1<<__EE_WRITE)
+#define EE_APPLICATION         (1<<__EE_APPLICATION)
 
 /* flag bits per device */
 enum {
@@ -433,11 +504,11 @@ enum {
        CONSIDER_RESYNC,
 
        MD_NO_FUA,              /* Users wants us to not use FUA/FLUSH on meta data dev */
+
        SUSPEND_IO,             /* suspend application io */
        BITMAP_IO,              /* suspend application io;
                                   once no more io in flight, start bitmap io */
        BITMAP_IO_QUEUED,       /* Started bitmap IO */
-       GO_DISKLESS,            /* Disk is being detached, on io-error or admin request. */
        WAS_IO_ERROR,           /* Local disk failed, returned IO error */
        WAS_READ_ERROR,         /* Local disk READ failed (set additionally to the above) */
        FORCE_DETACH,           /* Force-detach from local disk, aborting any pending local IO */
@@ -450,6 +521,20 @@ enum {
        B_RS_H_DONE,            /* Before resync handler done (already executed) */
        DISCARD_MY_DATA,        /* discard_my_data flag per volume */
        READ_BALANCE_RR,
+
+       FLUSH_PENDING,          /* if set, device->flush_jif is when we submitted that flush
+                                * from drbd_flush_after_epoch() */
+
+       /* cleared only after backing device related structures have been destroyed. */
+       GOING_DISKLESS,         /* Disk is being detached, because of io-error, or admin request. */
+
+       /* to be used in drbd_device_post_work() */
+       GO_DISKLESS,            /* tell worker to schedule cleanup before detach */
+       DESTROY_DISK,           /* tell worker to close backing devices and destroy related structures. */
+       MD_SYNC,                /* tell worker to call drbd_md_sync() */
+       RS_START,               /* tell worker to start resync/OV */
+       RS_PROGRESS,            /* tell worker that resync made significant progress */
+       RS_DONE,                /* tell worker that resync is done */
 };
 
 struct drbd_bitmap; /* opaque for drbd_device */
@@ -531,6 +616,11 @@ struct drbd_backing_dev {
 };
 
 struct drbd_md_io {
+       struct page *page;
+       unsigned long start_jif;        /* last call to drbd_md_get_buffer */
+       unsigned long submit_jif;       /* last _drbd_md_sync_page_io() submit */
+       const char *current_use;
+       atomic_t in_use;
        unsigned int done;
        int error;
 };
@@ -577,10 +667,18 @@ enum {
                                 * and potentially deadlock on, this drbd worker.
                                 */
        DISCONNECT_SENT,
+
+       DEVICE_WORK_PENDING,    /* tell worker that some device has pending work */
 };
 
 struct drbd_resource {
        char *name;
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *debugfs_res;
+       struct dentry *debugfs_res_volumes;
+       struct dentry *debugfs_res_connections;
+       struct dentry *debugfs_res_in_flight_summary;
+#endif
        struct kref kref;
        struct idr devices;             /* volume number to device mapping */
        struct list_head connections;
@@ -594,12 +692,28 @@ struct drbd_resource {
        unsigned susp_nod:1;            /* IO suspended because no data */
        unsigned susp_fen:1;            /* IO suspended because fence peer handler runs */
 
+       enum write_ordering_e write_ordering;
+
        cpumask_var_t cpu_mask;
 };
 
+struct drbd_thread_timing_details /* one entry of the w_/r_timing_details history arrays */
+{
+       unsigned long start_jif; /* presumably jiffies at entry -- confirm in __update_timing_details() */
+       void *cb_addr;           /* presumably the cb pointer given to __update_timing_details() -- confirm */
+       const char *caller_fn;   /* the update_*_timing_details() macros pass __func__ */
+       unsigned int line;       /* the update_*_timing_details() macros pass __LINE__ */
+       unsigned int cb_nr;      /* presumably a copy of the w_cb_nr/r_cb_nr counter -- confirm */
+};
+
 struct drbd_connection {
        struct list_head connections;
        struct drbd_resource *resource;
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *debugfs_conn;
+       struct dentry *debugfs_conn_callback_history;
+       struct dentry *debugfs_conn_oldest_requests;
+#endif
        struct kref kref;
        struct idr peer_devices;        /* volume number to peer device mapping */
        enum drbd_conns cstate;         /* Only C_STANDALONE to C_WF_REPORT_PARAMS */
@@ -636,7 +750,6 @@ struct drbd_connection {
        struct drbd_epoch *current_epoch;
        spinlock_t epoch_lock;
        unsigned int epochs;
-       enum write_ordering_e write_ordering;
        atomic_t current_tle_nr;        /* transfer log epoch number */
        unsigned current_tle_writes;    /* writes seen within this tl epoch */
 
@@ -645,9 +758,22 @@ struct drbd_connection {
        struct drbd_thread worker;
        struct drbd_thread asender;
 
+       /* cached pointers,
+        * so we can look up the oldest pending requests more quickly.
+        * protected by resource->req_lock */
+       struct drbd_request *req_next; /* DRBD 9: todo.req_next */
+       struct drbd_request *req_ack_pending;
+       struct drbd_request *req_not_net_done;
+
        /* sender side */
        struct drbd_work_queue sender_work;
 
+#define DRBD_THREAD_DETAILS_HIST       16
+       unsigned int w_cb_nr; /* keeps counting up */
+       unsigned int r_cb_nr; /* keeps counting up */
+       struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST];
+       struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];
+
        struct {
                /* whether this sender thread
                 * has processed a single write yet. */
@@ -663,11 +789,22 @@ struct drbd_connection {
        } send;
 };
 
+/* Wrappers passing a connection's worker (w_) or receiver (r_) history array
+ * and counter plus the call site; (c) is parenthesized so any expression works. */
+void __update_timing_details(struct drbd_thread_timing_details *tdp,
+               unsigned int *cb_nr, void *cb,
+               const char *fn, const unsigned int line);
+
+#define update_worker_timing_details(c, cb) \
+       __update_timing_details((c)->w_timing_details, &(c)->w_cb_nr, cb, __func__ , __LINE__ )
+#define update_receiver_timing_details(c, cb) \
+       __update_timing_details((c)->r_timing_details, &(c)->r_cb_nr, cb, __func__ , __LINE__ )
+
 struct submit_worker {
        struct workqueue_struct *wq;
        struct work_struct worker;
 
-       spinlock_t lock;
+       /* protected by ..->resource->req_lock */
        struct list_head writes;
 };
 
@@ -675,12 +812,29 @@ struct drbd_peer_device {
        struct list_head peer_devices;
        struct drbd_device *device;
        struct drbd_connection *connection;
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *debugfs_peer_dev;
+#endif
 };
 
 struct drbd_device {
        struct drbd_resource *resource;
        struct list_head peer_devices;
-       int vnr;                        /* volume number within the connection */
+       struct list_head pending_bitmap_io;
+
+       unsigned long flush_jif;
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *debugfs_minor;
+       struct dentry *debugfs_vol;
+       struct dentry *debugfs_vol_oldest_requests;
+       struct dentry *debugfs_vol_act_log_extents;
+       struct dentry *debugfs_vol_resync_extents;
+       struct dentry *debugfs_vol_data_gen_id;
+#endif
+
+       unsigned int vnr;       /* volume number within the connection */
+       unsigned int minor;     /* device minor number */
+
        struct kref kref;
 
        /* things that are stored as / read from meta data on disk */
@@ -697,19 +851,10 @@ struct drbd_device {
        unsigned long last_reattach_jif;
        struct drbd_work resync_work;
        struct drbd_work unplug_work;
-       struct drbd_work go_diskless;
-       struct drbd_work md_sync_work;
-       struct drbd_work start_resync_work;
        struct timer_list resync_timer;
        struct timer_list md_sync_timer;
        struct timer_list start_resync_timer;
        struct timer_list request_timer;
-#ifdef DRBD_DEBUG_MD_SYNC
-       struct {
-               unsigned int line;
-               const char* func;
-       } last_md_mark_dirty;
-#endif
 
        /* Used after attach while negotiating new disk state. */
        union drbd_state new_state_tmp;
@@ -724,6 +869,7 @@ struct drbd_device {
        unsigned int al_writ_cnt;
        unsigned int bm_writ_cnt;
        atomic_t ap_bio_cnt;     /* Requests we need to complete */
+       atomic_t ap_actlog_cnt;  /* Requests waiting for activity log */
        atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
        atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
        atomic_t unacked_cnt;    /* Need to send replies for */
@@ -733,6 +879,13 @@ struct drbd_device {
        struct rb_root read_requests;
        struct rb_root write_requests;
 
+       /* for statistics and timeouts */
+       /* [0] read, [1] write */
+       struct list_head pending_master_completion[2];
+       struct list_head pending_completion[2];
+
+       /* use checksums for *this* resync */
+       bool use_csums;
        /* blocks to resync in this run [unit BM_BLOCK_SIZE] */
        unsigned long rs_total;
        /* number of resync blocks that failed in this run */
@@ -788,9 +941,7 @@ struct drbd_device {
        atomic_t pp_in_use;             /* allocated from page pool */
        atomic_t pp_in_use_by_net;      /* sendpage()d, still referenced by tcp */
        wait_queue_head_t ee_wait;
-       struct page *md_io_page;        /* one page buffer for md_io */
        struct drbd_md_io md_io;
-       atomic_t md_io_in_use;          /* protects the md_io, md_io_page and md_io_tmpp */
        spinlock_t al_lock;
        wait_queue_head_t al_wait;
        struct lru_cache *act_log;      /* activity log */
@@ -800,7 +951,6 @@ struct drbd_device {
        atomic_t packet_seq;
        unsigned int peer_seq;
        spinlock_t peer_seq_lock;
-       unsigned int minor;
        unsigned long comm_bm_set; /* communicated number of set bits. */
        struct bm_io_work bm_io_work;
        u64 ed_uuid; /* UUID of the exposed data */
@@ -824,6 +974,21 @@ struct drbd_device {
        struct submit_worker submit;
 };
 
+struct drbd_bm_aio_ctx {
+       struct drbd_device *device;
+       struct list_head list; /* on device->pending_bitmap_io */
+       unsigned long start_jif; /* presumably jiffies when this bitmap IO started -- confirm */
+       atomic_t in_flight;
+       unsigned int done;
+       unsigned flags; /* BM_AIO_* bits defined right below */
+#define BM_AIO_COPY_PAGES      1
+#define BM_AIO_WRITE_HINTED    2
+#define BM_AIO_WRITE_ALL_PAGES 4
+#define BM_AIO_READ            8
+       int error;
+       struct kref kref;
+};
+
 struct drbd_config_context {
        /* assigned from drbd_genlmsghdr */
        unsigned int minor;
@@ -949,7 +1114,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
 extern int drbd_send_bitmap(struct drbd_device *device);
 extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
 extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
-extern void drbd_free_bc(struct drbd_backing_dev *ldev);
+extern void drbd_free_ldev(struct drbd_backing_dev *ldev);
 extern void drbd_device_cleanup(struct drbd_device *device);
 void drbd_print_uuids(struct drbd_device *device, const char *text);
 
@@ -966,13 +1131,7 @@ extern void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must
 extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local);
 extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold(local);
 extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
-#ifndef DRBD_DEBUG_MD_SYNC
 extern void drbd_md_mark_dirty(struct drbd_device *device);
-#else
-#define drbd_md_mark_dirty(m)  drbd_md_mark_dirty_(m, __LINE__ , __func__ )
-extern void drbd_md_mark_dirty_(struct drbd_device *device,
-               unsigned int line, const char *func);
-#endif
 extern void drbd_queue_bitmap_io(struct drbd_device *device,
                                 int (*io_fn)(struct drbd_device *),
                                 void (*done)(struct drbd_device *, int),
@@ -983,9 +1142,8 @@ extern int drbd_bitmap_io(struct drbd_device *device,
 extern int drbd_bitmap_io_from_worker(struct drbd_device *device,
                int (*io_fn)(struct drbd_device *),
                char *why, enum bm_flag flags);
-extern int drbd_bmio_set_n_write(struct drbd_device *device);
-extern int drbd_bmio_clear_n_write(struct drbd_device *device);
-extern void drbd_ldev_destroy(struct drbd_device *device);
+extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local);
+extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local);
 
 /* Meta data layout
  *
@@ -1105,17 +1263,21 @@ struct bm_extent {
 /* in which _bitmap_ extent (resp. sector) the bit for a certain
  * _storage_ sector is located in */
 #define BM_SECT_TO_EXT(x)   ((x)>>(BM_EXT_SHIFT-9))
+#define BM_BIT_TO_EXT(x)    ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
 
-/* how much _storage_ sectors we have per bitmap sector */
+/* first storage sector a bitmap extent corresponds to */
 #define BM_EXT_TO_SECT(x)   ((sector_t)(x) << (BM_EXT_SHIFT-9))
+/* how many _storage_ sectors we have per bitmap extent */
 #define BM_SECT_PER_EXT     BM_EXT_TO_SECT(1)
+/* how many bits are covered by one bitmap extent (resync extent) */
+#define BM_BITS_PER_EXT     (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
+
+#define BM_BLOCKS_PER_BM_EXT_MASK  (BM_BITS_PER_EXT - 1)
+
 
 /* in one sector of the bitmap, we have this many activity_log extents. */
 #define AL_EXT_PER_BM_SECT  (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
 
-#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
-#define BM_BLOCKS_PER_BM_EXT_MASK  ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
-
 /* the extent in "PER_EXTENT" below is an activity log extent
  * we need that many (long words/bytes) to store the bitmap
  *                  of one AL_EXTENT_SIZE chunk of storage.
@@ -1195,11 +1357,11 @@ extern void _drbd_bm_set_bits(struct drbd_device *device,
                const unsigned long s, const unsigned long e);
 extern int  drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr);
 extern int  drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
-extern int  drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local);
 extern int  drbd_bm_read(struct drbd_device *device) __must_hold(local);
 extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
 extern int  drbd_bm_write(struct drbd_device *device) __must_hold(local);
 extern int  drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
+extern int  drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local);
 extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
 extern int  drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local);
 extern size_t       drbd_bm_words(struct drbd_device *device);
@@ -1213,7 +1375,6 @@ extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned lon
 extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo);
 extern unsigned long _drbd_bm_total_weight(struct drbd_device *device);
 extern unsigned long drbd_bm_total_weight(struct drbd_device *device);
-extern int drbd_bm_rs_done(struct drbd_device *device);
 /* for receive_bitmap */
 extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset,
                size_t number, unsigned long *buffer);
@@ -1312,7 +1473,7 @@ enum determine_dev_size {
 extern enum determine_dev_size
 drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
 extern void resync_after_online_grow(struct drbd_device *);
-extern void drbd_reconsider_max_bio_size(struct drbd_device *device);
+extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev);
 extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
                                        enum drbd_role new_role,
                                        int force);
@@ -1333,7 +1494,7 @@ extern void resume_next_sg(struct drbd_device *device);
 extern void suspend_other_sg(struct drbd_device *device);
 extern int drbd_resync_finished(struct drbd_device *device);
 /* maybe rather drbd_main.c ? */
-extern void *drbd_md_get_buffer(struct drbd_device *device);
+extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent);
 extern void drbd_md_put_buffer(struct drbd_device *device);
 extern int drbd_md_sync_page_io(struct drbd_device *device,
                struct drbd_backing_dev *bdev, sector_t sector, int rw);
@@ -1380,7 +1541,8 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
 extern int drbd_receiver(struct drbd_thread *thi);
 extern int drbd_asender(struct drbd_thread *thi);
 extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
-extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector);
+extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+               bool throttle_if_app_is_waiting);
 extern int drbd_submit_peer_request(struct drbd_device *,
                                    struct drbd_peer_request *, const unsigned,
                                    const int);
@@ -1464,10 +1626,7 @@ static inline void drbd_generic_make_request(struct drbd_device *device,
 {
        __release(local);
        if (!bio->bi_bdev) {
-               printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
-                               "bio->bi_bdev == NULL\n",
-                      device_to_minor(device));
-               dump_stack();
+               drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n");
                bio_endio(bio, -ENODEV);
                return;
        }
@@ -1478,7 +1637,8 @@ static inline void drbd_generic_make_request(struct drbd_device *device,
                generic_make_request(bio);
 }
 
-void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo);
+void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
+                             enum write_ordering_e wo);
 
 /* drbd_proc.c */
 extern struct proc_dir_entry *drbd_proc;
@@ -1489,9 +1649,9 @@ extern const char *drbd_role_str(enum drbd_role s);
 /* drbd_actlog.c */
 extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
 extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i);
-extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate);
+extern void drbd_al_begin_io_commit(struct drbd_device *device);
 extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i);
-extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate);
+extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i);
 extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i);
 extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector);
 extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector);
@@ -1501,14 +1661,17 @@ extern int drbd_rs_del_all(struct drbd_device *device);
 extern void drbd_rs_failed_io(struct drbd_device *device,
                sector_t sector, int size);
 extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go);
-extern void __drbd_set_in_sync(struct drbd_device *device, sector_t sector,
-               int size, const char *file, const unsigned int line);
+
+enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC };
+extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+               enum update_sync_bits_mode mode,
+               const char *file, const unsigned int line);
 #define drbd_set_in_sync(device, sector, size) \
-       __drbd_set_in_sync(device, sector, size, __FILE__, __LINE__)
-extern int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector,
-               int size, const char *file, const unsigned int line);
+       __drbd_change_sync(device, sector, size, SET_IN_SYNC, __FILE__, __LINE__)
 #define drbd_set_out_of_sync(device, sector, size) \
-       __drbd_set_out_of_sync(device, sector, size, __FILE__, __LINE__)
+       __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC, __FILE__, __LINE__)
+#define drbd_rs_failed_io(device, sector, size) \
+       __drbd_change_sync(device, sector, size, RECORD_RS_FAILED, __FILE__, __LINE__)
 extern void drbd_al_shrink(struct drbd_device *device);
 extern int drbd_initialize_al(struct drbd_device *, void *);
 
@@ -1764,25 +1927,38 @@ static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
 }
 
 static inline void
-drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
+drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
 {
        unsigned long flags;
        spin_lock_irqsave(&q->q_lock, flags);
-       list_add(&w->list, &q->q);
+       list_add_tail(&w->list, &q->q);
        spin_unlock_irqrestore(&q->q_lock, flags);
        wake_up(&q->q_wait);
 }
 
 static inline void
-drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
+drbd_queue_work_if_unqueued(struct drbd_work_queue *q, struct drbd_work *w)
 {
        unsigned long flags;
        spin_lock_irqsave(&q->q_lock, flags);
-       list_add_tail(&w->list, &q->q);
+       if (list_empty_careful(&w->list))
+               list_add_tail(&w->list, &q->q);
        spin_unlock_irqrestore(&q->q_lock, flags);
        wake_up(&q->q_wait);
 }
 
+static inline void
+drbd_device_post_work(struct drbd_device *device, int work_bit)
+{
+       if (!test_and_set_bit(work_bit, &device->flags)) {
+               struct drbd_connection *connection =
+                       first_peer_device(device)->connection;
+               struct drbd_work_queue *q = &connection->sender_work;
+               if (!test_and_set_bit(DEVICE_WORK_PENDING, &connection->flags))
+                       wake_up(&q->q_wait);
+       }
+}
+
 extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
 
 static inline void wake_asender(struct drbd_connection *connection)
@@ -1859,7 +2035,7 @@ static inline void inc_ap_pending(struct drbd_device *device)
                        func, line,                                     \
                        atomic_read(&device->which))
 
-#define dec_ap_pending(device) _dec_ap_pending(device, __FUNCTION__, __LINE__)
+#define dec_ap_pending(device) _dec_ap_pending(device, __func__, __LINE__)
 static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line)
 {
        if (atomic_dec_and_test(&device->ap_pending_cnt))
@@ -1878,7 +2054,7 @@ static inline void inc_rs_pending(struct drbd_device *device)
        atomic_inc(&device->rs_pending_cnt);
 }
 
-#define dec_rs_pending(device) _dec_rs_pending(device, __FUNCTION__, __LINE__)
+#define dec_rs_pending(device) _dec_rs_pending(device, __func__, __LINE__)
 static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line)
 {
        atomic_dec(&device->rs_pending_cnt);
@@ -1899,20 +2075,29 @@ static inline void inc_unacked(struct drbd_device *device)
        atomic_inc(&device->unacked_cnt);
 }
 
-#define dec_unacked(device) _dec_unacked(device, __FUNCTION__, __LINE__)
+#define dec_unacked(device) _dec_unacked(device, __func__, __LINE__)
 static inline void _dec_unacked(struct drbd_device *device, const char *func, int line)
 {
        atomic_dec(&device->unacked_cnt);
        ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
 }
 
-#define sub_unacked(device, n) _sub_unacked(device, n, __FUNCTION__, __LINE__)
+#define sub_unacked(device, n) _sub_unacked(device, n, __func__, __LINE__)
 static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line)
 {
        atomic_sub(n, &device->unacked_cnt);
        ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
 }
 
+static inline bool is_sync_state(enum drbd_conns connection_state)
+{
+       return
+          (connection_state == C_SYNC_SOURCE
+       ||  connection_state == C_SYNC_TARGET
+       ||  connection_state == C_PAUSED_SYNC_S
+       ||  connection_state == C_PAUSED_SYNC_T);
+}
+
 /**
  * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev
  * @M:         DRBD device.
@@ -1924,6 +2109,11 @@ static inline void _sub_unacked(struct drbd_device *device, int n, const char *f
 
 static inline void put_ldev(struct drbd_device *device)
 {
+       enum drbd_disk_state ds = device->state.disk;
+       /* We must sample the state *before* the atomic_dec becomes visible.
+        * Otherwise there is a theoretical race: someone hitting zero while
+        * the state is still D_FAILED could then see D_DISKLESS in the
+        * condition below and call into destroy, which it must not do yet. */
        int i = atomic_dec_return(&device->local_cnt);
 
        /* This may be called from some endio handler,
@@ -1932,15 +2122,13 @@ static inline void put_ldev(struct drbd_device *device)
        __release(local);
        D_ASSERT(device, i >= 0);
        if (i == 0) {
-               if (device->state.disk == D_DISKLESS)
+               if (ds == D_DISKLESS)
                        /* even internal references gone, safe to destroy */
-                       drbd_ldev_destroy(device);
-               if (device->state.disk == D_FAILED) {
+                       drbd_device_post_work(device, DESTROY_DISK);
+               if (ds == D_FAILED)
                        /* all application IO references gone. */
-                       if (!test_and_set_bit(GO_DISKLESS, &device->flags))
-                               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
-                                               &device->go_diskless);
-               }
+                       if (!test_and_set_bit(GOING_DISKLESS, &device->flags))
+                               drbd_device_post_work(device, GO_DISKLESS);
                wake_up(&device->misc_wait);
        }
 }
@@ -1964,54 +2152,6 @@ static inline int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_
 extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins);
 #endif
 
-/* you must have an "get_ldev" reference */
-static inline void drbd_get_syncer_progress(struct drbd_device *device,
-               unsigned long *bits_left, unsigned int *per_mil_done)
-{
-       /* this is to break it at compile time when we change that, in case we
-        * want to support more than (1<<32) bits on a 32bit arch. */
-       typecheck(unsigned long, device->rs_total);
-
-       /* note: both rs_total and rs_left are in bits, i.e. in
-        * units of BM_BLOCK_SIZE.
-        * for the percentage, we don't care. */
-
-       if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
-               *bits_left = device->ov_left;
-       else
-               *bits_left = drbd_bm_total_weight(device) - device->rs_failed;
-       /* >> 10 to prevent overflow,
-        * +1 to prevent division by zero */
-       if (*bits_left > device->rs_total) {
-               /* doh. maybe a logic bug somewhere.
-                * may also be just a race condition
-                * between this and a disconnect during sync.
-                * for now, just prevent in-kernel buffer overflow.
-                */
-               smp_rmb();
-               drbd_warn(device, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
-                               drbd_conn_str(device->state.conn),
-                               *bits_left, device->rs_total, device->rs_failed);
-               *per_mil_done = 0;
-       } else {
-               /* Make sure the division happens in long context.
-                * We allow up to one petabyte storage right now,
-                * at a granularity of 4k per bit that is 2**38 bits.
-                * After shift right and multiplication by 1000,
-                * this should still fit easily into a 32bit long,
-                * so we don't need a 64bit division on 32bit arch.
-                * Note: currently we don't support such large bitmaps on 32bit
-                * arch anyways, but no harm done to be prepared for it here.
-                */
-               unsigned int shift = device->rs_total > UINT_MAX ? 16 : 10;
-               unsigned long left = *bits_left >> shift;
-               unsigned long total = 1UL + (device->rs_total >> shift);
-               unsigned long tmp = 1000UL - left * 1000UL/total;
-               *per_mil_done = tmp;
-       }
-}
-
-
 /* this throttles on-the-fly application requests
  * according to max_buffers settings;
  * maybe re-implement using semaphores? */
@@ -2201,25 +2341,6 @@ static inline int drbd_queue_order_type(struct drbd_device *device)
        return QUEUE_ORDERED_NONE;
 }
 
-static inline void drbd_md_flush(struct drbd_device *device)
-{
-       int r;
-
-       if (device->ldev == NULL) {
-               drbd_warn(device, "device->ldev == NULL in drbd_md_flush\n");
-               return;
-       }
-
-       if (test_bit(MD_NO_FUA, &device->flags))
-               return;
-
-       r = blkdev_issue_flush(device->ldev->md_bdev, GFP_NOIO, NULL);
-       if (r) {
-               set_bit(MD_NO_FUA, &device->flags);
-               drbd_err(device, "meta data flush failed with status %d, disabling md-flushes\n", r);
-       }
-}
-
 static inline struct drbd_connection *first_connection(struct drbd_resource *resource)
 {
        return list_first_entry_or_null(&resource->connections,
index f38fcb00c10d6c39b09c334483d80296b463e608..f210543f05f4782674de8abb0b0053e0e4833bfb 100644 (file)
@@ -10,7 +10,9 @@ struct drbd_interval {
        unsigned int size;      /* size in bytes */
        sector_t end;           /* highest interval end in subtree */
        int local:1             /* local or remote request? */;
-       int waiting:1;
+       int waiting:1;          /* someone is waiting for this to complete */
+       int completed:1;        /* this has been completed already;
+                                * ignore for conflict detection */
 };
 
 static inline void drbd_clear_interval(struct drbd_interval *i)
index 960645c26e6fc1b107e0db1ae016fd15800d3514..9b465bb68487b5c0e5a51f72d161ae7ade749453 100644 (file)
 
  */
 
+#define pr_fmt(fmt)    KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
+#include <linux/jiffies.h>
 #include <linux/drbd.h>
 #include <asm/uaccess.h>
 #include <asm/types.h>
 #include "drbd_int.h"
 #include "drbd_protocol.h"
 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
-
 #include "drbd_vli.h"
+#include "drbd_debugfs.h"
 
 static DEFINE_MUTEX(drbd_main_mutex);
 static int drbd_open(struct block_device *bdev, fmode_t mode);
 static void drbd_release(struct gendisk *gd, fmode_t mode);
-static int w_md_sync(struct drbd_work *w, int unused);
 static void md_sync_timer_fn(unsigned long data);
 static int w_bitmap_io(struct drbd_work *w, int unused);
-static int w_go_diskless(struct drbd_work *w, int unused);
 
 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
              "Lars Ellenberg <lars@linbit.com>");
@@ -264,7 +265,7 @@ bail:
 
 /**
  * _tl_restart() - Walks the transfer log, and applies an action to all requests
- * @device:    DRBD device.
+ * @connection:        DRBD connection to operate on.
  * @what:       The action/event to perform with all request objects
  *
  * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
@@ -662,6 +663,11 @@ static int __send_command(struct drbd_connection *connection, int vnr,
                            msg_flags);
        if (data && !err)
                err = drbd_send_all(connection, sock->socket, data, size, 0);
+       /* DRBD protocol "pings" are latency critical.
+        * This is supposed to trigger tcp_push_pending_frames() */
+       if (!err && (cmd == P_PING || cmd == P_PING_ACK))
+               drbd_tcp_nodelay(sock->socket);
+
        return err;
 }
 
@@ -1636,7 +1642,10 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
        if (peer_device->connection->agreed_pro_version >= 100) {
                if (req->rq_state & RQ_EXP_RECEIVE_ACK)
                        dp_flags |= DP_SEND_RECEIVE_ACK;
-               if (req->rq_state & RQ_EXP_WRITE_ACK)
+               /* During resync, request an explicit write ack,
+                * even in protocol != C */
+               if (req->rq_state & RQ_EXP_WRITE_ACK
+               || (dp_flags & DP_MAY_SET_IN_SYNC))
                        dp_flags |= DP_SEND_WRITE_ACK;
        }
        p->dp_flags = cpu_to_be32(dp_flags);
@@ -1900,6 +1909,7 @@ void drbd_init_set_defaults(struct drbd_device *device)
        drbd_set_defaults(device);
 
        atomic_set(&device->ap_bio_cnt, 0);
+       atomic_set(&device->ap_actlog_cnt, 0);
        atomic_set(&device->ap_pending_cnt, 0);
        atomic_set(&device->rs_pending_cnt, 0);
        atomic_set(&device->unacked_cnt, 0);
@@ -1908,7 +1918,7 @@ void drbd_init_set_defaults(struct drbd_device *device)
        atomic_set(&device->rs_sect_in, 0);
        atomic_set(&device->rs_sect_ev, 0);
        atomic_set(&device->ap_in_flight, 0);
-       atomic_set(&device->md_io_in_use, 0);
+       atomic_set(&device->md_io.in_use, 0);
 
        mutex_init(&device->own_state_mutex);
        device->state_mutex = &device->own_state_mutex;
@@ -1924,17 +1934,15 @@ void drbd_init_set_defaults(struct drbd_device *device)
        INIT_LIST_HEAD(&device->resync_reads);
        INIT_LIST_HEAD(&device->resync_work.list);
        INIT_LIST_HEAD(&device->unplug_work.list);
-       INIT_LIST_HEAD(&device->go_diskless.list);
-       INIT_LIST_HEAD(&device->md_sync_work.list);
-       INIT_LIST_HEAD(&device->start_resync_work.list);
        INIT_LIST_HEAD(&device->bm_io_work.w.list);
+       INIT_LIST_HEAD(&device->pending_master_completion[0]);
+       INIT_LIST_HEAD(&device->pending_master_completion[1]);
+       INIT_LIST_HEAD(&device->pending_completion[0]);
+       INIT_LIST_HEAD(&device->pending_completion[1]);
 
        device->resync_work.cb  = w_resync_timer;
        device->unplug_work.cb  = w_send_write_hint;
-       device->go_diskless.cb  = w_go_diskless;
-       device->md_sync_work.cb = w_md_sync;
        device->bm_io_work.w.cb = w_bitmap_io;
-       device->start_resync_work.cb = w_start_resync;
 
        init_timer(&device->resync_timer);
        init_timer(&device->md_sync_timer);
@@ -1992,7 +2000,7 @@ void drbd_device_cleanup(struct drbd_device *device)
                drbd_bm_cleanup(device);
        }
 
-       drbd_free_bc(device->ldev);
+       drbd_free_ldev(device->ldev);
        device->ldev = NULL;
 
        clear_bit(AL_SUSPENDED, &device->flags);
@@ -2006,7 +2014,6 @@ void drbd_device_cleanup(struct drbd_device *device)
        D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
        D_ASSERT(device, list_empty(&device->resync_work.list));
        D_ASSERT(device, list_empty(&device->unplug_work.list));
-       D_ASSERT(device, list_empty(&device->go_diskless.list));
 
        drbd_set_defaults(device);
 }
@@ -2129,20 +2136,6 @@ Enomem:
        return -ENOMEM;
 }
 
-static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
-       void *unused)
-{
-       /* just so we have it.  you never know what interesting things we
-        * might want to do here some day...
-        */
-
-       return NOTIFY_DONE;
-}
-
-static struct notifier_block drbd_notifier = {
-       .notifier_call = drbd_notify_sys,
-};
-
 static void drbd_release_all_peer_reqs(struct drbd_device *device)
 {
        int rr;
@@ -2173,7 +2166,7 @@ void drbd_destroy_device(struct kref *kref)
 {
        struct drbd_device *device = container_of(kref, struct drbd_device, kref);
        struct drbd_resource *resource = device->resource;
-       struct drbd_connection *connection;
+       struct drbd_peer_device *peer_device, *tmp_peer_device;
 
        del_timer_sync(&device->request_timer);
 
@@ -2187,7 +2180,7 @@ void drbd_destroy_device(struct kref *kref)
        if (device->this_bdev)
                bdput(device->this_bdev);
 
-       drbd_free_bc(device->ldev);
+       drbd_free_ldev(device->ldev);
        device->ldev = NULL;
 
        drbd_release_all_peer_reqs(device);
@@ -2200,15 +2193,20 @@ void drbd_destroy_device(struct kref *kref)
 
        if (device->bitmap) /* should no longer be there. */
                drbd_bm_cleanup(device);
-       __free_page(device->md_io_page);
+       __free_page(device->md_io.page);
        put_disk(device->vdisk);
        blk_cleanup_queue(device->rq_queue);
        kfree(device->rs_plan_s);
-       kfree(first_peer_device(device));
-       kfree(device);
 
-       for_each_connection(connection, resource)
-               kref_put(&connection->kref, drbd_destroy_connection);
+       /* not for_each_connection(connection, resource):
+        * those may have been cleaned up and disassociated already.
+        */
+       for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
+               kref_put(&peer_device->connection->kref, drbd_destroy_connection);
+               kfree(peer_device);
+       }
+       memset(device, 0xfd, sizeof(*device));
+       kfree(device);
        kref_put(&resource->kref, drbd_destroy_resource);
 }
 
@@ -2236,7 +2234,7 @@ static void do_retry(struct work_struct *ws)
        list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
                struct drbd_device *device = req->device;
                struct bio *bio = req->master_bio;
-               unsigned long start_time = req->start_time;
+               unsigned long start_jif = req->start_jif;
                bool expected;
 
                expected =
@@ -2271,10 +2269,12 @@ static void do_retry(struct work_struct *ws)
                /* We are not just doing generic_make_request(),
                 * as we want to keep the start_time information. */
                inc_ap_bio(device);
-               __drbd_make_request(device, bio, start_time);
+               __drbd_make_request(device, bio, start_jif);
        }
 }
 
+/* called via drbd_req_put_completion_ref(),
+ * holds resource->req_lock */
 void drbd_restart_request(struct drbd_request *req)
 {
        unsigned long flags;
@@ -2298,6 +2298,7 @@ void drbd_destroy_resource(struct kref *kref)
        idr_destroy(&resource->devices);
        free_cpumask_var(resource->cpu_mask);
        kfree(resource->name);
+       memset(resource, 0xf2, sizeof(*resource));
        kfree(resource);
 }
 
@@ -2307,8 +2308,10 @@ void drbd_free_resource(struct drbd_resource *resource)
 
        for_each_connection_safe(connection, tmp, resource) {
                list_del(&connection->connections);
+               drbd_debugfs_connection_cleanup(connection);
                kref_put(&connection->kref, drbd_destroy_connection);
        }
+       drbd_debugfs_resource_cleanup(resource);
        kref_put(&resource->kref, drbd_destroy_resource);
 }
 
@@ -2318,8 +2321,6 @@ static void drbd_cleanup(void)
        struct drbd_device *device;
        struct drbd_resource *resource, *tmp;
 
-       unregister_reboot_notifier(&drbd_notifier);
-
        /* first remove proc,
         * drbdsetup uses it's presence to detect
         * whether DRBD is loaded.
@@ -2335,6 +2336,7 @@ static void drbd_cleanup(void)
                destroy_workqueue(retry.wq);
 
        drbd_genl_unregister();
+       drbd_debugfs_cleanup();
 
        idr_for_each_entry(&drbd_devices, device, i)
                drbd_delete_device(device);
@@ -2350,7 +2352,7 @@ static void drbd_cleanup(void)
 
        idr_destroy(&drbd_devices);
 
-       printk(KERN_INFO "drbd: module cleanup done.\n");
+       pr_info("module cleanup done.\n");
 }
 
 /**
@@ -2539,6 +2541,20 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op
        if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
                err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE,
                                   cpumask_bits(new_cpu_mask), nr_cpu_ids);
+               if (err == -EOVERFLOW) {
+                       /* Tolerate the overflow: mask out the excess bits. */
+                       cpumask_var_t tmp_cpu_mask;
+                       if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) {
+                               cpumask_setall(tmp_cpu_mask);
+                               cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask);
+                               drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n",
+                                       res_opts->cpu_mask,
+                                       strlen(res_opts->cpu_mask) > 12 ? "..." : "",
+                                       nr_cpu_ids);
+                               free_cpumask_var(tmp_cpu_mask);
+                               err = 0;
+                       }
+               }
                if (err) {
                        drbd_warn(resource, "bitmap_parse() failed with %d\n", err);
                        /* retcode = ERR_CPU_MASK_PARSE; */
@@ -2579,10 +2595,12 @@ struct drbd_resource *drbd_create_resource(const char *name)
        kref_init(&resource->kref);
        idr_init(&resource->devices);
        INIT_LIST_HEAD(&resource->connections);
+       resource->write_ordering = WO_bdev_flush;
        list_add_tail_rcu(&resource->resources, &drbd_resources);
        mutex_init(&resource->conf_update);
        mutex_init(&resource->adm_mutex);
        spin_lock_init(&resource->req_lock);
+       drbd_debugfs_resource_add(resource);
        return resource;
 
 fail_free_name:
@@ -2593,7 +2611,7 @@ fail:
        return NULL;
 }
 
-/* caller must be under genl_lock() */
+/* caller must be under adm_mutex */
 struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
 {
        struct drbd_resource *resource;
@@ -2617,7 +2635,6 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
        INIT_LIST_HEAD(&connection->current_epoch->list);
        connection->epochs = 1;
        spin_lock_init(&connection->epoch_lock);
-       connection->write_ordering = WO_bdev_flush;
 
        connection->send.seen_any_write_yet = false;
        connection->send.current_epoch_nr = 0;
@@ -2652,6 +2669,7 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
 
        kref_get(&resource->kref);
        list_add_tail_rcu(&connection->connections, &resource->connections);
+       drbd_debugfs_connection_add(connection);
        return connection;
 
 fail_resource:
@@ -2680,6 +2698,7 @@ void drbd_destroy_connection(struct kref *kref)
        drbd_free_socket(&connection->data);
        kfree(connection->int_dig_in);
        kfree(connection->int_dig_vv);
+       memset(connection, 0xfc, sizeof(*connection));
        kfree(connection);
        kref_put(&resource->kref, drbd_destroy_resource);
 }
@@ -2694,7 +2713,6 @@ static int init_submitter(struct drbd_device *device)
                return -ENOMEM;
 
        INIT_WORK(&device->submit.worker, do_submit);
-       spin_lock_init(&device->submit.lock);
        INIT_LIST_HEAD(&device->submit.writes);
        return 0;
 }
@@ -2764,8 +2782,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
        blk_queue_merge_bvec(q, drbd_merge_bvec);
        q->queue_lock = &resource->req_lock;
 
-       device->md_io_page = alloc_page(GFP_KERNEL);
-       if (!device->md_io_page)
+       device->md_io.page = alloc_page(GFP_KERNEL);
+       if (!device->md_io.page)
                goto out_no_io_page;
 
        if (drbd_bm_init(device))
@@ -2794,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
        kref_get(&device->kref);
 
        INIT_LIST_HEAD(&device->peer_devices);
+       INIT_LIST_HEAD(&device->pending_bitmap_io);
        for_each_connection(connection, resource) {
                peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL);
                if (!peer_device)
@@ -2829,7 +2848,10 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
                for_each_peer_device(peer_device, device)
                        drbd_connected(peer_device);
        }
-
+       /* TODO: move this to create_peer_device() */
+       for_each_peer_device(peer_device, device)
+               drbd_debugfs_peer_device_add(peer_device);
+       drbd_debugfs_device_add(device);
        return NO_ERROR;
 
 out_idr_remove_vol:
@@ -2853,7 +2875,7 @@ out_idr_remove_minor:
 out_no_minor_idr:
        drbd_bm_cleanup(device);
 out_no_bitmap:
-       __free_page(device->md_io_page);
+       __free_page(device->md_io.page);
 out_no_io_page:
        put_disk(disk);
 out_no_disk:
@@ -2868,8 +2890,13 @@ void drbd_delete_device(struct drbd_device *device)
 {
        struct drbd_resource *resource = device->resource;
        struct drbd_connection *connection;
+       struct drbd_peer_device *peer_device;
        int refs = 3;
 
+       /* TODO: move this to free_peer_device() */
+       for_each_peer_device(peer_device, device)
+               drbd_debugfs_peer_device_cleanup(peer_device);
+       drbd_debugfs_device_cleanup(device);
        for_each_connection(connection, resource) {
                idr_remove(&connection->peer_devices, device->vnr);
                refs++;
@@ -2881,13 +2908,12 @@ void drbd_delete_device(struct drbd_device *device)
        kref_sub(&device->kref, refs, drbd_destroy_device);
 }
 
-int __init drbd_init(void)
+static int __init drbd_init(void)
 {
        int err;
 
        if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
-               printk(KERN_ERR
-                      "drbd: invalid minor_count (%d)\n", minor_count);
+               pr_err("invalid minor_count (%d)\n", minor_count);
 #ifdef MODULE
                return -EINVAL;
 #else
@@ -2897,14 +2923,11 @@ int __init drbd_init(void)
 
        err = register_blkdev(DRBD_MAJOR, "drbd");
        if (err) {
-               printk(KERN_ERR
-                      "drbd: unable to register block device major %d\n",
+               pr_err("unable to register block device major %d\n",
                       DRBD_MAJOR);
                return err;
        }
 
-       register_reboot_notifier(&drbd_notifier);
-
        /*
         * allocate all necessary structs
         */
@@ -2918,7 +2941,7 @@ int __init drbd_init(void)
 
        err = drbd_genl_register();
        if (err) {
-               printk(KERN_ERR "drbd: unable to register generic netlink family\n");
+               pr_err("unable to register generic netlink family\n");
                goto fail;
        }
 
@@ -2929,38 +2952,39 @@ int __init drbd_init(void)
        err = -ENOMEM;
        drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
        if (!drbd_proc) {
-               printk(KERN_ERR "drbd: unable to register proc file\n");
+               pr_err("unable to register proc file\n");
                goto fail;
        }
 
        retry.wq = create_singlethread_workqueue("drbd-reissue");
        if (!retry.wq) {
-               printk(KERN_ERR "drbd: unable to create retry workqueue\n");
+               pr_err("unable to create retry workqueue\n");
                goto fail;
        }
        INIT_WORK(&retry.worker, do_retry);
        spin_lock_init(&retry.lock);
        INIT_LIST_HEAD(&retry.writes);
 
-       printk(KERN_INFO "drbd: initialized. "
+       if (drbd_debugfs_init())
+               pr_notice("failed to initialize debugfs -- will not be available\n");
+
+       pr_info("initialized. "
               "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
               API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
-       printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
-       printk(KERN_INFO "drbd: registered as block device major %d\n",
-               DRBD_MAJOR);
-
+       pr_info("%s\n", drbd_buildtag());
+       pr_info("registered as block device major %d\n", DRBD_MAJOR);
        return 0; /* Success! */
 
 fail:
        drbd_cleanup();
        if (err == -ENOMEM)
-               printk(KERN_ERR "drbd: ran out of memory\n");
+               pr_err("ran out of memory\n");
        else
-               printk(KERN_ERR "drbd: initialization failure\n");
+               pr_err("initialization failure\n");
        return err;
 }
 
-void drbd_free_bc(struct drbd_backing_dev *ldev)
+void drbd_free_ldev(struct drbd_backing_dev *ldev)
 {
        if (ldev == NULL)
                return;
@@ -2972,24 +2996,29 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
        kfree(ldev);
 }
 
-void drbd_free_sock(struct drbd_connection *connection)
+static void drbd_free_one_sock(struct drbd_socket *ds)
 {
-       if (connection->data.socket) {
-               mutex_lock(&connection->data.mutex);
-               kernel_sock_shutdown(connection->data.socket, SHUT_RDWR);
-               sock_release(connection->data.socket);
-               connection->data.socket = NULL;
-               mutex_unlock(&connection->data.mutex);
-       }
-       if (connection->meta.socket) {
-               mutex_lock(&connection->meta.mutex);
-               kernel_sock_shutdown(connection->meta.socket, SHUT_RDWR);
-               sock_release(connection->meta.socket);
-               connection->meta.socket = NULL;
-               mutex_unlock(&connection->meta.mutex);
+       struct socket *s;
+       mutex_lock(&ds->mutex);
+       s = ds->socket;
+       ds->socket = NULL;
+       mutex_unlock(&ds->mutex);
+       if (s) {
+               /* so debugfs does not need to mutex_lock() */
+               synchronize_rcu();
+               kernel_sock_shutdown(s, SHUT_RDWR);
+               sock_release(s);
        }
 }
 
+void drbd_free_sock(struct drbd_connection *connection)
+{
+       if (connection->data.socket)
+               drbd_free_one_sock(&connection->data);
+       if (connection->meta.socket)
+               drbd_free_one_sock(&connection->meta);
+}
+
 /* meta data management */
 
 void conn_md_sync(struct drbd_connection *connection)
@@ -3093,7 +3122,7 @@ void drbd_md_sync(struct drbd_device *device)
        if (!get_ldev_if_state(device, D_FAILED))
                return;
 
-       buffer = drbd_md_get_buffer(device);
+       buffer = drbd_md_get_buffer(device, __func__);
        if (!buffer)
                goto out;
 
@@ -3253,7 +3282,7 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
        if (device->state.disk != D_DISKLESS)
                return ERR_DISK_CONFIGURED;
 
-       buffer = drbd_md_get_buffer(device);
+       buffer = drbd_md_get_buffer(device, __func__);
        if (!buffer)
                return ERR_NOMEM;
 
@@ -3466,23 +3495,19 @@ void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local)
  *
  * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
  */
-int drbd_bmio_set_n_write(struct drbd_device *device)
+int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local)
 {
        int rv = -EIO;
 
-       if (get_ldev_if_state(device, D_ATTACHING)) {
-               drbd_md_set_flag(device, MDF_FULL_SYNC);
-               drbd_md_sync(device);
-               drbd_bm_set_all(device);
-
-               rv = drbd_bm_write(device);
+       drbd_md_set_flag(device, MDF_FULL_SYNC);
+       drbd_md_sync(device);
+       drbd_bm_set_all(device);
 
-               if (!rv) {
-                       drbd_md_clear_flag(device, MDF_FULL_SYNC);
-                       drbd_md_sync(device);
-               }
+       rv = drbd_bm_write(device);
 
-               put_ldev(device);
+       if (!rv) {
+               drbd_md_clear_flag(device, MDF_FULL_SYNC);
+               drbd_md_sync(device);
        }
 
        return rv;
@@ -3494,18 +3519,11 @@ int drbd_bmio_set_n_write(struct drbd_device *device)
  *
  * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
  */
-int drbd_bmio_clear_n_write(struct drbd_device *device)
+int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local)
 {
-       int rv = -EIO;
-
        drbd_resume_al(device);
-       if (get_ldev_if_state(device, D_ATTACHING)) {
-               drbd_bm_clear_all(device);
-               rv = drbd_bm_write(device);
-               put_ldev(device);
-       }
-
-       return rv;
+       drbd_bm_clear_all(device);
+       return drbd_bm_write(device);
 }
 
 static int w_bitmap_io(struct drbd_work *w, int unused)
@@ -3537,61 +3555,6 @@ static int w_bitmap_io(struct drbd_work *w, int unused)
        return 0;
 }
 
-void drbd_ldev_destroy(struct drbd_device *device)
-{
-       lc_destroy(device->resync);
-       device->resync = NULL;
-       lc_destroy(device->act_log);
-       device->act_log = NULL;
-       __no_warn(local,
-               drbd_free_bc(device->ldev);
-               device->ldev = NULL;);
-
-       clear_bit(GO_DISKLESS, &device->flags);
-}
-
-static int w_go_diskless(struct drbd_work *w, int unused)
-{
-       struct drbd_device *device =
-               container_of(w, struct drbd_device, go_diskless);
-
-       D_ASSERT(device, device->state.disk == D_FAILED);
-       /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
-        * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
-        * the protected members anymore, though, so once put_ldev reaches zero
-        * again, it will be safe to free them. */
-
-       /* Try to write changed bitmap pages, read errors may have just
-        * set some bits outside the area covered by the activity log.
-        *
-        * If we have an IO error during the bitmap writeout,
-        * we will want a full sync next time, just in case.
-        * (Do we want a specific meta data flag for this?)
-        *
-        * If that does not make it to stable storage either,
-        * we cannot do anything about that anymore.
-        *
-        * We still need to check if both bitmap and ldev are present, we may
-        * end up here after a failed attach, before ldev was even assigned.
-        */
-       if (device->bitmap && device->ldev) {
-               /* An interrupted resync or similar is allowed to recounts bits
-                * while we detach.
-                * Any modifications would not be expected anymore, though.
-                */
-               if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
-                                       "detach", BM_LOCKED_TEST_ALLOWED)) {
-                       if (test_bit(WAS_READ_ERROR, &device->flags)) {
-                               drbd_md_set_flag(device, MDF_FULL_SYNC);
-                               drbd_md_sync(device);
-                       }
-               }
-       }
-
-       drbd_force_state(device, NS(disk, D_DISKLESS));
-       return 0;
-}
-
 /**
  * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
  * @device:    DRBD device.
@@ -3603,6 +3566,9 @@ static int w_go_diskless(struct drbd_work *w, int unused)
  * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
  * called from worker context. It MUST NOT be used while a previous such
  * work is still pending!
+ *
+ * Its worker function wraps the call to io_fn() in get_ldev() and
+ * put_ldev().
  */
 void drbd_queue_bitmap_io(struct drbd_device *device,
                          int (*io_fn)(struct drbd_device *),
@@ -3685,25 +3651,7 @@ int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
 static void md_sync_timer_fn(unsigned long data)
 {
        struct drbd_device *device = (struct drbd_device *) data;
-
-       /* must not double-queue! */
-       if (list_empty(&device->md_sync_work.list))
-               drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
-                                     &device->md_sync_work);
-}
-
-static int w_md_sync(struct drbd_work *w, int unused)
-{
-       struct drbd_device *device =
-               container_of(w, struct drbd_device, md_sync_work);
-
-       drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
-#ifdef DEBUG
-       drbd_warn(device, "last md_mark_dirty: %s:%u\n",
-               device->last_md_mark_dirty.func, device->last_md_mark_dirty.line);
-#endif
-       drbd_md_sync(device);
-       return 0;
+       drbd_device_post_work(device, MD_SYNC);
 }
 
 const char *cmdname(enum drbd_packet cmd)
index 3f2e1673808053a4de70077735a67723f9955b64..1cd47df44bdaf57d74c1ff3adede40835f1b1f29 100644 (file)
@@ -23,6 +23,8 @@
 
  */
 
+#define pr_fmt(fmt)    KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/drbd.h>
 #include <linux/in.h>
@@ -85,7 +87,7 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
 {
        genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
        if (genlmsg_reply(skb, info))
-               printk(KERN_ERR "drbd: error sending genl reply\n");
+               pr_err("error sending genl reply\n");
 }
 
 /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
@@ -558,8 +560,10 @@ void conn_try_outdate_peer_async(struct drbd_connection *connection)
 }
 
 enum drbd_state_rv
-drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
+drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int force)
 {
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
        const int max_tries = 4;
        enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
        struct net_conf *nc;
@@ -607,7 +611,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
                    device->state.disk == D_CONSISTENT && mask.pdsk == 0) {
                        D_ASSERT(device, device->state.pdsk == D_UNKNOWN);
 
-                       if (conn_try_outdate_peer(first_peer_device(device)->connection)) {
+                       if (conn_try_outdate_peer(connection)) {
                                val.disk = D_UP_TO_DATE;
                                mask.disk = D_MASK;
                        }
@@ -617,7 +621,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
                if (rv == SS_NOTHING_TO_DO)
                        goto out;
                if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
-                       if (!conn_try_outdate_peer(first_peer_device(device)->connection) && force) {
+                       if (!conn_try_outdate_peer(connection) && force) {
                                drbd_warn(device, "Forced into split brain situation!\n");
                                mask.pdsk = D_MASK;
                                val.pdsk  = D_OUTDATED;
@@ -630,7 +634,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
                           retry at most once more in this case. */
                        int timeo;
                        rcu_read_lock();
-                       nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+                       nc = rcu_dereference(connection->net_conf);
                        timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
                        rcu_read_unlock();
                        schedule_timeout_interruptible(timeo);
@@ -659,19 +663,17 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
        /* FIXME also wait for all pending P_BARRIER_ACK? */
 
        if (new_role == R_SECONDARY) {
-               set_disk_ro(device->vdisk, true);
                if (get_ldev(device)) {
                        device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
                        put_ldev(device);
                }
        } else {
-               /* Called from drbd_adm_set_role only.
-                * We are still holding the conf_update mutex. */
-               nc = first_peer_device(device)->connection->net_conf;
+               mutex_lock(&device->resource->conf_update);
+               nc = connection->net_conf;
                if (nc)
                        nc->discard_my_data = 0; /* without copy; single bit op is atomic */
+               mutex_unlock(&device->resource->conf_update);
 
-               set_disk_ro(device->vdisk, false);
                if (get_ldev(device)) {
                        if (((device->state.conn < C_CONNECTED ||
                               device->state.pdsk <= D_FAILED)
@@ -689,12 +691,12 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
        if (device->state.conn >= C_WF_REPORT_PARAMS) {
                /* if this was forced, we should consider sync */
                if (forced)
-                       drbd_send_uuids(first_peer_device(device));
-               drbd_send_current_state(first_peer_device(device));
+                       drbd_send_uuids(peer_device);
+               drbd_send_current_state(peer_device);
        }
 
        drbd_md_sync(device);
-
+       set_disk_ro(device->vdisk, new_role == R_SECONDARY);
        kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
 out:
        mutex_unlock(device->state_mutex);
@@ -891,7 +893,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
         * still lock the act_log to not trigger ASSERTs there.
         */
        drbd_suspend_io(device);
-       buffer = drbd_md_get_buffer(device); /* Lock meta-data IO */
+       buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */
        if (!buffer) {
                drbd_resume_io(device);
                return DS_ERROR;
@@ -971,6 +973,10 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
        if (la_size_changed || md_moved || rs) {
                u32 prev_flags;
 
+               /* We do some synchronous IO below, which may take some time.
+                * Clear the timer to avoid scary "timer expired!" messages;
+                * the "Superblock" is written out at least twice below anyway. */
+               del_timer(&device->md_sync_timer);
                drbd_al_shrink(device); /* All extents inactive. */
 
                prev_flags = md->flags;
@@ -1116,15 +1122,16 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
        return 0;
 }
 
-static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_bio_size)
+static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
+                                  unsigned int max_bio_size)
 {
        struct request_queue * const q = device->rq_queue;
        unsigned int max_hw_sectors = max_bio_size >> 9;
        unsigned int max_segments = 0;
        struct request_queue *b = NULL;
 
-       if (get_ldev_if_state(device, D_ATTACHING)) {
-               b = device->ldev->backing_bdev->bd_disk->queue;
+       if (bdev) {
+               b = bdev->backing_bdev->bd_disk->queue;
 
                max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
                rcu_read_lock();
@@ -1169,11 +1176,10 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_
                                 b->backing_dev_info.ra_pages);
                        q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
                }
-               put_ldev(device);
        }
 }
 
-void drbd_reconsider_max_bio_size(struct drbd_device *device)
+void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev)
 {
        unsigned int now, new, local, peer;
 
@@ -1181,10 +1187,9 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device)
        local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
        peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
 
-       if (get_ldev_if_state(device, D_ATTACHING)) {
-               local = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
+       if (bdev) {
+               local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9;
                device->local_max_bio_size = local;
-               put_ldev(device);
        }
        local = min(local, DRBD_MAX_BIO_SIZE);
 
@@ -1217,7 +1222,7 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device)
        if (new != now)
                drbd_info(device, "max BIO size = %u\n", new);
 
-       drbd_setup_queue_param(device, new);
+       drbd_setup_queue_param(device, bdev, new);
 }
 
 /* Starts the worker thread */
@@ -1299,6 +1304,13 @@ static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
        return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
 }
 
+static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b)
+{
+       return  a->disk_barrier != b->disk_barrier ||
+               a->disk_flushes != b->disk_flushes ||
+               a->disk_drain != b->disk_drain;
+}
+
 int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 {
        struct drbd_config_context adm_ctx;
@@ -1405,7 +1417,8 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
        else
                set_bit(MD_NO_FUA, &device->flags);
 
-       drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
+       if (write_ordering_changed(old_disk_conf, new_disk_conf))
+               drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush);
 
        drbd_md_sync(device);
 
@@ -1440,6 +1453,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 {
        struct drbd_config_context adm_ctx;
        struct drbd_device *device;
+       struct drbd_peer_device *peer_device;
+       struct drbd_connection *connection;
        int err;
        enum drbd_ret_code retcode;
        enum determine_dev_size dd;
@@ -1462,7 +1477,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 
        device = adm_ctx.device;
        mutex_lock(&adm_ctx.resource->adm_mutex);
-       conn_reconfig_start(first_peer_device(device)->connection);
+       peer_device = first_peer_device(device);
+       connection = peer_device ? peer_device->connection : NULL;
+       conn_reconfig_start(connection);
 
        /* if you want to reconfigure, please tear down first */
        if (device->state.disk > D_DISKLESS) {
@@ -1473,7 +1490,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
         * drbd_ldev_destroy is done already, we may end up here very fast,
         * e.g. if someone calls attach from the on-io-error handler,
         * to realize a "hot spare" feature (not that I'd recommend that) */
-       wait_event(device->misc_wait, !atomic_read(&device->local_cnt));
+       wait_event(device->misc_wait, !test_bit(GOING_DISKLESS, &device->flags));
 
        /* make sure there is no leftover from previous force-detach attempts */
        clear_bit(FORCE_DETACH, &device->flags);
@@ -1529,7 +1546,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
                goto fail;
 
        rcu_read_lock();
-       nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+       nc = rcu_dereference(connection->net_conf);
        if (nc) {
                if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
                        rcu_read_unlock();
@@ -1649,7 +1666,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
         */
        wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device));
        /* and for any other previously queued work */
-       drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
+       drbd_flush_workqueue(&connection->sender_work);
 
        rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE);
        retcode = rv;  /* FIXME: Type mismatch. */
@@ -1710,7 +1727,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        new_disk_conf = NULL;
        new_plan = NULL;
 
-       drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
+       drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush);
 
        if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
                set_bit(CRASHED_PRIMARY, &device->flags);
@@ -1726,7 +1743,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        device->read_cnt = 0;
        device->writ_cnt = 0;
 
-       drbd_reconsider_max_bio_size(device);
+       drbd_reconsider_max_bio_size(device, device->ldev);
 
        /* If I am currently not R_PRIMARY,
         * but meta data primary indicator is set,
@@ -1845,7 +1862,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 
        kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
        put_ldev(device);
-       conn_reconfig_done(first_peer_device(device)->connection);
+       conn_reconfig_done(connection);
        mutex_unlock(&adm_ctx.resource->adm_mutex);
        drbd_adm_finish(&adm_ctx, info, retcode);
        return 0;
@@ -1856,7 +1873,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        drbd_force_state(device, NS(disk, D_DISKLESS));
        drbd_md_sync(device);
  fail:
-       conn_reconfig_done(first_peer_device(device)->connection);
+       conn_reconfig_done(connection);
        if (nbc) {
                if (nbc->backing_bdev)
                        blkdev_put(nbc->backing_bdev,
@@ -1888,7 +1905,7 @@ static int adm_detach(struct drbd_device *device, int force)
        }
 
        drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
-       drbd_md_get_buffer(device); /* make sure there is no in-flight meta-data IO */
+       drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */
        retcode = drbd_request_state(device, NS(disk, D_FAILED));
        drbd_md_put_buffer(device);
        /* D_FAILED will transition to DISKLESS. */
@@ -2654,8 +2671,13 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
        if (retcode != NO_ERROR)
                goto out;
 
-       mutex_lock(&adm_ctx.resource->adm_mutex);
        device = adm_ctx.device;
+       if (!get_ldev(device)) {
+               retcode = ERR_NO_DISK;
+               goto out;
+       }
+
+       mutex_lock(&adm_ctx.resource->adm_mutex);
 
        /* If there is still bitmap IO pending, probably because of a previous
         * resync just being finished, wait for it before requesting a new resync.
@@ -2679,6 +2701,7 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
                retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T));
        drbd_resume_io(device);
        mutex_unlock(&adm_ctx.resource->adm_mutex);
+       put_ldev(device);
 out:
        drbd_adm_finish(&adm_ctx, info, retcode);
        return 0;
@@ -2704,7 +2727,7 @@ out:
        return 0;
 }
 
-static int drbd_bmio_set_susp_al(struct drbd_device *device)
+static int drbd_bmio_set_susp_al(struct drbd_device *device) __must_hold(local)
 {
        int rv;
 
@@ -2725,8 +2748,13 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
        if (retcode != NO_ERROR)
                goto out;
 
-       mutex_lock(&adm_ctx.resource->adm_mutex);
        device = adm_ctx.device;
+       if (!get_ldev(device)) {
+               retcode = ERR_NO_DISK;
+               goto out;
+       }
+
+       mutex_lock(&adm_ctx.resource->adm_mutex);
 
        /* If there is still bitmap IO pending, probably because of a previous
         * resync just being finished, wait for it before requesting a new resync.
@@ -2753,6 +2781,7 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
                retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S));
        drbd_resume_io(device);
        mutex_unlock(&adm_ctx.resource->adm_mutex);
+       put_ldev(device);
 out:
        drbd_adm_finish(&adm_ctx, info, retcode);
        return 0;
@@ -2892,7 +2921,7 @@ static struct drbd_connection *the_only_connection(struct drbd_resource *resourc
        return list_first_entry(&resource->connections, struct drbd_connection, connections);
 }
 
-int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
+static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
                const struct sib_info *sib)
 {
        struct drbd_resource *resource = device->resource;
@@ -3622,13 +3651,6 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
        unsigned seq;
        int err = -ENOMEM;
 
-       if (sib->sib_reason == SIB_SYNC_PROGRESS) {
-               if (time_after(jiffies, device->rs_last_bcast + HZ))
-                       device->rs_last_bcast = jiffies;
-               else
-                       return;
-       }
-
        seq = atomic_inc_return(&drbd_genl_seq);
        msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
        if (!msg)
index 89736bdbbc7044aedaaacbd5dc9c858ca3933596..06e6147c76013602d2591cab96bb239afb073d17 100644 (file)
@@ -60,20 +60,65 @@ static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
                seq_printf(seq, "%ld", v);
 }
 
+static void drbd_get_syncer_progress(struct drbd_device *device,
+               union drbd_dev_state state, unsigned long *rs_total,
+               unsigned long *bits_left, unsigned int *per_mil_done)
+{
+       /* this is to break it at compile time when we change that, in case we
+        * want to support more than (1<<32) bits on a 32bit arch. */
+       typecheck(unsigned long, device->rs_total);
+       *rs_total = device->rs_total;
+
+       /* Note: both rs_total and bits_left are in bits, i.e. in
+        * units of BM_BLOCK_SIZE.
+        * For the percentage, the unit does not matter. */
+
+       if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
+               *bits_left = device->ov_left;
+       else
+               *bits_left = drbd_bm_total_weight(device) - device->rs_failed;
+       /* In the else branch: >> shift (10 or 16) to prevent overflow,
+        * +1 to prevent division by zero */
+       if (*bits_left > *rs_total) {
+               /* D'oh. Maybe a logic bug somewhere.  More likely just a race
+                * between state change and reset of rs_total.
+                */
+               *bits_left = *rs_total;
+               *per_mil_done = *rs_total ? 0 : 1000;
+       } else {
+               /* Make sure the division happens in long context.
+                * We allow up to one petabyte storage right now,
+                * at a granularity of 4k per bit that is 2**38 bits.
+                * After shift right and multiplication by 1000,
+                * this should still fit easily into a 32bit long,
+                * so we don't need a 64bit division on 32bit arch.
+                * Note: currently we don't support such large bitmaps on 32bit
+                * arch anyways, but no harm done to be prepared for it here.
+                */
+               unsigned int shift = *rs_total > UINT_MAX ? 16 : 10;
+               unsigned long left = *bits_left >> shift;
+               unsigned long total = 1UL + (*rs_total >> shift);
+               unsigned long tmp = 1000UL - left * 1000UL/total;
+               *per_mil_done = tmp;
+       }
+}
+
+
 /*lge
  * progress bars shamelessly adapted from driver/md/md.c
  * output looks like
  *     [=====>..............] 33.5% (23456/123456)
  *     finish: 2:20:20 speed: 6,345 (6,456) K/sec
  */
-static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq)
+static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq,
+               union drbd_dev_state state)
 {
-       unsigned long db, dt, dbdt, rt, rs_left;
+       unsigned long db, dt, dbdt, rt, rs_total, rs_left;
        unsigned int res;
        int i, x, y;
        int stalled = 0;
 
-       drbd_get_syncer_progress(device, &rs_left, &res);
+       drbd_get_syncer_progress(device, state, &rs_total, &rs_left, &res);
 
        x = res/50;
        y = 20-x;
@@ -85,21 +130,21 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
                seq_printf(seq, ".");
        seq_printf(seq, "] ");
 
-       if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+       if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
                seq_printf(seq, "verified:");
        else
                seq_printf(seq, "sync'ed:");
        seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
 
        /* if more than a few GB, display in MB */
-       if (device->rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
+       if (rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
                seq_printf(seq, "(%lu/%lu)M",
                            (unsigned long) Bit2KB(rs_left >> 10),
-                           (unsigned long) Bit2KB(device->rs_total >> 10));
+                           (unsigned long) Bit2KB(rs_total >> 10));
        else
                seq_printf(seq, "(%lu/%lu)K\n\t",
                            (unsigned long) Bit2KB(rs_left),
-                           (unsigned long) Bit2KB(device->rs_total));
+                           (unsigned long) Bit2KB(rs_total));
 
        /* see drivers/md/md.c
         * We do not want to overflow, so the order of operands and
@@ -150,13 +195,13 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
        dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
        if (dt == 0)
                dt = 1;
-       db = device->rs_total - rs_left;
+       db = rs_total - rs_left;
        dbdt = Bit2KB(db/dt);
        seq_printf_with_thousands_grouping(seq, dbdt);
        seq_printf(seq, ")");
 
-       if (device->state.conn == C_SYNC_TARGET ||
-           device->state.conn == C_VERIFY_S) {
+       if (state.conn == C_SYNC_TARGET ||
+           state.conn == C_VERIFY_S) {
                seq_printf(seq, " want: ");
                seq_printf_with_thousands_grouping(seq, device->c_sync_rate);
        }
@@ -168,8 +213,8 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
                unsigned long bm_bits = drbd_bm_bits(device);
                unsigned long bit_pos;
                unsigned long long stop_sector = 0;
-               if (device->state.conn == C_VERIFY_S ||
-                   device->state.conn == C_VERIFY_T) {
+               if (state.conn == C_VERIFY_S ||
+                   state.conn == C_VERIFY_T) {
                        bit_pos = bm_bits - device->ov_left;
                        if (verify_can_do_stop_sector(device))
                                stop_sector = device->ov_stop_sector;
@@ -188,22 +233,13 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
        }
 }
 
-static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
-{
-       struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
-
-       seq_printf(seq, "%5d %s %s\n", bme->rs_left,
-                  bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------",
-                  bme->flags & BME_LOCKED ? "LOCKED" : "------"
-                  );
-}
-
 static int drbd_seq_show(struct seq_file *seq, void *v)
 {
        int i, prev_i = -1;
        const char *sn;
        struct drbd_device *device;
        struct net_conf *nc;
+       union drbd_dev_state state;
        char wp;
 
        static char write_ordering_chars[] = {
@@ -241,11 +277,12 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
                        seq_printf(seq, "\n");
                prev_i = i;
 
-               sn = drbd_conn_str(device->state.conn);
+               state = device->state;
+               sn = drbd_conn_str(state.conn);
 
-               if (device->state.conn == C_STANDALONE &&
-                   device->state.disk == D_DISKLESS &&
-                   device->state.role == R_SECONDARY) {
+               if (state.conn == C_STANDALONE &&
+                   state.disk == D_DISKLESS &&
+                   state.role == R_SECONDARY) {
                        seq_printf(seq, "%2d: cs:Unconfigured\n", i);
                } else {
                        /* reset device->congestion_reason */
@@ -258,15 +295,15 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
                           "    ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
                           "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
                           i, sn,
-                          drbd_role_str(device->state.role),
-                          drbd_role_str(device->state.peer),
-                          drbd_disk_str(device->state.disk),
-                          drbd_disk_str(device->state.pdsk),
+                          drbd_role_str(state.role),
+                          drbd_role_str(state.peer),
+                          drbd_disk_str(state.disk),
+                          drbd_disk_str(state.pdsk),
                           wp,
                           drbd_suspended(device) ? 's' : 'r',
-                          device->state.aftr_isp ? 'a' : '-',
-                          device->state.peer_isp ? 'p' : '-',
-                          device->state.user_isp ? 'u' : '-',
+                          state.aftr_isp ? 'a' : '-',
+                          state.peer_isp ? 'p' : '-',
+                          state.user_isp ? 'u' : '-',
                           device->congestion_reason ?: '-',
                           test_bit(AL_SUSPENDED, &device->flags) ? 's' : '-',
                           device->send_cnt/2,
@@ -281,17 +318,17 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
                           atomic_read(&device->unacked_cnt),
                           atomic_read(&device->ap_bio_cnt),
                           first_peer_device(device)->connection->epochs,
-                          write_ordering_chars[first_peer_device(device)->connection->write_ordering]
+                          write_ordering_chars[device->resource->write_ordering]
                        );
                        seq_printf(seq, " oos:%llu\n",
                                   Bit2KB((unsigned long long)
                                           drbd_bm_total_weight(device)));
                }
-               if (device->state.conn == C_SYNC_SOURCE ||
-                   device->state.conn == C_SYNC_TARGET ||
-                   device->state.conn == C_VERIFY_S ||
-                   device->state.conn == C_VERIFY_T)
-                       drbd_syncer_progress(device, seq);
+               if (state.conn == C_SYNC_SOURCE ||
+                   state.conn == C_SYNC_TARGET ||
+                   state.conn == C_VERIFY_S ||
+                   state.conn == C_VERIFY_T)
+                       drbd_syncer_progress(device, seq, state);
 
                if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) {
                        lc_seq_printf_stats(seq, device->resync);
@@ -299,12 +336,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
                        put_ldev(device);
                }
 
-               if (proc_details >= 2) {
-                       if (device->resync) {
-                               lc_seq_dump_details(seq, device->resync, "rs_left",
-                                       resync_dump_detail);
-                       }
-               }
+               if (proc_details >= 2)
+                       seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt));
        }
        rcu_read_unlock();
 
@@ -316,7 +349,7 @@ static int drbd_proc_open(struct inode *inode, struct file *file)
        int err;
 
        if (try_module_get(THIS_MODULE)) {
-               err = single_open(file, drbd_seq_show, PDE_DATA(inode));
+               err = single_open(file, drbd_seq_show, NULL);
                if (err)
                        module_put(THIS_MODULE);
                return err;
index 5b17ec88ea058e766071e66eeadf3d8fca3f4940..9342b8da73ab517620dda3b38f9852e6c1219853 100644 (file)
@@ -362,17 +362,14 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
                        goto fail;
        }
 
+       memset(peer_req, 0, sizeof(*peer_req));
+       INIT_LIST_HEAD(&peer_req->w.list);
        drbd_clear_interval(&peer_req->i);
        peer_req->i.size = data_size;
        peer_req->i.sector = sector;
-       peer_req->i.local = false;
-       peer_req->i.waiting = false;
-
-       peer_req->epoch = NULL;
+       peer_req->submit_jif = jiffies;
        peer_req->peer_device = peer_device;
        peer_req->pages = page;
-       atomic_set(&peer_req->pending_bios, 0);
-       peer_req->flags = 0;
        /*
         * The block_id is opaque to the receiver.  It is not endianness
         * converted, and sent back to the sender unchanged.
@@ -389,11 +386,16 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
 void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
                       int is_net)
 {
+       might_sleep();
        if (peer_req->flags & EE_HAS_DIGEST)
                kfree(peer_req->digest);
        drbd_free_pages(device, peer_req->pages, is_net);
        D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
        D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+       if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
+               peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
+               drbd_al_complete_io(device, &peer_req->i);
+       }
        mempool_free(peer_req, drbd_ee_mempool);
 }
 
@@ -791,8 +793,18 @@ static int receive_first_packet(struct drbd_connection *connection, struct socke
 {
        unsigned int header_size = drbd_header_size(connection);
        struct packet_info pi;
+       struct net_conf *nc;
        int err;
 
+       rcu_read_lock();
+       nc = rcu_dereference(connection->net_conf);
+       if (!nc) {
+               rcu_read_unlock();
+               return -EIO;
+       }
+       sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
+       rcu_read_unlock();
+
        err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
        if (err != header_size) {
                if (err >= 0)
@@ -809,7 +821,7 @@ static int receive_first_packet(struct drbd_connection *connection, struct socke
  * drbd_socket_okay() - Free the socket if its connection is not okay
  * @sock:      pointer to the pointer to the socket.
  */
-static int drbd_socket_okay(struct socket **sock)
+static bool drbd_socket_okay(struct socket **sock)
 {
        int rr;
        char tb[4];
@@ -827,6 +839,30 @@ static int drbd_socket_okay(struct socket **sock)
                return false;
        }
 }
+
+static bool connection_established(struct drbd_connection *connection,
+                                  struct socket **sock1,
+                                  struct socket **sock2)
+{
+       struct net_conf *nc;
+       int timeout;
+       bool ok;
+
+       if (!*sock1 || !*sock2)
+               return false;
+
+       rcu_read_lock();
+       nc = rcu_dereference(connection->net_conf);
+       timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
+       rcu_read_unlock();
+       schedule_timeout_interruptible(timeout);
+
+       ok = drbd_socket_okay(sock1);
+       ok = drbd_socket_okay(sock2) && ok;
+
+       return ok;
+}
+
 /* Gets called if a connection is established, or if a new minor gets created
    in a connection */
 int drbd_connected(struct drbd_peer_device *peer_device)
@@ -868,8 +904,8 @@ static int conn_connect(struct drbd_connection *connection)
        struct drbd_socket sock, msock;
        struct drbd_peer_device *peer_device;
        struct net_conf *nc;
-       int vnr, timeout, h, ok;
-       bool discard_my_data;
+       int vnr, timeout, h;
+       bool discard_my_data, ok;
        enum drbd_state_rv rv;
        struct accept_wait_data ad = {
                .connection = connection,
@@ -913,17 +949,8 @@ static int conn_connect(struct drbd_connection *connection)
                        }
                }
 
-               if (sock.socket && msock.socket) {
-                       rcu_read_lock();
-                       nc = rcu_dereference(connection->net_conf);
-                       timeout = nc->ping_timeo * HZ / 10;
-                       rcu_read_unlock();
-                       schedule_timeout_interruptible(timeout);
-                       ok = drbd_socket_okay(&sock.socket);
-                       ok = drbd_socket_okay(&msock.socket) && ok;
-                       if (ok)
-                               break;
-               }
+               if (connection_established(connection, &sock.socket, &msock.socket))
+                       break;
 
 retry:
                s = drbd_wait_for_connect(connection, &ad);
@@ -969,8 +996,7 @@ randomize:
                                goto out_release_sockets;
                }
 
-               ok = drbd_socket_okay(&sock.socket);
-               ok = drbd_socket_okay(&msock.socket) && ok;
+               ok = connection_established(connection, &sock.socket, &msock.socket);
        } while (!ok);
 
        if (ad.s_listen)
@@ -1151,7 +1177,7 @@ static void drbd_flush(struct drbd_connection *connection)
        struct drbd_peer_device *peer_device;
        int vnr;
 
-       if (connection->write_ordering >= WO_bdev_flush) {
+       if (connection->resource->write_ordering >= WO_bdev_flush) {
                rcu_read_lock();
                idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
                        struct drbd_device *device = peer_device->device;
@@ -1161,14 +1187,22 @@ static void drbd_flush(struct drbd_connection *connection)
                        kref_get(&device->kref);
                        rcu_read_unlock();
 
+                       /* Right now, we have only this one synchronous code path
+                        * for flushes between request epochs.
+                        * We may want to make those asynchronous,
+                        * or at least parallelize the flushes to the volume devices.
+                        */
+                       device->flush_jif = jiffies;
+                       set_bit(FLUSH_PENDING, &device->flags);
                        rv = blkdev_issue_flush(device->ldev->backing_bdev,
                                        GFP_NOIO, NULL);
+                       clear_bit(FLUSH_PENDING, &device->flags);
                        if (rv) {
                                drbd_info(device, "local disk flush failed with status %d\n", rv);
                                /* would rather check on EOPNOTSUPP, but that is not reliable.
                                 * don't try again for ANY return value != 0
                                 * if (rv == -EOPNOTSUPP) */
-                               drbd_bump_write_ordering(connection, WO_drain_io);
+                               drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io);
                        }
                        put_ldev(device);
                        kref_put(&device->kref, drbd_destroy_device);
@@ -1257,15 +1291,30 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connectio
        return rv;
 }
 
+static enum write_ordering_e
+max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
+{
+       struct disk_conf *dc;
+
+       dc = rcu_dereference(bdev->disk_conf);
+
+       if (wo == WO_bdev_flush && !dc->disk_flushes)
+               wo = WO_drain_io;
+       if (wo == WO_drain_io && !dc->disk_drain)
+               wo = WO_none;
+
+       return wo;
+}
+
 /**
  * drbd_bump_write_ordering() - Fall back to an other write ordering method
  * @connection:        DRBD connection.
  * @wo:                Write ordering method to try.
  */
-void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo)
+void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
+                             enum write_ordering_e wo)
 {
-       struct disk_conf *dc;
-       struct drbd_peer_device *peer_device;
+       struct drbd_device *device;
        enum write_ordering_e pwo;
        int vnr;
        static char *write_ordering_str[] = {
@@ -1274,26 +1323,27 @@ void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ord
                [WO_bdev_flush] = "flush",
        };
 
-       pwo = connection->write_ordering;
-       wo = min(pwo, wo);
+       pwo = resource->write_ordering;
+       if (wo != WO_bdev_flush)
+               wo = min(pwo, wo);
        rcu_read_lock();
-       idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-               struct drbd_device *device = peer_device->device;
+       idr_for_each_entry(&resource->devices, device, vnr) {
+               if (get_ldev(device)) {
+                       wo = max_allowed_wo(device->ldev, wo);
+                       if (device->ldev == bdev)
+                               bdev = NULL;
+                       put_ldev(device);
+               }
+       }
 
-               if (!get_ldev_if_state(device, D_ATTACHING))
-                       continue;
-               dc = rcu_dereference(device->ldev->disk_conf);
+       if (bdev)
+               wo = max_allowed_wo(bdev, wo);
 
-               if (wo == WO_bdev_flush && !dc->disk_flushes)
-                       wo = WO_drain_io;
-               if (wo == WO_drain_io && !dc->disk_drain)
-                       wo = WO_none;
-               put_ldev(device);
-       }
        rcu_read_unlock();
-       connection->write_ordering = wo;
-       if (pwo != connection->write_ordering || wo == WO_bdev_flush)
-               drbd_info(connection, "Method to ensure write ordering: %s\n", write_ordering_str[connection->write_ordering]);
+
+       resource->write_ordering = wo;
+       if (pwo != resource->write_ordering || wo == WO_bdev_flush)
+               drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
 }
 
 /**
@@ -1330,6 +1380,13 @@ int drbd_submit_peer_request(struct drbd_device *device,
                /* wait for all pending IO completions, before we start
                 * zeroing things out. */
                conn_wait_active_ee_empty(first_peer_device(device)->connection);
+               /* add it to the active list now,
+                * so we can find it to present it in debugfs */
+               peer_req->submit_jif = jiffies;
+               peer_req->flags |= EE_SUBMITTED;
+               spin_lock_irq(&device->resource->req_lock);
+               list_add_tail(&peer_req->w.list, &device->active_ee);
+               spin_unlock_irq(&device->resource->req_lock);
                if (blkdev_issue_zeroout(device->ldev->backing_bdev,
                        sector, ds >> 9, GFP_NOIO))
                        peer_req->flags |= EE_WAS_ERROR;
@@ -1398,6 +1455,9 @@ submit:
        D_ASSERT(device, page == NULL);
 
        atomic_set(&peer_req->pending_bios, n_bios);
+       /* for debugfs: update timestamp, mark as submitted */
+       peer_req->submit_jif = jiffies;
+       peer_req->flags |= EE_SUBMITTED;
        do {
                bio = bios;
                bios = bios->bi_next;
@@ -1471,7 +1531,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
         * R_PRIMARY crashes now.
         * Therefore we must send the barrier_ack after the barrier request was
         * completed. */
-       switch (connection->write_ordering) {
+       switch (connection->resource->write_ordering) {
        case WO_none:
                if (rv == FE_RECYCLED)
                        return 0;
@@ -1498,7 +1558,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
 
                return 0;
        default:
-               drbd_err(connection, "Strangeness in connection->write_ordering %d\n", connection->write_ordering);
+               drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
+                        connection->resource->write_ordering);
                return -EIO;
        }
 
@@ -1531,7 +1592,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
        struct drbd_peer_request *peer_req;
        struct page *page;
        int dgs, ds, err;
-       int data_size = pi->size;
+       unsigned int data_size = pi->size;
        void *dig_in = peer_device->connection->int_dig_in;
        void *dig_vv = peer_device->connection->int_dig_vv;
        unsigned long *data;
@@ -1578,6 +1639,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
        if (!peer_req)
                return NULL;
 
+       peer_req->flags |= EE_WRITE;
        if (trim)
                return peer_req;
 
@@ -1734,9 +1796,10 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
         * respective _drbd_clear_done_ee */
 
        peer_req->w.cb = e_end_resync_block;
+       peer_req->submit_jif = jiffies;
 
        spin_lock_irq(&device->resource->req_lock);
-       list_add(&peer_req->w.list, &device->sync_ee);
+       list_add_tail(&peer_req->w.list, &device->sync_ee);
        spin_unlock_irq(&device->resource->req_lock);
 
        atomic_add(pi->size >> 9, &device->rs_sect_ev);
@@ -1889,6 +1952,7 @@ static int e_end_block(struct drbd_work *w, int cancel)
                }
                dec_unacked(device);
        }
+
        /* we delete from the conflict detection hash _after_ we sent out the
         * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
        if (peer_req->flags & EE_IN_INTERVAL_TREE) {
@@ -2115,6 +2179,8 @@ static int handle_write_conflicts(struct drbd_device *device,
        drbd_for_each_overlap(i, &device->write_requests, sector, size) {
                if (i == &peer_req->i)
                        continue;
+               if (i->completed)
+                       continue;
 
                if (!i->local) {
                        /*
@@ -2147,7 +2213,6 @@ static int handle_write_conflicts(struct drbd_device *device,
                                          (unsigned long long)sector, size,
                                          superseded ? "local" : "remote");
 
-                       inc_unacked(device);
                        peer_req->w.cb = superseded ? e_send_superseded :
                                                   e_send_retry_write;
                        list_add_tail(&peer_req->w.list, &device->done_ee);
@@ -2206,6 +2271,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 {
        struct drbd_peer_device *peer_device;
        struct drbd_device *device;
+       struct net_conf *nc;
        sector_t sector;
        struct drbd_peer_request *peer_req;
        struct p_data *p = pi->data;
@@ -2245,6 +2311,8 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
        }
 
        peer_req->w.cb = e_end_block;
+       peer_req->submit_jif = jiffies;
+       peer_req->flags |= EE_APPLICATION;
 
        dp_flags = be32_to_cpu(p->dp_flags);
        rw |= wire_flags_to_bio(dp_flags);
@@ -2271,9 +2339,36 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
        spin_unlock(&connection->epoch_lock);
 
        rcu_read_lock();
-       tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
+       nc = rcu_dereference(peer_device->connection->net_conf);
+       tp = nc->two_primaries;
+       if (peer_device->connection->agreed_pro_version < 100) {
+               switch (nc->wire_protocol) {
+               case DRBD_PROT_C:
+                       dp_flags |= DP_SEND_WRITE_ACK;
+                       break;
+               case DRBD_PROT_B:
+                       dp_flags |= DP_SEND_RECEIVE_ACK;
+                       break;
+               }
+       }
        rcu_read_unlock();
+
+       if (dp_flags & DP_SEND_WRITE_ACK) {
+               peer_req->flags |= EE_SEND_WRITE_ACK;
+               inc_unacked(device);
+               /* corresponding dec_unacked() in e_end_block()
+                * or in _drbd_clear_done_ee, respectively */
+       }
+
+       if (dp_flags & DP_SEND_RECEIVE_ACK) {
+               /* I really don't like it that the receiver thread
+                * sends on the msock, but anyways */
+               drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
+       }
+
        if (tp) {
+               /* two primaries implies protocol C */
+               D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
                peer_req->flags |= EE_IN_INTERVAL_TREE;
                err = wait_for_and_update_peer_seq(peer_device, peer_seq);
                if (err)
@@ -2297,44 +2392,18 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
         * active_ee to become empty in drbd_submit_peer_request();
         * better not add ourselves here. */
        if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
-               list_add(&peer_req->w.list, &device->active_ee);
+               list_add_tail(&peer_req->w.list, &device->active_ee);
        spin_unlock_irq(&device->resource->req_lock);
 
        if (device->state.conn == C_SYNC_TARGET)
                wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
 
-       if (peer_device->connection->agreed_pro_version < 100) {
-               rcu_read_lock();
-               switch (rcu_dereference(peer_device->connection->net_conf)->wire_protocol) {
-               case DRBD_PROT_C:
-                       dp_flags |= DP_SEND_WRITE_ACK;
-                       break;
-               case DRBD_PROT_B:
-                       dp_flags |= DP_SEND_RECEIVE_ACK;
-                       break;
-               }
-               rcu_read_unlock();
-       }
-
-       if (dp_flags & DP_SEND_WRITE_ACK) {
-               peer_req->flags |= EE_SEND_WRITE_ACK;
-               inc_unacked(device);
-               /* corresponding dec_unacked() in e_end_block()
-                * respective _drbd_clear_done_ee */
-       }
-
-       if (dp_flags & DP_SEND_RECEIVE_ACK) {
-               /* I really don't like it that the receiver thread
-                * sends on the msock, but anyways */
-               drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
-       }
-
        if (device->state.pdsk < D_INCONSISTENT) {
                /* In case we have the only disk of the cluster, */
                drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
-               peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
                peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
-               drbd_al_begin_io(device, &peer_req->i, true);
+               drbd_al_begin_io(device, &peer_req->i);
+               peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
        }
 
        err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR);
@@ -2347,8 +2416,10 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
        list_del(&peer_req->w.list);
        drbd_remove_epoch_entry_interval(device, peer_req);
        spin_unlock_irq(&device->resource->req_lock);
-       if (peer_req->flags & EE_CALL_AL_COMPLETE_IO)
+       if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
+               peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
                drbd_al_complete_io(device, &peer_req->i);
+       }
 
 out_interrupted:
        drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
@@ -2368,13 +2439,14 @@ out_interrupted:
  * The current sync rate used here uses only the most recent two step marks,
  * to have a short time average so we can react faster.
  */
-bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
+bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+               bool throttle_if_app_is_waiting)
 {
        struct lc_element *tmp;
-       bool throttle = true;
+       bool throttle = drbd_rs_c_min_rate_throttle(device);
 
-       if (!drbd_rs_c_min_rate_throttle(device))
-               return false;
+       if (!throttle || throttle_if_app_is_waiting)
+               return throttle;
 
        spin_lock_irq(&device->al_lock);
        tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
@@ -2382,7 +2454,8 @@ bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
                struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
                if (test_bit(BME_PRIORITY, &bm_ext->flags))
                        throttle = false;
-               /* Do not slow down if app IO is already waiting for this extent */
+               /* Do not slow down if app IO is already waiting for this extent,
+                * and our progress is necessary for application IO to complete. */
        }
        spin_unlock_irq(&device->al_lock);
 
@@ -2407,7 +2480,9 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
        curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
                      (int)part_stat_read(&disk->part0, sectors[1]) -
                        atomic_read(&device->rs_sect_ev);
-       if (!device->rs_last_events || curr_events - device->rs_last_events > 64) {
+
+       if (atomic_read(&device->ap_actlog_cnt)
+           || !device->rs_last_events || curr_events - device->rs_last_events > 64) {
                unsigned long rs_left;
                int i;
 
@@ -2508,6 +2583,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
                peer_req->w.cb = w_e_end_data_req;
                fault_type = DRBD_FAULT_DT_RD;
                /* application IO, don't drbd_rs_begin_io */
+               peer_req->flags |= EE_APPLICATION;
                goto submit;
 
        case P_RS_DATA_REQUEST:
@@ -2538,6 +2614,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
                        peer_req->w.cb = w_e_end_csum_rs_req;
                        /* used in the sector offset progress display */
                        device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+                       /* remember to report stats in drbd_resync_finished */
+                       device->use_csums = true;
                } else if (pi->cmd == P_OV_REPLY) {
                        /* track progress, we may need to throttle */
                        atomic_add(size >> 9, &device->rs_sect_in);
@@ -2595,8 +2673,20 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
         * we would also throttle its application reads.
         * In that case, throttling is done on the SyncTarget only.
         */
-       if (device->state.peer != R_PRIMARY && drbd_rs_should_slow_down(device, sector))
+
+       /* Even though this may be a resync request, we do add to "read_ee";
+        * "sync_ee" is only used for resync WRITEs.
+        * Add to list early, so debugfs can find this request
+        * even if we have to sleep below. */
+       spin_lock_irq(&device->resource->req_lock);
+       list_add_tail(&peer_req->w.list, &device->read_ee);
+       spin_unlock_irq(&device->resource->req_lock);
+
+       update_receiver_timing_details(connection, drbd_rs_should_slow_down);
+       if (device->state.peer != R_PRIMARY
+       && drbd_rs_should_slow_down(device, sector, false))
                schedule_timeout_uninterruptible(HZ/10);
+       update_receiver_timing_details(connection, drbd_rs_begin_io);
        if (drbd_rs_begin_io(device, sector))
                goto out_free_e;
 
@@ -2604,22 +2694,20 @@ submit_for_resync:
        atomic_add(size >> 9, &device->rs_sect_ev);
 
 submit:
+       update_receiver_timing_details(connection, drbd_submit_peer_request);
        inc_unacked(device);
-       spin_lock_irq(&device->resource->req_lock);
-       list_add_tail(&peer_req->w.list, &device->read_ee);
-       spin_unlock_irq(&device->resource->req_lock);
-
        if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
                return 0;
 
        /* don't care for the reason here */
        drbd_err(device, "submit failed, triggering re-connect\n");
+
+out_free_e:
        spin_lock_irq(&device->resource->req_lock);
        list_del(&peer_req->w.list);
        spin_unlock_irq(&device->resource->req_lock);
        /* no drbd_rs_complete_io(), we are dropping the connection anyways */
 
-out_free_e:
        put_ldev(device);
        drbd_free_peer_req(device, peer_req);
        return -EIO;
@@ -2842,8 +2930,10 @@ static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
 -1091   requires proto 91
 -1096   requires proto 96
  */
-static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_hold(local)
+static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local)
 {
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
        u64 self, peer;
        int i, j;
 
@@ -2869,7 +2959,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
 
                if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
 
-                       if (first_peer_device(device)->connection->agreed_pro_version < 91)
+                       if (connection->agreed_pro_version < 91)
                                return -1091;
 
                        if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
@@ -2892,7 +2982,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
 
                if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
 
-                       if (first_peer_device(device)->connection->agreed_pro_version < 91)
+                       if (connection->agreed_pro_version < 91)
                                return -1091;
 
                        if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
@@ -2925,7 +3015,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
                case 1: /*  self_pri && !peer_pri */ return 1;
                case 2: /* !self_pri &&  peer_pri */ return -1;
                case 3: /*  self_pri &&  peer_pri */
-                       dc = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags);
+                       dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
                        return dc ? -1 : 1;
                }
        }
@@ -2938,14 +3028,14 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
        *rule_nr = 51;
        peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
        if (self == peer) {
-               if (first_peer_device(device)->connection->agreed_pro_version < 96 ?
+               if (connection->agreed_pro_version < 96 ?
                    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
                    (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
                    peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
                        /* The last P_SYNC_UUID did not get though. Undo the last start of
                           resync as sync source modifications of the peer's UUIDs. */
 
-                       if (first_peer_device(device)->connection->agreed_pro_version < 91)
+                       if (connection->agreed_pro_version < 91)
                                return -1091;
 
                        device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
@@ -2975,14 +3065,14 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
        *rule_nr = 71;
        self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
        if (self == peer) {
-               if (first_peer_device(device)->connection->agreed_pro_version < 96 ?
+               if (connection->agreed_pro_version < 96 ?
                    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
                    (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
                    self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
                        /* The last P_SYNC_UUID did not get though. Undo the last start of
                           resync as sync source modifications of our UUIDs. */
 
-                       if (first_peer_device(device)->connection->agreed_pro_version < 91)
+                       if (connection->agreed_pro_version < 91)
                                return -1091;
 
                        __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
@@ -3352,8 +3442,7 @@ disconnect:
  * return: NULL (alg name was "")
  *         ERR_PTR(error) if something goes wrong
  *         or the crypto hash ptr, if it worked out ok. */
-static
-struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
+static struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
                const char *alg, const char *name)
 {
        struct crypto_hash *tfm;
@@ -3639,7 +3728,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
        struct drbd_device *device;
        struct p_sizes *p = pi->data;
        enum determine_dev_size dd = DS_UNCHANGED;
-       sector_t p_size, p_usize, my_usize;
+       sector_t p_size, p_usize, p_csize, my_usize;
        int ldsc = 0; /* local disk size changed */
        enum dds_flags ddsf;
 
@@ -3650,6 +3739,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 
        p_size = be64_to_cpu(p->d_size);
        p_usize = be64_to_cpu(p->u_size);
+       p_csize = be64_to_cpu(p->c_size);
 
        /* just store the peer's disk size for now.
         * we still need to figure out whether we accept that. */
@@ -3710,7 +3800,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
        }
 
        device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
-       drbd_reconsider_max_bio_size(device);
        /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
           In case we cleared the QUEUE_FLAG_DISCARD from our queue in
           drbd_reconsider_max_bio_size(), we can be sure that after
@@ -3718,14 +3807,28 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 
        ddsf = be16_to_cpu(p->dds_flags);
        if (get_ldev(device)) {
+               drbd_reconsider_max_bio_size(device, device->ldev);
                dd = drbd_determine_dev_size(device, ddsf, NULL);
                put_ldev(device);
                if (dd == DS_ERROR)
                        return -EIO;
                drbd_md_sync(device);
        } else {
-               /* I am diskless, need to accept the peer's size. */
-               drbd_set_my_capacity(device, p_size);
+               /*
+                * I am diskless, need to accept the peer's *current* size.
+                * I must NOT accept the peer's backing disk size,
+                * it may have been larger than mine all along...
+                *
+                * At this point, the peer knows more about my disk, or at
+                * least about what we last agreed upon, than myself.
+                * So if his c_size is less than his d_size, the most likely
+                * reason is that *my* d_size was smaller last time we checked.
+                *
+                * However, if he sends a zero current size,
+                * take his (user-capped or) backing disk size anyways.
+                */
+               drbd_reconsider_max_bio_size(device, NULL);
+               drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
        }
 
        if (get_ldev(device)) {
@@ -4501,6 +4604,7 @@ static void drbdd(struct drbd_connection *connection)
                struct data_cmd *cmd;
 
                drbd_thread_current_set_cpu(&connection->receiver);
+               update_receiver_timing_details(connection, drbd_recv_header);
                if (drbd_recv_header(connection, &pi))
                        goto err_out;
 
@@ -4519,12 +4623,14 @@ static void drbdd(struct drbd_connection *connection)
                }
 
                if (shs) {
+                       update_receiver_timing_details(connection, drbd_recv_all_warn);
                        err = drbd_recv_all_warn(connection, pi.data, shs);
                        if (err)
                                goto err_out;
                        pi.size -= shs;
                }
 
+               update_receiver_timing_details(connection, cmd->fn);
                err = cmd->fn(connection, &pi);
                if (err) {
                        drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
index 09803d0d5207ce7fccffc5c4a3cb0229071566cd..c67717d572d16c89b1a4f74701b25c0a7ffeafbe 100644 (file)
@@ -52,7 +52,7 @@ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request
 static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req)
 {
        int rw = bio_data_dir(req->master_bio);
-       unsigned long duration = jiffies - req->start_time;
+       unsigned long duration = jiffies - req->start_jif;
        int cpu;
        cpu = part_stat_lock();
        part_stat_add(cpu, &device->vdisk->part0, ticks[rw], duration);
@@ -66,7 +66,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
 {
        struct drbd_request *req;
 
-       req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
+       req = mempool_alloc(drbd_request_mempool, GFP_NOIO | __GFP_ZERO);
        if (!req)
                return NULL;
 
@@ -84,6 +84,8 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
 
        INIT_LIST_HEAD(&req->tl_requests);
        INIT_LIST_HEAD(&req->w.list);
+       INIT_LIST_HEAD(&req->req_pending_master_completion);
+       INIT_LIST_HEAD(&req->req_pending_local);
 
        /* one reference to be put by __drbd_make_request */
        atomic_set(&req->completion_ref, 1);
@@ -92,6 +94,19 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
        return req;
 }
 
+static void drbd_remove_request_interval(struct rb_root *root,
+                                        struct drbd_request *req)
+{
+       struct drbd_device *device = req->device;
+       struct drbd_interval *i = &req->i;
+
+       drbd_remove_interval(root, i);
+
+       /* Wake up any processes waiting for this request to complete.  */
+       if (i->waiting)
+               wake_up(&device->misc_wait);
+}
+
 void drbd_req_destroy(struct kref *kref)
 {
        struct drbd_request *req = container_of(kref, struct drbd_request, kref);
@@ -107,14 +122,30 @@ void drbd_req_destroy(struct kref *kref)
                return;
        }
 
-       /* remove it from the transfer log.
-        * well, only if it had been there in the first
-        * place... if it had not (local only or conflicting
-        * and never sent), it should still be "empty" as
-        * initialized in drbd_req_new(), so we can list_del() it
-        * here unconditionally */
+       /* If called from mod_rq_state (expected normal case) or
+        * drbd_send_and_submit (the less likely normal path), this holds the
+        * req_lock, and req->tl_requests will typically be on ->transfer_log,
+        * though it may be still empty (never added to the transfer log).
+        *
+        * If called from do_retry(), we do NOT hold the req_lock, but we are
+        * still allowed to unconditionally list_del(&req->tl_requests),
+        * because it will be on a local on-stack list only. */
        list_del_init(&req->tl_requests);
 
+       /* Finally, remove the request from the conflict detection
+        * or, respectively, block_id verification interval tree. */
+       if (!drbd_interval_empty(&req->i)) {
+               struct rb_root *root;
+
+               if (s & RQ_WRITE)
+                       root = &device->write_requests;
+               else
+                       root = &device->read_requests;
+               drbd_remove_request_interval(root, req);
+       } else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0)
+               drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n",
+                       s, (unsigned long long)req->i.sector, req->i.size);
+
        /* if it was a write, we may have to set the corresponding
         * bit(s) out-of-sync first. If it had a local part, we need to
         * release the reference to the activity log. */
@@ -188,19 +219,6 @@ void complete_master_bio(struct drbd_device *device,
 }
 
 
-static void drbd_remove_request_interval(struct rb_root *root,
-                                        struct drbd_request *req)
-{
-       struct drbd_device *device = req->device;
-       struct drbd_interval *i = &req->i;
-
-       drbd_remove_interval(root, i);
-
-       /* Wake up any processes waiting for this request to complete.  */
-       if (i->waiting)
-               wake_up(&device->misc_wait);
-}
-
 /* Helper for __req_mod().
  * Set m->bio to the master bio, if it is fit to be completed,
  * or leave it alone (it is initialized to NULL in __req_mod),
@@ -254,18 +272,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
        ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
        error = PTR_ERR(req->private_bio);
 
-       /* remove the request from the conflict detection
-        * respective block_id verification hash */
-       if (!drbd_interval_empty(&req->i)) {
-               struct rb_root *root;
-
-               if (rw == WRITE)
-                       root = &device->write_requests;
-               else
-                       root = &device->read_requests;
-               drbd_remove_request_interval(root, req);
-       }
-
        /* Before we can signal completion to the upper layers,
         * we may need to close the current transfer log epoch.
         * We are within the request lock, so we can simply compare
@@ -301,9 +307,24 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
                m->error = ok ? 0 : (error ?: -EIO);
                m->bio = req->master_bio;
                req->master_bio = NULL;
+               /* We leave it in the tree, to be able to verify later
+                * write-acks in protocol != C during resync.
+                * But we mark it as "complete", so it won't be counted as
+                * conflict in a multi-primary setup. */
+               req->i.completed = true;
        }
+
+       if (req->i.waiting)
+               wake_up(&device->misc_wait);
+
+       /* Either we are about to complete to upper layers,
+        * or we will restart this request.
+        * In either case, the request object will be destroyed soon,
+        * so better remove it from all lists. */
+       list_del_init(&req->req_pending_master_completion);
 }
 
+/* still holds resource->req_lock */
 static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put)
 {
        struct drbd_device *device = req->device;
@@ -324,12 +345,91 @@ static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_
        return 1;
 }
 
+static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+       if (!connection)
+               return;
+       if (connection->req_next == NULL)
+               connection->req_next = req;
+}
+
+static void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+       if (!connection)
+               return;
+       if (connection->req_next != req)
+               return;
+       list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+               const unsigned s = req->rq_state;
+               if (s & RQ_NET_QUEUED)
+                       break;
+       }
+       if (&req->tl_requests == &connection->transfer_log)
+               req = NULL;
+       connection->req_next = req;
+}
+
+static void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+       if (!connection)
+               return;
+       if (connection->req_ack_pending == NULL)
+               connection->req_ack_pending = req;
+}
+
+static void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+       if (!connection)
+               return;
+       if (connection->req_ack_pending != req)
+               return;
+       list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+               const unsigned s = req->rq_state;
+               if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING))
+                       break;
+       }
+       if (&req->tl_requests == &connection->transfer_log)
+               req = NULL;
+       connection->req_ack_pending = req;
+}
+
+static void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+       if (!connection)
+               return;
+       if (connection->req_not_net_done == NULL)
+               connection->req_not_net_done = req;
+}
+
+static void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+       if (!connection)
+               return;
+       if (connection->req_not_net_done != req)
+               return;
+       list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+               const unsigned s = req->rq_state;
+               if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE))
+                       break;
+       }
+       if (&req->tl_requests == &connection->transfer_log)
+               req = NULL;
+       connection->req_not_net_done = req;
+}
+
 /* I'd like this to be the only place that manipulates
  * req->completion_ref and req->kref. */
 static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
                int clear, int set)
 {
        struct drbd_device *device = req->device;
+       struct drbd_peer_device *peer_device = first_peer_device(device);
        unsigned s = req->rq_state;
        int c_put = 0;
        int k_put = 0;
@@ -356,14 +456,23 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
                atomic_inc(&req->completion_ref);
        }
 
-       if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED))
+       if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) {
                atomic_inc(&req->completion_ref);
+               set_if_null_req_next(peer_device, req);
+       }
 
        if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
                kref_get(&req->kref); /* wait for the DONE */
 
-       if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT))
-               atomic_add(req->i.size >> 9, &device->ap_in_flight);
+       if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
+               /* potentially already completed in the asender thread */
+               if (!(s & RQ_NET_DONE)) {
+                       atomic_add(req->i.size >> 9, &device->ap_in_flight);
+                       set_if_null_req_not_net_done(peer_device, req);
+               }
+               if (s & RQ_NET_PENDING)
+                       set_if_null_req_ack_pending(peer_device, req);
+       }
 
        if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
                atomic_inc(&req->completion_ref);
@@ -386,20 +495,34 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
                        ++k_put;
                else
                        ++c_put;
+               list_del_init(&req->req_pending_local);
        }
 
        if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
                dec_ap_pending(device);
                ++c_put;
+               req->acked_jif = jiffies;
+               advance_conn_req_ack_pending(peer_device, req);
        }
 
-       if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED))
+       if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) {
                ++c_put;
+               advance_conn_req_next(peer_device, req);
+       }
 
-       if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
-               if (req->rq_state & RQ_NET_SENT)
+       if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
+               if (s & RQ_NET_SENT)
                        atomic_sub(req->i.size >> 9, &device->ap_in_flight);
-               ++k_put;
+               if (s & RQ_EXP_BARR_ACK)
+                       ++k_put;
+               req->net_done_jif = jiffies;
+
+               /* in ahead/behind mode, or just in case,
+                * before we finally destroy this request,
+                * the caching pointers must not reference it anymore */
+               advance_conn_req_next(peer_device, req);
+               advance_conn_req_ack_pending(peer_device, req);
+               advance_conn_req_not_net_done(peer_device, req);
        }
 
        /* potentially complete and destroy */
@@ -439,6 +562,19 @@ static void drbd_report_io_error(struct drbd_device *device, struct drbd_request
                        bdevname(device->ldev->backing_bdev, b));
 }
 
+/* Helper for HANDED_OVER_TO_NETWORK.
+ * Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)?
+ * Is it also still "PENDING"?
+ * --> If so, clear PENDING and set NET_OK below.
+ * If it is a protocol A write, but not RQ_NET_PENDING anymore, neg-ack was faster
+ * (and we must not set RQ_NET_OK) */
+static inline bool is_pending_write_protocol_A(struct drbd_request *req)
+{
+       return (req->rq_state &
+                  (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK))
+               == (RQ_WRITE|RQ_NET_PENDING);
+}
+
 /* obviously this could be coded as many single functions
  * instead of one huge switch,
  * or by putting the code directly in the respective locations
@@ -454,7 +590,9 @@ static void drbd_report_io_error(struct drbd_device *device, struct drbd_request
 int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                struct bio_and_error *m)
 {
-       struct drbd_device *device = req->device;
+       struct drbd_device *const device = req->device;
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
        struct net_conf *nc;
        int p, rv = 0;
 
@@ -477,7 +615,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                 * and from w_read_retry_remote */
                D_ASSERT(device, !(req->rq_state & RQ_NET_MASK));
                rcu_read_lock();
-               nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+               nc = rcu_dereference(connection->net_conf);
                p = nc->wire_protocol;
                rcu_read_unlock();
                req->rq_state |=
@@ -549,7 +687,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0);
                mod_rq_state(req, m, 0, RQ_NET_QUEUED);
                req->w.cb = w_send_read_req;
-               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+               drbd_queue_work(&connection->sender_work,
                                &req->w);
                break;
 
@@ -585,23 +723,23 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
                mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK);
                req->w.cb =  w_send_dblock;
-               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+               drbd_queue_work(&connection->sender_work,
                                &req->w);
 
                /* close the epoch, in case it outgrew the limit */
                rcu_read_lock();
-               nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+               nc = rcu_dereference(connection->net_conf);
                p = nc->max_epoch_size;
                rcu_read_unlock();
-               if (first_peer_device(device)->connection->current_tle_writes >= p)
-                       start_new_tl_epoch(first_peer_device(device)->connection);
+               if (connection->current_tle_writes >= p)
+                       start_new_tl_epoch(connection);
 
                break;
 
        case QUEUE_FOR_SEND_OOS:
                mod_rq_state(req, m, 0, RQ_NET_QUEUED);
                req->w.cb =  w_send_out_of_sync;
-               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+               drbd_queue_work(&connection->sender_work,
                                &req->w);
                break;
 
@@ -615,18 +753,16 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
        case HANDED_OVER_TO_NETWORK:
                /* assert something? */
-               if (bio_data_dir(req->master_bio) == WRITE &&
-                   !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) {
+               if (is_pending_write_protocol_A(req))
                        /* this is what is dangerous about protocol A:
                         * pretend it was successfully written on the peer. */
-                       if (req->rq_state & RQ_NET_PENDING)
-                               mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
-                       /* else: neg-ack was faster... */
-                       /* it is still not yet RQ_NET_DONE until the
-                        * corresponding epoch barrier got acked as well,
-                        * so we know what to dirty on connection loss */
-               }
-               mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
+                       mod_rq_state(req, m, RQ_NET_QUEUED|RQ_NET_PENDING,
+                                               RQ_NET_SENT|RQ_NET_OK);
+               else
+                       mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
+               /* It is still not yet RQ_NET_DONE until the
+                * corresponding epoch barrier got acked as well,
+                * so we know what to dirty on connection loss. */
                break;
 
        case OOS_HANDED_TO_NETWORK:
@@ -658,12 +794,13 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
        case WRITE_ACKED_BY_PEER_AND_SIS:
                req->rq_state |= RQ_NET_SIS;
        case WRITE_ACKED_BY_PEER:
-               D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
-               /* protocol C; successfully written on peer.
+               /* Normal operation protocol C: successfully written on peer.
+                * During resync, even in protocol != C,
+                * we requested an explicit write ack anyways.
+                * Which means we cannot even assert anything here.
                 * Nothing more to do here.
                 * We want to keep the tl in place for all protocols, to cater
                 * for volatile write-back caches on lower level devices. */
-
                goto ack_common;
        case RECV_ACKED_BY_PEER:
                D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK);
@@ -671,7 +808,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                 * see also notes above in HANDED_OVER_TO_NETWORK about
                 * protocol != C */
        ack_common:
-               D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
                mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
                break;
 
@@ -714,7 +850,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
                get_ldev(device); /* always succeeds in this call path */
                req->w.cb = w_restart_disk_io;
-               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+               drbd_queue_work(&connection->sender_work,
                                &req->w);
                break;
 
@@ -736,7 +872,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
                        mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING);
                        if (req->w.cb) {
-                               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+                               /* w.cb expected to be w_send_dblock, or w_send_read_req */
+                               drbd_queue_work(&connection->sender_work,
                                                &req->w);
                                rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
                        } /* else: FIXME can this happen? */
@@ -769,7 +906,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                break;
 
        case QUEUE_AS_DRBD_BARRIER:
-               start_new_tl_epoch(first_peer_device(device)->connection);
+               start_new_tl_epoch(connection);
                mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
                break;
        };
@@ -886,6 +1023,9 @@ static void maybe_pull_ahead(struct drbd_device *device)
            connection->agreed_pro_version < 96)
                return;
 
+       if (on_congestion == OC_PULL_AHEAD && device->state.conn == C_AHEAD)
+               return; /* nothing to do ... */
+
        /* If I don't even have good local storage, we can not reasonably try
         * to pull ahead of the peer. We also need the local reference to make
         * sure device->act_log is there.
@@ -1021,6 +1161,7 @@ drbd_submit_req_private_bio(struct drbd_request *req)
         * stable storage, and this is a WRITE, we may not even submit
         * this bio. */
        if (get_ldev(device)) {
+               req->pre_submit_jif = jiffies;
                if (drbd_insert_fault(device,
                                      rw == WRITE ? DRBD_FAULT_DT_WR
                                    : rw == READ  ? DRBD_FAULT_DT_RD
@@ -1035,10 +1176,14 @@ drbd_submit_req_private_bio(struct drbd_request *req)
 
 static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req)
 {
-       spin_lock(&device->submit.lock);
+       spin_lock_irq(&device->resource->req_lock);
        list_add_tail(&req->tl_requests, &device->submit.writes);
-       spin_unlock(&device->submit.lock);
+       list_add_tail(&req->req_pending_master_completion,
+                       &device->pending_master_completion[1 /* WRITE */]);
+       spin_unlock_irq(&device->resource->req_lock);
        queue_work(device->submit.wq, &device->submit.worker);
+       /* do_submit() may sleep internally on al_wait, too */
+       wake_up(&device->al_wait);
 }
 
 /* returns the new drbd_request pointer, if the caller is expected to
@@ -1047,7 +1192,7 @@ static void drbd_queue_write(struct drbd_device *device, struct drbd_request *re
  * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
  */
 static struct drbd_request *
-drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_time)
+drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
 {
        const int rw = bio_data_dir(bio);
        struct drbd_request *req;
@@ -1062,7 +1207,7 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
                bio_endio(bio, -ENOMEM);
                return ERR_PTR(-ENOMEM);
        }
-       req->start_time = start_time;
+       req->start_jif = start_jif;
 
        if (!get_ldev(device)) {
                bio_put(req->private_bio);
@@ -1075,10 +1220,12 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
        if (rw == WRITE && req->private_bio && req->i.size
        && !test_bit(AL_SUSPENDED, &device->flags)) {
                if (!drbd_al_begin_io_fastpath(device, &req->i)) {
+                       atomic_inc(&device->ap_actlog_cnt);
                        drbd_queue_write(device, req);
                        return NULL;
                }
                req->rq_state |= RQ_IN_ACT_LOG;
+               req->in_actlog_jif = jiffies;
        }
 
        return req;
@@ -1086,11 +1233,13 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
 
 static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req)
 {
+       struct drbd_resource *resource = device->resource;
        const int rw = bio_rw(req->master_bio);
        struct bio_and_error m = { NULL, };
        bool no_remote = false;
+       bool submit_private_bio = false;
 
-       spin_lock_irq(&device->resource->req_lock);
+       spin_lock_irq(&resource->req_lock);
        if (rw == WRITE) {
                /* This may temporarily give up the req_lock,
                 * but will re-aquire it before it returns here.
@@ -1148,13 +1297,18 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
                        no_remote = true;
        }
 
+       /* If it took the fast path in drbd_request_prepare, add it here.
+        * The slow path has added it already. */
+       if (list_empty(&req->req_pending_master_completion))
+               list_add_tail(&req->req_pending_master_completion,
+                       &device->pending_master_completion[rw == WRITE]);
        if (req->private_bio) {
                /* needs to be marked within the same spinlock */
+               list_add_tail(&req->req_pending_local,
+                       &device->pending_completion[rw == WRITE]);
                _req_mod(req, TO_BE_SUBMITTED);
                /* but we need to give up the spinlock to submit */
-               spin_unlock_irq(&device->resource->req_lock);
-               drbd_submit_req_private_bio(req);
-               spin_lock_irq(&device->resource->req_lock);
+               submit_private_bio = true;
        } else if (no_remote) {
 nodata:
                if (__ratelimit(&drbd_ratelimit_state))
@@ -1167,15 +1321,23 @@ nodata:
 out:
        if (drbd_req_put_completion_ref(req, &m, 1))
                kref_put(&req->kref, drbd_req_destroy);
-       spin_unlock_irq(&device->resource->req_lock);
-
+       spin_unlock_irq(&resource->req_lock);
+
+       /* Even though above is a kref_put(), this is safe.
+        * As long as we still need to submit our private bio,
+        * we hold a completion ref, and the request cannot disappear.
+        * If however this request did not even have a private bio to submit
+        * (e.g. remote read), req may already be invalid now.
+        * That's why we cannot check on req->private_bio. */
+       if (submit_private_bio)
+               drbd_submit_req_private_bio(req);
        if (m.bio)
                complete_master_bio(device, &m);
 }
 
-void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_time)
+void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
 {
-       struct drbd_request *req = drbd_request_prepare(device, bio, start_time);
+       struct drbd_request *req = drbd_request_prepare(device, bio, start_jif);
        if (IS_ERR_OR_NULL(req))
                return;
        drbd_send_and_submit(device, req);
@@ -1194,6 +1356,8 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom
                                continue;
 
                        req->rq_state |= RQ_IN_ACT_LOG;
+                       req->in_actlog_jif = jiffies;
+                       atomic_dec(&device->ap_actlog_cnt);
                }
 
                list_del_init(&req->tl_requests);
@@ -1203,7 +1367,8 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom
 
 static bool prepare_al_transaction_nonblock(struct drbd_device *device,
                                            struct list_head *incoming,
-                                           struct list_head *pending)
+                                           struct list_head *pending,
+                                           struct list_head *later)
 {
        struct drbd_request *req, *tmp;
        int wake = 0;
@@ -1212,45 +1377,105 @@ static bool prepare_al_transaction_nonblock(struct drbd_device *device,
        spin_lock_irq(&device->al_lock);
        list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
                err = drbd_al_begin_io_nonblock(device, &req->i);
+               if (err == -ENOBUFS)
+                       break;
                if (err == -EBUSY)
                        wake = 1;
                if (err)
-                       continue;
-               req->rq_state |= RQ_IN_ACT_LOG;
-               list_move_tail(&req->tl_requests, pending);
+                       list_move_tail(&req->tl_requests, later);
+               else
+                       list_move_tail(&req->tl_requests, pending);
        }
        spin_unlock_irq(&device->al_lock);
        if (wake)
                wake_up(&device->al_wait);
-
        return !list_empty(pending);
 }
 
+void send_and_submit_pending(struct drbd_device *device, struct list_head *pending)
+{
+       struct drbd_request *req, *tmp;
+
+       list_for_each_entry_safe(req, tmp, pending, tl_requests) {
+               req->rq_state |= RQ_IN_ACT_LOG;
+               req->in_actlog_jif = jiffies;
+               atomic_dec(&device->ap_actlog_cnt);
+               list_del_init(&req->tl_requests);
+               drbd_send_and_submit(device, req);
+       }
+}
+
 void do_submit(struct work_struct *ws)
 {
        struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker);
-       LIST_HEAD(incoming);
-       LIST_HEAD(pending);
-       struct drbd_request *req, *tmp;
+       LIST_HEAD(incoming);    /* from drbd_make_request() */
+       LIST_HEAD(pending);     /* to be submitted after next AL-transaction commit */
+       LIST_HEAD(busy);        /* blocked by resync requests */
+
+       /* grab new incoming requests */
+       spin_lock_irq(&device->resource->req_lock);
+       list_splice_tail_init(&device->submit.writes, &incoming);
+       spin_unlock_irq(&device->resource->req_lock);
 
        for (;;) {
-               spin_lock(&device->submit.lock);
-               list_splice_tail_init(&device->submit.writes, &incoming);
-               spin_unlock(&device->submit.lock);
+               DEFINE_WAIT(wait);
 
+               /* move used-to-be-busy back to front of incoming */
+               list_splice_init(&busy, &incoming);
                submit_fast_path(device, &incoming);
                if (list_empty(&incoming))
                        break;
 
-skip_fast_path:
-               wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending));
-               /* Maybe more was queued, while we prepared the transaction?
-                * Try to stuff them into this transaction as well.
-                * Be strictly non-blocking here, no wait_event, we already
-                * have something to commit.
-                * Stop if we don't make any more progres.
-                */
                for (;;) {
+                       prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE);
+
+                       list_splice_init(&busy, &incoming);
+                       prepare_al_transaction_nonblock(device, &incoming, &pending, &busy);
+                       if (!list_empty(&pending))
+                               break;
+
+                       schedule();
+
+                       /* If all currently "hot" activity log extents are kept busy by
+                        * incoming requests, we still must not totally starve new
+                        * requests to "cold" extents.
+                        * Something left on &incoming means there had not been
+                        * enough update slots available, and the activity log
+                        * has been marked as "starving".
+                        *
+                        * Try again now, without looking for new requests,
+                        * effectively blocking all new requests until we made
+                        * at least _some_ progress with what we currently have.
+                        */
+                       if (!list_empty(&incoming))
+                               continue;
+
+                       /* Nothing moved to pending, but nothing left
+                        * on incoming: all moved to busy!
+                        * Grab new and iterate. */
+                       spin_lock_irq(&device->resource->req_lock);
+                       list_splice_tail_init(&device->submit.writes, &incoming);
+                       spin_unlock_irq(&device->resource->req_lock);
+               }
+               finish_wait(&device->al_wait, &wait);
+
+               /* If the transaction was full, before all incoming requests
+                * had been processed, skip ahead to commit, and iterate
+                * without splicing in more incoming requests from upper layers.
+                *
+                * Else, if all incoming have been processed,
+                * they have become either "pending" (to be submitted after
+                * next transaction commit) or "busy" (blocked by resync).
+                *
+                * Maybe more was queued, while we prepared the transaction?
+                * Try to stuff those into this transaction as well.
+                * Be strictly non-blocking here,
+                * we already have something to commit.
+                *
+                * Commit if we don't make any more progress.
+                */
+
+               while (list_empty(&incoming)) {
                        LIST_HEAD(more_pending);
                        LIST_HEAD(more_incoming);
                        bool made_progress;
@@ -1260,55 +1485,32 @@ skip_fast_path:
                        if (list_empty(&device->submit.writes))
                                break;
 
-                       spin_lock(&device->submit.lock);
+                       spin_lock_irq(&device->resource->req_lock);
                        list_splice_tail_init(&device->submit.writes, &more_incoming);
-                       spin_unlock(&device->submit.lock);
+                       spin_unlock_irq(&device->resource->req_lock);
 
                        if (list_empty(&more_incoming))
                                break;
 
-                       made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending);
+                       made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy);
 
                        list_splice_tail_init(&more_pending, &pending);
                        list_splice_tail_init(&more_incoming, &incoming);
-
                        if (!made_progress)
                                break;
                }
-               drbd_al_begin_io_commit(device, false);
-
-               list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
-                       list_del_init(&req->tl_requests);
-                       drbd_send_and_submit(device, req);
-               }
 
-               /* If all currently hot activity log extents are kept busy by
-                * incoming requests, we still must not totally starve new
-                * requests to cold extents. In that case, prepare one request
-                * in blocking mode. */
-               list_for_each_entry_safe(req, tmp, &incoming, tl_requests) {
-                       list_del_init(&req->tl_requests);
-                       req->rq_state |= RQ_IN_ACT_LOG;
-                       if (!drbd_al_begin_io_prepare(device, &req->i)) {
-                               /* Corresponding extent was hot after all? */
-                               drbd_send_and_submit(device, req);
-                       } else {
-                               /* Found a request to a cold extent.
-                                * Put on "pending" list,
-                                * and try to cumulate with more. */
-                               list_add(&req->tl_requests, &pending);
-                               goto skip_fast_path;
-                       }
-               }
+               drbd_al_begin_io_commit(device);
+               send_and_submit_pending(device, &pending);
        }
 }
 
 void drbd_make_request(struct request_queue *q, struct bio *bio)
 {
        struct drbd_device *device = (struct drbd_device *) q->queuedata;
-       unsigned long start_time;
+       unsigned long start_jif;
 
-       start_time = jiffies;
+       start_jif = jiffies;
 
        /*
         * what we "blindly" assume:
@@ -1316,7 +1518,7 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
        D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512));
 
        inc_ap_bio(device);
-       __drbd_make_request(device, bio, start_time);
+       __drbd_make_request(device, bio, start_jif);
 }
 
 /* This is called by bio_add_page().
@@ -1353,36 +1555,13 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
        return limit;
 }
 
-static void find_oldest_requests(
-               struct drbd_connection *connection,
-               struct drbd_device *device,
-               struct drbd_request **oldest_req_waiting_for_peer,
-               struct drbd_request **oldest_req_waiting_for_disk)
-{
-       struct drbd_request *r;
-       *oldest_req_waiting_for_peer = NULL;
-       *oldest_req_waiting_for_disk = NULL;
-       list_for_each_entry(r, &connection->transfer_log, tl_requests) {
-               const unsigned s = r->rq_state;
-               if (!*oldest_req_waiting_for_peer
-               && ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE)))
-                       *oldest_req_waiting_for_peer = r;
-
-               if (!*oldest_req_waiting_for_disk
-               && (s & RQ_LOCAL_PENDING) && r->device == device)
-                       *oldest_req_waiting_for_disk = r;
-
-               if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk)
-                       break;
-       }
-}
-
 void request_timer_fn(unsigned long data)
 {
        struct drbd_device *device = (struct drbd_device *) data;
        struct drbd_connection *connection = first_peer_device(device)->connection;
-       struct drbd_request *req_disk, *req_peer; /* oldest request */
+       struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */
        struct net_conf *nc;
+       unsigned long oldest_submit_jif;
        unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
        unsigned long now;
 
@@ -1403,14 +1582,31 @@ void request_timer_fn(unsigned long data)
                return; /* Recurring timer stopped */
 
        now = jiffies;
+       nt = now + et;
 
        spin_lock_irq(&device->resource->req_lock);
-       find_oldest_requests(connection, device, &req_peer, &req_disk);
-       if (req_peer == NULL && req_disk == NULL) {
-               spin_unlock_irq(&device->resource->req_lock);
-               mod_timer(&device->request_timer, now + et);
-               return;
-       }
+       req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
+       req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
+       req_peer = connection->req_not_net_done;
+       /* maybe the oldest request waiting for the peer is in fact still
+        * blocking in tcp sendmsg */
+       if (!req_peer && connection->req_next && connection->req_next->pre_send_jif)
+               req_peer = connection->req_next;
+
+       /* evaluate the oldest peer request only in one timer! */
+       if (req_peer && req_peer->device != device)
+               req_peer = NULL;
+
+       /* do we have something to evaluate? */
+       if (req_peer == NULL && req_write == NULL && req_read == NULL)
+               goto out;
+
+       oldest_submit_jif =
+               (req_write && req_read)
+               ? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif)
+                 ? req_write->pre_submit_jif : req_read->pre_submit_jif )
+               : req_write ? req_write->pre_submit_jif
+               : req_read ? req_read->pre_submit_jif : now;
 
        /* The request is considered timed out, if
         * - we have some effective timeout from the configuration,
@@ -1429,13 +1625,13 @@ void request_timer_fn(unsigned long data)
         * to expire twice (worst case) to become effective. Good enough.
         */
        if (ent && req_peer &&
-                time_after(now, req_peer->start_time + ent) &&
+                time_after(now, req_peer->pre_send_jif + ent) &&
                !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
                drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
                _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
        }
-       if (dt && req_disk &&
-                time_after(now, req_disk->start_time + dt) &&
+       if (dt && oldest_submit_jif != now &&
+                time_after(now, oldest_submit_jif + dt) &&
                !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
                drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
                __drbd_chk_io_error(device, DRBD_FORCE_DETACH);
@@ -1443,11 +1639,12 @@ void request_timer_fn(unsigned long data)
 
        /* Reschedule timer for the nearest not already expired timeout.
         * Fallback to now + min(effective network timeout, disk timeout). */
-       ent = (ent && req_peer && time_before(now, req_peer->start_time + ent))
-               ? req_peer->start_time + ent : now + et;
-       dt = (dt && req_disk && time_before(now, req_disk->start_time + dt))
-               ? req_disk->start_time + dt : now + et;
+       ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent))
+               ? req_peer->pre_send_jif + ent : now + et;
+       dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt))
+               ? oldest_submit_jif + dt : now + et;
        nt = time_before(ent, dt) ? ent : dt;
+out:
        spin_unlock_irq(&connection->resource->req_lock);
        mod_timer(&device->request_timer, nt);
 }
index 8566cd5866b4e2388cdb441439f25eecf6071443..9f6a04080e9f76aadfdfedf8d0e1cb408dbcba2a 100644 (file)
@@ -288,6 +288,7 @@ extern void complete_master_bio(struct drbd_device *device,
 extern void request_timer_fn(unsigned long data);
 extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
 extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
+extern void tl_abort_disk_io(struct drbd_device *device);
 
 /* this is in drbd_main.c */
 extern void drbd_restart_request(struct drbd_request *req);
index a5d8aae00e04c9515d4a684caaeabb079ada6dc5..c35c0f001bb74333887d0e47c23a0598cd203bfe 100644 (file)
@@ -410,7 +410,7 @@ _drbd_request_state(struct drbd_device *device, union drbd_state mask,
        return rv;
 }
 
-static void print_st(struct drbd_device *device, char *name, union drbd_state ns)
+static void print_st(struct drbd_device *device, const char *name, union drbd_state ns)
 {
        drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n",
            name,
@@ -952,11 +952,12 @@ enum drbd_state_rv
 __drbd_set_state(struct drbd_device *device, union drbd_state ns,
                 enum chg_state_flags flags, struct completion *done)
 {
+       struct drbd_peer_device *peer_device = first_peer_device(device);
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
        union drbd_state os;
        enum drbd_state_rv rv = SS_SUCCESS;
        enum sanitize_state_warnings ssw;
        struct after_state_chg_work *ascw;
-       bool did_remote, should_do_remote;
 
        os = drbd_read_state(device);
 
@@ -978,9 +979,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
                           this happen...*/
 
                        if (is_valid_state(device, os) == rv)
-                               rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+                               rv = is_valid_soft_transition(os, ns, connection);
                } else
-                       rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+                       rv = is_valid_soft_transition(os, ns, connection);
        }
 
        if (rv < SS_SUCCESS) {
@@ -997,7 +998,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
           sanitize_state(). Only display it here if we where not called from
           _conn_request_state() */
        if (!(flags & CS_DC_SUSP))
-               conn_pr_state_change(first_peer_device(device)->connection, os, ns,
+               conn_pr_state_change(connection, os, ns,
                                     (flags & ~CS_DC_MASK) | CS_DC_SUSP);
 
        /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
@@ -1008,28 +1009,35 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
            (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
                atomic_inc(&device->local_cnt);
 
-       did_remote = drbd_should_do_remote(device->state);
+       if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
+               clear_bit(RS_DONE, &device->flags);
+
+       /* changes to local_cnt and device flags should be visible before
+        * changes to state, which again should be visible before anything else
+        * depending on that change happens. */
+       smp_wmb();
        device->state.i = ns.i;
-       should_do_remote = drbd_should_do_remote(device->state);
        device->resource->susp = ns.susp;
        device->resource->susp_nod = ns.susp_nod;
        device->resource->susp_fen = ns.susp_fen;
+       smp_wmb();
 
        /* put replicated vs not-replicated requests in seperate epochs */
-       if (did_remote != should_do_remote)
-               start_new_tl_epoch(first_peer_device(device)->connection);
+       if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
+           drbd_should_do_remote((union drbd_dev_state)ns.i))
+               start_new_tl_epoch(connection);
 
        if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
                drbd_print_uuids(device, "attached to UUIDs");
 
        /* Wake up role changes, that were delayed because of connection establishing */
        if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS &&
-           no_peer_wf_report_params(first_peer_device(device)->connection))
-               clear_bit(STATE_SENT, &first_peer_device(device)->connection->flags);
+           no_peer_wf_report_params(connection))
+               clear_bit(STATE_SENT, &connection->flags);
 
        wake_up(&device->misc_wait);
        wake_up(&device->state_wait);
-       wake_up(&first_peer_device(device)->connection->ping_wait);
+       wake_up(&connection->ping_wait);
 
        /* Aborted verify run, or we reached the stop sector.
         * Log the last position, unless end-of-device. */
@@ -1118,21 +1126,21 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 
        /* Receiver should clean up itself */
        if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
-               drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver);
+               drbd_thread_stop_nowait(&connection->receiver);
 
        /* Now the receiver finished cleaning up itself, it should die */
        if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
-               drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver);
+               drbd_thread_stop_nowait(&connection->receiver);
 
        /* Upon network failure, we need to restart the receiver. */
        if (os.conn > C_WF_CONNECTION &&
            ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
-               drbd_thread_restart_nowait(&first_peer_device(device)->connection->receiver);
+               drbd_thread_restart_nowait(&connection->receiver);
 
        /* Resume AL writing if we get a connection */
        if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
                drbd_resume_al(device);
-               first_peer_device(device)->connection->connect_cnt++;
+               connection->connect_cnt++;
        }
 
        /* remember last attach time so request_timer_fn() won't
@@ -1150,7 +1158,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
                ascw->w.cb = w_after_state_ch;
                ascw->device = device;
                ascw->done = done;
-               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+               drbd_queue_work(&connection->sender_work,
                                &ascw->w);
        } else {
                drbd_err(device, "Could not kmalloc an ascw\n");
@@ -1222,13 +1230,16 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
                           union drbd_state ns, enum chg_state_flags flags)
 {
        struct drbd_resource *resource = device->resource;
+       struct drbd_peer_device *peer_device = first_peer_device(device);
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
        struct sib_info sib;
 
        sib.sib_reason = SIB_STATE_CHANGE;
        sib.os = os;
        sib.ns = ns;
 
-       if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
+       if ((os.disk != D_UP_TO_DATE || os.pdsk != D_UP_TO_DATE)
+       &&  (ns.disk == D_UP_TO_DATE && ns.pdsk == D_UP_TO_DATE)) {
                clear_bit(CRASHED_PRIMARY, &device->flags);
                if (device->p_uuid)
                        device->p_uuid[UI_FLAGS] &= ~((u64)2);
@@ -1245,7 +1256,6 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
           state change. This function might sleep */
 
        if (ns.susp_nod) {
-               struct drbd_connection *connection = first_peer_device(device)->connection;
                enum drbd_req_event what = NOTHING;
 
                spin_lock_irq(&device->resource->req_lock);
@@ -1267,8 +1277,6 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
        }
 
        if (ns.susp_fen) {
-               struct drbd_connection *connection = first_peer_device(device)->connection;
-
                spin_lock_irq(&device->resource->req_lock);
                if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) {
                        /* case2: The connection was established again: */
@@ -1294,8 +1302,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
         * which is unexpected. */
        if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
            (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
-           first_peer_device(device)->connection->agreed_pro_version >= 96 && get_ldev(device)) {
-               drbd_gen_and_send_sync_uuid(first_peer_device(device));
+           connection->agreed_pro_version >= 96 && get_ldev(device)) {
+               drbd_gen_and_send_sync_uuid(peer_device);
                put_ldev(device);
        }
 
@@ -1309,8 +1317,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
                atomic_set(&device->rs_pending_cnt, 0);
                drbd_rs_cancel_all(device);
 
-               drbd_send_uuids(first_peer_device(device));
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_uuids(peer_device);
+               drbd_send_state(peer_device, ns);
        }
        /* No point in queuing send_bitmap if we don't have a connection
         * anymore, so check also the _current_ state, not only the new state
@@ -1335,7 +1343,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
                                        set_bit(NEW_CUR_UUID, &device->flags);
                                } else {
                                        drbd_uuid_new_current(device);
-                                       drbd_send_uuids(first_peer_device(device));
+                                       drbd_send_uuids(peer_device);
                                }
                        }
                        put_ldev(device);
@@ -1346,7 +1354,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
                if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
                    device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
                        drbd_uuid_new_current(device);
-                       drbd_send_uuids(first_peer_device(device));
+                       drbd_send_uuids(peer_device);
                }
                /* D_DISKLESS Peer becomes secondary */
                if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
@@ -1373,16 +1381,16 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
        /* Last part of the attaching process ... */
        if (ns.conn >= C_CONNECTED &&
            os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
-               drbd_send_sizes(first_peer_device(device), 0, 0);  /* to start sync... */
-               drbd_send_uuids(first_peer_device(device));
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_sizes(peer_device, 0, 0);  /* to start sync... */
+               drbd_send_uuids(peer_device);
+               drbd_send_state(peer_device, ns);
        }
 
        /* We want to pause/continue resync, tell peer. */
        if (ns.conn >= C_CONNECTED &&
             ((os.aftr_isp != ns.aftr_isp) ||
              (os.user_isp != ns.user_isp)))
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_state(peer_device, ns);
 
        /* In case one of the isp bits got set, suspend other devices. */
        if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
@@ -1392,10 +1400,10 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
        /* Make sure the peer gets informed about eventual state
           changes (ISP bits) while we were in WFReportParams. */
        if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_state(peer_device, ns);
 
        if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_state(peer_device, ns);
 
        /* We are in the progress to start a full sync... */
        if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
@@ -1449,7 +1457,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
                                        drbd_disk_str(device->state.disk));
 
                        if (ns.conn >= C_CONNECTED)
-                               drbd_send_state(first_peer_device(device), ns);
+                               drbd_send_state(peer_device, ns);
 
                        drbd_rs_cancel_all(device);
 
@@ -1473,7 +1481,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
                                 drbd_disk_str(device->state.disk));
 
                if (ns.conn >= C_CONNECTED)
-                       drbd_send_state(first_peer_device(device), ns);
+                       drbd_send_state(peer_device, ns);
                /* corresponding get_ldev in __drbd_set_state
                 * this may finally trigger drbd_ldev_destroy. */
                put_ldev(device);
@@ -1481,7 +1489,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 
        /* Notify peer that I had a local IO error, and did not detached.. */
        if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_state(peer_device, ns);
 
        /* Disks got bigger while they were detached */
        if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
@@ -1499,14 +1507,14 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
        /* sync target done with resync.  Explicitly notify peer, even though
         * it should (at least for non-empty resyncs) already know itself. */
        if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_state(peer_device, ns);
 
        /* Verify finished, or reached stop sector.  Peer did not know about
         * the stop sector, and we may even have changed the stop sector during
         * verify to interrupt/stop early.  Send the new state. */
        if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
        && verify_can_do_stop_sector(device))
-               drbd_send_state(first_peer_device(device), ns);
+               drbd_send_state(peer_device, ns);
 
        /* This triggers bitmap writeout of potentially still unwritten pages
         * if the resync finished cleanly, or aborted because of peer disk
@@ -1563,7 +1571,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
                old_conf = connection->net_conf;
                connection->my_addr_len = 0;
                connection->peer_addr_len = 0;
-               rcu_assign_pointer(connection->net_conf, NULL);
+               RCU_INIT_POINTER(connection->net_conf, NULL);
                conn_free_crypto(connection);
                mutex_unlock(&connection->resource->conf_update);
 
@@ -1599,7 +1607,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
        return 0;
 }
 
-void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf)
+static void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf)
 {
        enum chg_state_flags flags = ~0;
        struct drbd_peer_device *peer_device;
@@ -1688,7 +1696,7 @@ conn_is_valid_transition(struct drbd_connection *connection, union drbd_state ma
        return rv;
 }
 
-void
+static void
 conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
               union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags)
 {
index d8f57b6305cd6f84ec0be24309512fe5b25a8e92..50776b36282868415d7b48ebed66fbf1f5dec09f 100644 (file)
@@ -67,13 +67,10 @@ rwlock_t global_state_lock;
  */
 void drbd_md_io_complete(struct bio *bio, int error)
 {
-       struct drbd_md_io *md_io;
        struct drbd_device *device;
 
-       md_io = (struct drbd_md_io *)bio->bi_private;
-       device = container_of(md_io, struct drbd_device, md_io);
-
-       md_io->error = error;
+       device = bio->bi_private;
+       device->md_io.error = error;
 
        /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
         * to timeout on the lower level device, and eventually detach from it.
@@ -87,7 +84,7 @@ void drbd_md_io_complete(struct bio *bio, int error)
         * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
         */
        drbd_md_put_buffer(device);
-       md_io->done = 1;
+       device->md_io.done = 1;
        wake_up(&device->misc_wait);
        bio_put(bio);
        if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
@@ -135,6 +132,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
        i = peer_req->i;
        do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
        block_id = peer_req->block_id;
+       peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
 
        spin_lock_irqsave(&device->resource->req_lock, flags);
        device->writ_cnt += peer_req->i.size >> 9;
@@ -398,9 +396,6 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
        if (!get_ldev(device))
                return -EIO;
 
-       if (drbd_rs_should_slow_down(device, sector))
-               goto defer;
-
        /* GFP_TRY, because if there is no memory available right now, this may
         * be rescheduled for later. It is "only" background resync, after all. */
        peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
@@ -410,7 +405,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
 
        peer_req->w.cb = w_e_send_csum;
        spin_lock_irq(&device->resource->req_lock);
-       list_add(&peer_req->w.list, &device->read_ee);
+       list_add_tail(&peer_req->w.list, &device->read_ee);
        spin_unlock_irq(&device->resource->req_lock);
 
        atomic_add(size >> 9, &device->rs_sect_ev);
@@ -452,9 +447,9 @@ void resync_timer_fn(unsigned long data)
 {
        struct drbd_device *device = (struct drbd_device *) data;
 
-       if (list_empty(&device->resync_work.list))
-               drbd_queue_work(&first_peer_device(device)->connection->sender_work,
-                               &device->resync_work);
+       drbd_queue_work_if_unqueued(
+               &first_peer_device(device)->connection->sender_work,
+               &device->resync_work);
 }
 
 static void fifo_set(struct fifo_buffer *fb, int value)
@@ -504,9 +499,9 @@ struct fifo_buffer *fifo_alloc(int fifo_size)
 static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
 {
        struct disk_conf *dc;
-       unsigned int want;     /* The number of sectors we want in the proxy */
+       unsigned int want;     /* The number of sectors we want in-flight */
        int req_sect; /* Number of sectors to request in this turn */
-       int correction; /* Number of sectors more we need in the proxy*/
+       int correction; /* Number of sectors more we need in-flight */
        int cps; /* correction per invocation of drbd_rs_controller() */
        int steps; /* Number of time steps to plan ahead */
        int curr_corr;
@@ -577,20 +572,27 @@ static int drbd_rs_number_requests(struct drbd_device *device)
         * potentially causing a distributed deadlock on congestion during
         * online-verify or (checksum-based) resync, if max-buffers,
         * socket buffer sizes and resync rate settings are mis-configured. */
-       if (mxb - device->rs_in_flight < number)
-               number = mxb - device->rs_in_flight;
+
+       /* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k),
+        * mxb (as used here, and in drbd_alloc_pages on the peer) is
+        * "number of pages" (typically also 4k),
+        * but "rs_in_flight" is in "sectors" (512 Byte). */
+       if (mxb - device->rs_in_flight/8 < number)
+               number = mxb - device->rs_in_flight/8;
 
        return number;
 }
 
-static int make_resync_request(struct drbd_device *device, int cancel)
+static int make_resync_request(struct drbd_device *const device, int cancel)
 {
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
        unsigned long bit;
        sector_t sector;
        const sector_t capacity = drbd_get_capacity(device->this_bdev);
        int max_bio_size;
        int number, rollback_i, size;
-       int align, queued, sndbuf;
+       int align, requeue = 0;
        int i = 0;
 
        if (unlikely(cancel))
@@ -617,17 +619,22 @@ static int make_resync_request(struct drbd_device *device, int cancel)
                goto requeue;
 
        for (i = 0; i < number; i++) {
-               /* Stop generating RS requests, when half of the send buffer is filled */
-               mutex_lock(&first_peer_device(device)->connection->data.mutex);
-               if (first_peer_device(device)->connection->data.socket) {
-                       queued = first_peer_device(device)->connection->data.socket->sk->sk_wmem_queued;
-                       sndbuf = first_peer_device(device)->connection->data.socket->sk->sk_sndbuf;
-               } else {
-                       queued = 1;
-                       sndbuf = 0;
-               }
-               mutex_unlock(&first_peer_device(device)->connection->data.mutex);
-               if (queued > sndbuf / 2)
+               /* Stop generating RS requests when half of the send buffer is filled,
+                * but notify TCP that we'd like to have more space. */
+               mutex_lock(&connection->data.mutex);
+               if (connection->data.socket) {
+                       struct sock *sk = connection->data.socket->sk;
+                       int queued = sk->sk_wmem_queued;
+                       int sndbuf = sk->sk_sndbuf;
+                       if (queued > sndbuf / 2) {
+                               requeue = 1;
+                               if (sk->sk_socket)
+                                       set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+                       }
+               } else
+                       requeue = 1;
+               mutex_unlock(&connection->data.mutex);
+               if (requeue)
                        goto requeue;
 
 next_sector:
@@ -642,8 +649,7 @@ next_sector:
 
                sector = BM_BIT_TO_SECT(bit);
 
-               if (drbd_rs_should_slow_down(device, sector) ||
-                   drbd_try_rs_begin_io(device, sector)) {
+               if (drbd_try_rs_begin_io(device, sector)) {
                        device->bm_resync_fo = bit;
                        goto requeue;
                }
@@ -696,9 +702,9 @@ next_sector:
                /* adjust very last sectors, in case we are oddly sized */
                if (sector + (size>>9) > capacity)
                        size = (capacity-sector)<<9;
-               if (first_peer_device(device)->connection->agreed_pro_version >= 89 &&
-                   first_peer_device(device)->connection->csums_tfm) {
-                       switch (read_for_csum(first_peer_device(device), sector, size)) {
+
+               if (device->use_csums) {
+                       switch (read_for_csum(peer_device, sector, size)) {
                        case -EIO: /* Disk failure */
                                put_ldev(device);
                                return -EIO;
@@ -717,7 +723,7 @@ next_sector:
                        int err;
 
                        inc_rs_pending(device);
-                       err = drbd_send_drequest(first_peer_device(device), P_RS_DATA_REQUEST,
+                       err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST,
                                                 sector, size, ID_SYNCER);
                        if (err) {
                                drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
@@ -774,8 +780,7 @@ static int make_ov_request(struct drbd_device *device, int cancel)
 
                size = BM_BLOCK_SIZE;
 
-               if (drbd_rs_should_slow_down(device, sector) ||
-                   drbd_try_rs_begin_io(device, sector)) {
+               if (drbd_try_rs_begin_io(device, sector)) {
                        device->ov_position = sector;
                        goto requeue;
                }
@@ -911,7 +916,7 @@ int drbd_resync_finished(struct drbd_device *device)
                if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
                        khelper_cmd = "after-resync-target";
 
-               if (first_peer_device(device)->connection->csums_tfm && device->rs_total) {
+               if (device->use_csums && device->rs_total) {
                        const unsigned long s = device->rs_same_csum;
                        const unsigned long t = device->rs_total;
                        const int ratio =
@@ -1351,13 +1356,15 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel)
 {
        struct drbd_request *req = container_of(w, struct drbd_request, w);
        struct drbd_device *device = req->device;
-       struct drbd_connection *connection = first_peer_device(device)->connection;
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *const connection = peer_device->connection;
        int err;
 
        if (unlikely(cancel)) {
                req_mod(req, SEND_CANCELED);
                return 0;
        }
+       req->pre_send_jif = jiffies;
 
        /* this time, no connection->send.current_epoch_writes++;
         * If it was sent, it was the closing barrier for the last
@@ -1365,7 +1372,7 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel)
         * No more barriers will be sent, until we leave AHEAD mode again. */
        maybe_send_barrier(connection, req->epoch);
 
-       err = drbd_send_out_of_sync(first_peer_device(device), req);
+       err = drbd_send_out_of_sync(peer_device, req);
        req_mod(req, OOS_HANDED_TO_NETWORK);
 
        return err;
@@ -1380,19 +1387,21 @@ int w_send_dblock(struct drbd_work *w, int cancel)
 {
        struct drbd_request *req = container_of(w, struct drbd_request, w);
        struct drbd_device *device = req->device;
-       struct drbd_connection *connection = first_peer_device(device)->connection;
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *connection = peer_device->connection;
        int err;
 
        if (unlikely(cancel)) {
                req_mod(req, SEND_CANCELED);
                return 0;
        }
+       req->pre_send_jif = jiffies;
 
        re_init_if_first_write(connection, req->epoch);
        maybe_send_barrier(connection, req->epoch);
        connection->send.current_epoch_writes++;
 
-       err = drbd_send_dblock(first_peer_device(device), req);
+       err = drbd_send_dblock(peer_device, req);
        req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
 
        return err;
@@ -1407,19 +1416,21 @@ int w_send_read_req(struct drbd_work *w, int cancel)
 {
        struct drbd_request *req = container_of(w, struct drbd_request, w);
        struct drbd_device *device = req->device;
-       struct drbd_connection *connection = first_peer_device(device)->connection;
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *connection = peer_device->connection;
        int err;
 
        if (unlikely(cancel)) {
                req_mod(req, SEND_CANCELED);
                return 0;
        }
+       req->pre_send_jif = jiffies;
 
        /* Even read requests may close a write epoch,
         * if there was any yet. */
        maybe_send_barrier(connection, req->epoch);
 
-       err = drbd_send_drequest(first_peer_device(device), P_DATA_REQUEST, req->i.sector, req->i.size,
+       err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size,
                                 (unsigned long)req);
 
        req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
@@ -1433,7 +1444,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel)
        struct drbd_device *device = req->device;
 
        if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
-               drbd_al_begin_io(device, &req->i, false);
+               drbd_al_begin_io(device, &req->i);
 
        drbd_req_make_private_bio(req, req->master_bio);
        req->private_bio->bi_bdev = device->ldev->backing_bdev;
@@ -1601,26 +1612,32 @@ void drbd_rs_controller_reset(struct drbd_device *device)
 void start_resync_timer_fn(unsigned long data)
 {
        struct drbd_device *device = (struct drbd_device *) data;
-
-       drbd_queue_work(&first_peer_device(device)->connection->sender_work,
-                       &device->start_resync_work);
+       drbd_device_post_work(device, RS_START);
 }
 
-int w_start_resync(struct drbd_work *w, int cancel)
+static void do_start_resync(struct drbd_device *device)
 {
-       struct drbd_device *device =
-               container_of(w, struct drbd_device, start_resync_work);
-
        if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
-               drbd_warn(device, "w_start_resync later...\n");
+               drbd_warn(device, "postponing start_resync ...\n");
                device->start_resync_timer.expires = jiffies + HZ/10;
                add_timer(&device->start_resync_timer);
-               return 0;
+               return;
        }
 
        drbd_start_resync(device, C_SYNC_SOURCE);
        clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
-       return 0;
+}
+
+static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device)
+{
+       bool csums_after_crash_only;
+       rcu_read_lock();
+       csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only;
+       rcu_read_unlock();
+       return connection->agreed_pro_version >= 89 &&          /* supported? */
+               connection->csums_tfm &&                        /* configured? */
+               (csums_after_crash_only == 0                    /* use for each resync? */
+                || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */
 }
 
 /**
@@ -1633,6 +1650,8 @@ int w_start_resync(struct drbd_work *w, int cancel)
  */
 void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 {
+       struct drbd_peer_device *peer_device = first_peer_device(device);
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
        union drbd_state ns;
        int r;
 
@@ -1651,7 +1670,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                        if (r > 0) {
                                drbd_info(device, "before-resync-target handler returned %d, "
                                         "dropping connection.\n", r);
-                               conn_request_state(first_peer_device(device)->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+                               conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
                                return;
                        }
                } else /* C_SYNC_SOURCE */ {
@@ -1664,7 +1683,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                                } else {
                                        drbd_info(device, "before-resync-source handler returned %d, "
                                                 "dropping connection.\n", r);
-                                       conn_request_state(first_peer_device(device)->connection,
+                                       conn_request_state(connection,
                                                           NS(conn, C_DISCONNECTING), CS_HARD);
                                        return;
                                }
@@ -1672,7 +1691,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                }
        }
 
-       if (current == first_peer_device(device)->connection->worker.task) {
+       if (current == connection->worker.task) {
                /* The worker should not sleep waiting for state_mutex,
                   that can take long */
                if (!mutex_trylock(device->state_mutex)) {
@@ -1733,11 +1752,20 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                        device->rs_mark_time[i] = now;
                }
                _drbd_pause_after(device);
+               /* Forget potentially stale cached per resync extent bit-counts.
+                * Open coded drbd_rs_cancel_all(device), we already have IRQs
+                * disabled, and know the disk state is ok. */
+               spin_lock(&device->al_lock);
+               lc_reset(device->resync);
+               device->resync_locked = 0;
+               device->resync_wenr = LC_FREE;
+               spin_unlock(&device->al_lock);
        }
        write_unlock(&global_state_lock);
        spin_unlock_irq(&device->resource->req_lock);
 
        if (r == SS_SUCCESS) {
+               wake_up(&device->al_wait); /* for lc_reset() above */
                /* reset rs_last_bcast when a resync or verify is started,
                 * to deal with potential jiffies wrap. */
                device->rs_last_bcast = jiffies - HZ;
@@ -1746,8 +1774,12 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                     drbd_conn_str(ns.conn),
                     (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
                     (unsigned long) device->rs_total);
-               if (side == C_SYNC_TARGET)
+               if (side == C_SYNC_TARGET) {
                        device->bm_resync_fo = 0;
+                       device->use_csums = use_checksum_based_resync(connection, device);
+               } else {
+                       device->use_csums = 0;
+               }
 
                /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
                 * with w_send_oos, or the sync target will get confused as to
@@ -1756,12 +1788,10 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                 * drbd_resync_finished from here in that case.
                 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
                 * and from after_state_ch otherwise. */
-               if (side == C_SYNC_SOURCE &&
-                   first_peer_device(device)->connection->agreed_pro_version < 96)
-                       drbd_gen_and_send_sync_uuid(first_peer_device(device));
+               if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96)
+                       drbd_gen_and_send_sync_uuid(peer_device);
 
-               if (first_peer_device(device)->connection->agreed_pro_version < 95 &&
-                   device->rs_total == 0) {
+               if (connection->agreed_pro_version < 95 && device->rs_total == 0) {
                        /* This still has a race (about when exactly the peers
                         * detect connection loss) that can lead to a full sync
                         * on next handshake. In 8.3.9 we fixed this with explicit
@@ -1777,7 +1807,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                                int timeo;
 
                                rcu_read_lock();
-                               nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+                               nc = rcu_dereference(connection->net_conf);
                                timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
                                rcu_read_unlock();
                                schedule_timeout_interruptible(timeo);
@@ -1799,10 +1829,165 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
        mutex_unlock(device->state_mutex);
 }
 
+static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done)
+{
+       struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
+       device->rs_last_bcast = jiffies;
+
+       if (!get_ldev(device))
+               return;
+
+       drbd_bm_write_lazy(device, 0);
+       if (resync_done && is_sync_state(device->state.conn))
+               drbd_resync_finished(device);
+
+       drbd_bcast_event(device, &sib);
+       /* update timestamp, in case it took a while to write out stuff */
+       device->rs_last_bcast = jiffies;
+       put_ldev(device);
+}
+
+static void drbd_ldev_destroy(struct drbd_device *device)
+{
+       lc_destroy(device->resync);
+       device->resync = NULL;
+       lc_destroy(device->act_log);
+       device->act_log = NULL;
+       __no_warn(local,
+               drbd_free_ldev(device->ldev);
+               device->ldev = NULL;);
+       clear_bit(GOING_DISKLESS, &device->flags);
+       wake_up(&device->misc_wait);
+}
+
+static void go_diskless(struct drbd_device *device)
+{
+       D_ASSERT(device, device->state.disk == D_FAILED);
+       /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
+        * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
+        * the protected members anymore, though, so once put_ldev reaches zero
+        * again, it will be safe to free them. */
+
+       /* Try to write changed bitmap pages, read errors may have just
+        * set some bits outside the area covered by the activity log.
+        *
+        * If we have an IO error during the bitmap writeout,
+        * we will want a full sync next time, just in case.
+        * (Do we want a specific meta data flag for this?)
+        *
+        * If that does not make it to stable storage either,
+        * we cannot do anything about that anymore.
+        *
+        * We still need to check if both bitmap and ldev are present, we may
+        * end up here after a failed attach, before ldev was even assigned.
+        */
+       if (device->bitmap && device->ldev) {
+               /* An interrupted resync or similar is allowed to recount bits
+                * while we detach.
+                * Any modifications would not be expected anymore, though.
+                */
+               if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
+                                       "detach", BM_LOCKED_TEST_ALLOWED)) {
+                       if (test_bit(WAS_READ_ERROR, &device->flags)) {
+                               drbd_md_set_flag(device, MDF_FULL_SYNC);
+                               drbd_md_sync(device);
+                       }
+               }
+       }
+
+       drbd_force_state(device, NS(disk, D_DISKLESS));
+}
+
+static int do_md_sync(struct drbd_device *device)
+{
+       drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+       drbd_md_sync(device);
+       return 0;
+}
+
+/* only called from drbd_worker thread, no locking */
+void __update_timing_details(
+               struct drbd_thread_timing_details *tdp,
+               unsigned int *cb_nr,
+               void *cb,
+               const char *fn, const unsigned int line)
+{
+       unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST;
+       struct drbd_thread_timing_details *td = tdp + i;
+
+       td->start_jif = jiffies;
+       td->cb_addr = cb;
+       td->caller_fn = fn;
+       td->line = line;
+       td->cb_nr = *cb_nr;
+
+       i = (i+1) % DRBD_THREAD_DETAILS_HIST;
+       td = tdp + i;
+       memset(td, 0, sizeof(*td));
+
+       ++(*cb_nr);
+}
+
+#define WORK_PENDING(work_bit, todo)   (todo & (1UL << work_bit))
+static void do_device_work(struct drbd_device *device, const unsigned long todo)
+{
+       if (WORK_PENDING(MD_SYNC, todo))
+               do_md_sync(device);
+       if (WORK_PENDING(RS_DONE, todo) ||
+           WORK_PENDING(RS_PROGRESS, todo))
+               update_on_disk_bitmap(device, WORK_PENDING(RS_DONE, todo));
+       if (WORK_PENDING(GO_DISKLESS, todo))
+               go_diskless(device);
+       if (WORK_PENDING(DESTROY_DISK, todo))
+               drbd_ldev_destroy(device);
+       if (WORK_PENDING(RS_START, todo))
+               do_start_resync(device);
+}
+
+#define DRBD_DEVICE_WORK_MASK  \
+       ((1UL << GO_DISKLESS)   \
+       |(1UL << DESTROY_DISK)  \
+       |(1UL << MD_SYNC)       \
+       |(1UL << RS_START)      \
+       |(1UL << RS_PROGRESS)   \
+       |(1UL << RS_DONE)       \
+       )
+
+static unsigned long get_work_bits(unsigned long *flags)
+{
+       unsigned long old, new;
+       do {
+               old = *flags;
+               new = old & ~DRBD_DEVICE_WORK_MASK;
+       } while (cmpxchg(flags, old, new) != old);
+       return old & DRBD_DEVICE_WORK_MASK;
+}
+
+static void do_unqueued_work(struct drbd_connection *connection)
+{
+       struct drbd_peer_device *peer_device;
+       int vnr;
+
+       rcu_read_lock();
+       idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+               struct drbd_device *device = peer_device->device;
+               unsigned long todo = get_work_bits(&device->flags);
+               if (!todo)
+                       continue;
+
+               kref_get(&device->kref);
+               rcu_read_unlock();
+               do_device_work(device, todo);
+               kref_put(&device->kref, drbd_destroy_device);
+               rcu_read_lock();
+       }
+       rcu_read_unlock();
+}
+
 static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
 {
        spin_lock_irq(&queue->q_lock);
-       list_splice_init(&queue->q, work_list);
+       list_splice_tail_init(&queue->q, work_list);
        spin_unlock_irq(&queue->q_lock);
        return !list_empty(work_list);
 }
@@ -1851,7 +2036,7 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
                /* dequeue single item only,
                 * we still use drbd_queue_work_front() in some places */
                if (!list_empty(&connection->sender_work.q))
-                       list_move(connection->sender_work.q.next, work_list);
+                       list_splice_tail_init(&connection->sender_work.q, work_list);
                spin_unlock(&connection->sender_work.q_lock);   /* FIXME get rid of this one? */
                if (!list_empty(work_list) || signal_pending(current)) {
                        spin_unlock_irq(&connection->resource->req_lock);
@@ -1873,6 +2058,14 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
                if (send_barrier)
                        maybe_send_barrier(connection,
                                        connection->send.current_epoch_nr + 1);
+
+               if (test_bit(DEVICE_WORK_PENDING, &connection->flags))
+                       break;
+
+               /* drbd_send() may have called flush_signals() */
+               if (get_t_state(&connection->worker) != RUNNING)
+                       break;
+
                schedule();
                /* may be woken up for other things but new work, too,
                 * e.g. if the current epoch got closed.
@@ -1906,10 +2099,15 @@ int drbd_worker(struct drbd_thread *thi)
        while (get_t_state(thi) == RUNNING) {
                drbd_thread_current_set_cpu(thi);
 
-               /* as long as we use drbd_queue_work_front(),
-                * we may only dequeue single work items here, not batches. */
-               if (list_empty(&work_list))
+               if (list_empty(&work_list)) {
+                       update_worker_timing_details(connection, wait_for_work);
                        wait_for_work(connection, &work_list);
+               }
+
+               if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+                       update_worker_timing_details(connection, do_unqueued_work);
+                       do_unqueued_work(connection);
+               }
 
                if (signal_pending(current)) {
                        flush_signals(current);
@@ -1926,6 +2124,7 @@ int drbd_worker(struct drbd_thread *thi)
                while (!list_empty(&work_list)) {
                        w = list_first_entry(&work_list, struct drbd_work, list);
                        list_del_init(&w->list);
+                       update_worker_timing_details(connection, w->cb);
                        if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
                                continue;
                        if (connection->cstate >= C_WF_REPORT_PARAMS)
@@ -1934,13 +2133,18 @@ int drbd_worker(struct drbd_thread *thi)
        }
 
        do {
+               if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+                       update_worker_timing_details(connection, do_unqueued_work);
+                       do_unqueued_work(connection);
+               }
                while (!list_empty(&work_list)) {
                        w = list_first_entry(&work_list, struct drbd_work, list);
                        list_del_init(&w->list);
+                       update_worker_timing_details(connection, w->cb);
                        w->cb(w, 1);
                }
                dequeue_work_batch(&connection->sender_work, &work_list);
-       } while (!list_empty(&work_list));
+       } while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags));
 
        rcu_read_lock();
        idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
index f63d358f3d933be8b2ec7a7eec0dbaec296796be..0a581400de0f4006e95de9e0f289d382608eb517 100644 (file)
 #include <linux/numa.h>
 
 #define PART_BITS 4
+#define VQ_NAME_LEN 16
 
 static int major;
 static DEFINE_IDA(vd_index_ida);
 
 static struct workqueue_struct *virtblk_wq;
 
+struct virtio_blk_vq {
+       struct virtqueue *vq;
+       spinlock_t lock;
+       char name[VQ_NAME_LEN];
+} ____cacheline_aligned_in_smp;
+
 struct virtio_blk
 {
        struct virtio_device *vdev;
-       struct virtqueue *vq;
-       spinlock_t vq_lock;
 
        /* The disk structure for the kernel. */
        struct gendisk *disk;
@@ -47,6 +52,10 @@ struct virtio_blk
 
        /* Ida index - used to track minor number allocations. */
        int index;
+
+       /* num of vqs */
+       int num_vqs;
+       struct virtio_blk_vq *vqs;
 };
 
 struct virtblk_req
@@ -133,14 +142,15 @@ static void virtblk_done(struct virtqueue *vq)
 {
        struct virtio_blk *vblk = vq->vdev->priv;
        bool req_done = false;
+       int qid = vq->index;
        struct virtblk_req *vbr;
        unsigned long flags;
        unsigned int len;
 
-       spin_lock_irqsave(&vblk->vq_lock, flags);
+       spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
        do {
                virtqueue_disable_cb(vq);
-               while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
+               while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
                        blk_mq_complete_request(vbr->req);
                        req_done = true;
                }
@@ -151,7 +161,7 @@ static void virtblk_done(struct virtqueue *vq)
        /* In case queue is stopped waiting for more buffers. */
        if (req_done)
                blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
-       spin_unlock_irqrestore(&vblk->vq_lock, flags);
+       spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 }
 
 static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
@@ -160,6 +170,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
        struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
        unsigned long flags;
        unsigned int num;
+       int qid = hctx->queue_num;
        const bool last = (req->cmd_flags & REQ_END) != 0;
        int err;
        bool notify = false;
@@ -202,12 +213,12 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
                        vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
        }
 
-       spin_lock_irqsave(&vblk->vq_lock, flags);
-       err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num);
+       spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
+       err = __virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
        if (err) {
-               virtqueue_kick(vblk->vq);
+               virtqueue_kick(vblk->vqs[qid].vq);
                blk_mq_stop_hw_queue(hctx);
-               spin_unlock_irqrestore(&vblk->vq_lock, flags);
+               spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
                /* Out of mem doesn't actually happen, since we fall back
                 * to direct descriptors */
                if (err == -ENOMEM || err == -ENOSPC)
@@ -215,12 +226,12 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
                return BLK_MQ_RQ_QUEUE_ERROR;
        }
 
-       if (last && virtqueue_kick_prepare(vblk->vq))
+       if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
                notify = true;
-       spin_unlock_irqrestore(&vblk->vq_lock, flags);
+       spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 
        if (notify)
-               virtqueue_notify(vblk->vq);
+               virtqueue_notify(vblk->vqs[qid].vq);
        return BLK_MQ_RQ_QUEUE_OK;
 }
 
@@ -377,12 +388,64 @@ static void virtblk_config_changed(struct virtio_device *vdev)
 static int init_vq(struct virtio_blk *vblk)
 {
        int err = 0;
+       int i;
+       vq_callback_t **callbacks;
+       const char **names;
+       struct virtqueue **vqs;
+       unsigned short num_vqs;
+       struct virtio_device *vdev = vblk->vdev;
+
+       err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
+                                  struct virtio_blk_config, num_queues,
+                                  &num_vqs);
+       if (err)
+               num_vqs = 1;
+
+       vblk->vqs = kmalloc(sizeof(*vblk->vqs) * num_vqs, GFP_KERNEL);
+       if (!vblk->vqs) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       names = kmalloc(sizeof(*names) * num_vqs, GFP_KERNEL);
+       if (!names)
+               goto err_names;
+
+       callbacks = kmalloc(sizeof(*callbacks) * num_vqs, GFP_KERNEL);
+       if (!callbacks)
+               goto err_callbacks;
+
+       vqs = kmalloc(sizeof(*vqs) * num_vqs, GFP_KERNEL);
+       if (!vqs)
+               goto err_vqs;
 
-       /* We expect one virtqueue, for output. */
-       vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done, "requests");
-       if (IS_ERR(vblk->vq))
-               err = PTR_ERR(vblk->vq);
+       for (i = 0; i < num_vqs; i++) {
+               callbacks[i] = virtblk_done;
+               snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
+               names[i] = vblk->vqs[i].name;
+       }
+
+       /* Discover virtqueues and write information to configuration.  */
+       err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
+       if (err)
+               goto err_find_vqs;
 
+       for (i = 0; i < num_vqs; i++) {
+               spin_lock_init(&vblk->vqs[i].lock);
+               vblk->vqs[i].vq = vqs[i];
+       }
+       vblk->num_vqs = num_vqs;
+
+ err_find_vqs:
+       kfree(vqs);
+ err_vqs:
+       kfree(callbacks);
+ err_callbacks:
+       kfree(names);
+ err_names:
+       if (err)
+               kfree(vblk->vqs);
+ out:
        return err;
 }
 
@@ -551,7 +614,6 @@ static int virtblk_probe(struct virtio_device *vdev)
        err = init_vq(vblk);
        if (err)
                goto out_free_vblk;
-       spin_lock_init(&vblk->vq_lock);
 
        /* FIXME: How many partitions?  How long is a piece of string? */
        vblk->disk = alloc_disk(1 << PART_BITS);
@@ -562,7 +624,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 
        /* Default queue sizing is to fill the ring. */
        if (!virtblk_queue_depth) {
-               virtblk_queue_depth = vblk->vq->num_free;
+               virtblk_queue_depth = vblk->vqs[0].vq->num_free;
                /* ... but without indirect descs, we use 2 descs per req */
                if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
                        virtblk_queue_depth /= 2;
@@ -570,7 +632,6 @@ static int virtblk_probe(struct virtio_device *vdev)
 
        memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
        vblk->tag_set.ops = &virtio_mq_ops;
-       vblk->tag_set.nr_hw_queues = 1;
        vblk->tag_set.queue_depth = virtblk_queue_depth;
        vblk->tag_set.numa_node = NUMA_NO_NODE;
        vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
@@ -578,6 +639,7 @@ static int virtblk_probe(struct virtio_device *vdev)
                sizeof(struct virtblk_req) +
                sizeof(struct scatterlist) * sg_elems;
        vblk->tag_set.driver_data = vblk;
+       vblk->tag_set.nr_hw_queues = vblk->num_vqs;
 
        err = blk_mq_alloc_tag_set(&vblk->tag_set);
        if (err)
@@ -727,6 +789,7 @@ static void virtblk_remove(struct virtio_device *vdev)
        refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
        put_disk(vblk->disk);
        vdev->config->del_vqs(vdev);
+       kfree(vblk->vqs);
        kfree(vblk);
 
        /* Only free device id if we don't have any users */
@@ -777,7 +840,8 @@ static const struct virtio_device_id id_table[] = {
 static unsigned int features[] = {
        VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
        VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
-       VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE
+       VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
+       VIRTIO_BLK_F_MQ,
 };
 
 static struct virtio_driver virtio_blk = {
index 8bc422977b5b88120ad93c08ac27cbea22cc80ec..4ff86878727fc0130ebf9be394f3f832c50c315d 100644 (file)
@@ -499,8 +499,7 @@ static int __init g5_pm72_cpufreq_init(struct device_node *cpunode)
        }
 
        /* Lookup the i2c hwclock */
-       for (hwclock = NULL;
-            (hwclock = of_find_node_by_name(hwclock, "i2c-hwclock")) != NULL;){
+       for_each_node_by_name(hwclock, "i2c-hwclock") {
                const char *loc = of_get_property(hwclock,
                                "hwctrl-location", NULL);
                if (loc == NULL)
index 544f6d327ede5512d287bcc87e28663200f873a5..061407d5952052add3424f2978267d49e949af3a 100644 (file)
@@ -936,28 +936,14 @@ static int nx842_OF_upd(struct property *new_prop)
                goto error_out;
        }
 
-       /* Set ptr to new property if provided */
-       if (new_prop) {
-               /* Single property */
-               if (!strncmp(new_prop->name, "status", new_prop->length)) {
-                       status = new_prop;
-
-               } else if (!strncmp(new_prop->name, "ibm,max-sg-len",
-                                       new_prop->length)) {
-                       maxsglen = new_prop;
-
-               } else if (!strncmp(new_prop->name, "ibm,max-sync-cop",
-                                       new_prop->length)) {
-                       maxsyncop = new_prop;
-
-               } else {
-                       /*
-                        * Skip the update, the property being updated
-                        * has no impact.
-                        */
-                       goto out;
-               }
-       }
+       /*
+        * If this is a property update, there are only certain properties that
+        * we care about. Bail if it isn't in the below list
+        */
+       if (new_prop && (strncmp(new_prop->name, "status", new_prop->length) ||
+                        strncmp(new_prop->name, "ibm,max-sg-len", new_prop->length) ||
+                        strncmp(new_prop->name, "ibm,max-sync-cop", new_prop->length)))
+               goto out;
 
        /* Perform property updates */
        ret = nx842_OF_upd_status(new_devdata, status);
index 374b57fc596d87db7e9e252c2ca4fbe57cb423f6..a12c8552f6a6bbe74791865e84299bb74e5e52bc 100644 (file)
@@ -134,8 +134,7 @@ static void cell_edac_init_csrows(struct mem_ctl_info *mci)
        int                             j;
        u32                             nr_pages;
 
-       for (np = NULL;
-            (np = of_find_node_by_name(np, "memory")) != NULL;) {
+       for_each_node_by_name(np, "memory") {
                struct resource r;
 
                /* We "know" that the Cell firmware only creates one entry
index d3d0e8cf27b4beddcfbb19f70ea8b5f1f1d5c8bb..d6c767ace9166d1c0ee54105d69f7ecd551516e8 100644 (file)
@@ -382,6 +382,9 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr,
        if (err)
                return err;
 
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
index ca8430f925643725733014cf47f98f782b017aba..e67b9a50ac7cbcda9e820117940ab294f5eb2dea 100644 (file)
@@ -1085,6 +1085,9 @@ static ssize_t store_vrm_reg(struct device *dev, struct device_attribute *attr,
        if (err)
                return err;
 
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
index 22e0c926989dca8dfe2644418049d56ae99ef2d9..126516414c114f309161924e49d5dbef3d60d3bb 100644 (file)
@@ -212,6 +212,7 @@ static int ads1015_get_channels_config_of(struct i2c_client *client)
                                dev_err(&client->dev,
                                        "invalid gain on %s\n",
                                        node->full_name);
+                               return -EINVAL;
                        }
                }
 
@@ -222,6 +223,7 @@ static int ads1015_get_channels_config_of(struct i2c_client *client)
                                dev_err(&client->dev,
                                        "invalid data_rate on %s\n",
                                        node->full_name);
+                               return -EINVAL;
                        }
                }
 
index f96063680e584ff5f7cbb1d6dec8147a0592e186..272fcc837ecc0ad62e3b5dc3b40aefc4c8472bec 100644 (file)
@@ -510,6 +510,10 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr,
        err = kstrtoul(buf, 10, &val);
        if (err)
                return err;
+
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
index 4ae3fff13f4498dbef26281679cfc6b9dc0bfe48..bea0a344fab57b4f39b855d2997256b5d1e01b93 100644 (file)
@@ -247,8 +247,8 @@ struct dme1737_data {
        u8  pwm_acz[3];
        u8  pwm_freq[6];
        u8  pwm_rr[2];
-       u8  zone_low[3];
-       u8  zone_abs[3];
+       s8  zone_low[3];
+       s8  zone_abs[3];
        u8  zone_hyst[2];
        u32 alarms;
 };
@@ -277,7 +277,7 @@ static inline int IN_FROM_REG(int reg, int nominal, int res)
        return (reg * nominal + (3 << (res - 3))) / (3 << (res - 2));
 }
 
-static inline int IN_TO_REG(int val, int nominal)
+static inline int IN_TO_REG(long val, int nominal)
 {
        return clamp_val((val * 192 + nominal / 2) / nominal, 0, 255);
 }
@@ -293,7 +293,7 @@ static inline int TEMP_FROM_REG(int reg, int res)
        return (reg * 1000) >> (res - 8);
 }
 
-static inline int TEMP_TO_REG(int val)
+static inline int TEMP_TO_REG(long val)
 {
        return clamp_val((val < 0 ? val - 500 : val + 500) / 1000, -128, 127);
 }
@@ -308,7 +308,7 @@ static inline int TEMP_RANGE_FROM_REG(int reg)
        return TEMP_RANGE[(reg >> 4) & 0x0f];
 }
 
-static int TEMP_RANGE_TO_REG(int val, int reg)
+static int TEMP_RANGE_TO_REG(long val, int reg)
 {
        int i;
 
@@ -331,7 +331,7 @@ static inline int TEMP_HYST_FROM_REG(int reg, int ix)
        return (((ix == 1) ? reg : reg >> 4) & 0x0f) * 1000;
 }
 
-static inline int TEMP_HYST_TO_REG(int val, int ix, int reg)
+static inline int TEMP_HYST_TO_REG(long val, int ix, int reg)
 {
        int hyst = clamp_val((val + 500) / 1000, 0, 15);
 
@@ -347,7 +347,7 @@ static inline int FAN_FROM_REG(int reg, int tpc)
                return (reg == 0 || reg == 0xffff) ? 0 : 90000 * 60 / reg;
 }
 
-static inline int FAN_TO_REG(int val, int tpc)
+static inline int FAN_TO_REG(long val, int tpc)
 {
        if (tpc) {
                return clamp_val(val / tpc, 0, 0xffff);
@@ -379,7 +379,7 @@ static inline int FAN_TYPE_FROM_REG(int reg)
        return (edge > 0) ? 1 << (edge - 1) : 0;
 }
 
-static inline int FAN_TYPE_TO_REG(int val, int reg)
+static inline int FAN_TYPE_TO_REG(long val, int reg)
 {
        int edge = (val == 4) ? 3 : val;
 
@@ -402,7 +402,7 @@ static int FAN_MAX_FROM_REG(int reg)
        return 1000 + i * 500;
 }
 
-static int FAN_MAX_TO_REG(int val)
+static int FAN_MAX_TO_REG(long val)
 {
        int i;
 
@@ -460,7 +460,7 @@ static inline int PWM_ACZ_FROM_REG(int reg)
        return acz[(reg >> 5) & 0x07];
 }
 
-static inline int PWM_ACZ_TO_REG(int val, int reg)
+static inline int PWM_ACZ_TO_REG(long val, int reg)
 {
        int acz = (val == 4) ? 2 : val - 1;
 
@@ -476,7 +476,7 @@ static inline int PWM_FREQ_FROM_REG(int reg)
        return PWM_FREQ[reg & 0x0f];
 }
 
-static int PWM_FREQ_TO_REG(int val, int reg)
+static int PWM_FREQ_TO_REG(long val, int reg)
 {
        int i;
 
@@ -510,7 +510,7 @@ static inline int PWM_RR_FROM_REG(int reg, int ix)
        return (rr & 0x08) ? PWM_RR[rr & 0x07] : 0;
 }
 
-static int PWM_RR_TO_REG(int val, int ix, int reg)
+static int PWM_RR_TO_REG(long val, int ix, int reg)
 {
        int i;
 
@@ -528,7 +528,7 @@ static inline int PWM_RR_EN_FROM_REG(int reg, int ix)
        return PWM_RR_FROM_REG(reg, ix) ? 1 : 0;
 }
 
-static inline int PWM_RR_EN_TO_REG(int val, int ix, int reg)
+static inline int PWM_RR_EN_TO_REG(long val, int ix, int reg)
 {
        int en = (ix == 1) ? 0x80 : 0x08;
 
@@ -1481,13 +1481,16 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr,
                       const char *buf, size_t count)
 {
        struct dme1737_data *data = dev_get_drvdata(dev);
-       long val;
+       unsigned long val;
        int err;
 
-       err = kstrtol(buf, 10, &val);
+       err = kstrtoul(buf, 10, &val);
        if (err)
                return err;
 
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
index e87da902f3ae011c99a4622debf8784c5f57af0b..ada90716448d196f13cdb75251522f5877d08d99 100644 (file)
@@ -252,12 +252,12 @@ static ssize_t set_temp(struct device *dev, struct device_attribute *devattr,
        if (err < 0)
                return err;
 
-       val /= 1000;
+       val = DIV_ROUND_CLOSEST(val, 1000);
        reg = (sf == min) ? EMC6W201_REG_TEMP_LOW(nr)
                          : EMC6W201_REG_TEMP_HIGH(nr);
 
        mutex_lock(&data->update_lock);
-       data->temp[sf][nr] = clamp_val(val, -127, 128);
+       data->temp[sf][nr] = clamp_val(val, -127, 127);
        err = emc6w201_write8(client, reg, data->temp[sf][nr]);
        mutex_unlock(&data->update_lock);
 
index 0e01c4e13e3350f586d2f85e469d372a4c1af636..7b73d2002d3ef19ed2f3d9a042b51c346d2f979c 100644 (file)
@@ -238,6 +238,9 @@ static int hih6130_probe(struct i2c_client *client,
        hih6130->client = client;
        mutex_init(&hih6130->lock);
 
+       if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_QUICK))
+               hih6130->write_length = 1;
+
        hwmon_dev = devm_hwmon_device_register_with_groups(dev, client->name,
                                                           hih6130,
                                                           hih6130_groups);
index ba1d83d480563a14c8bafae0c38df8bb6edd2740..a5e295826aeae10fdbc1e517445f953d644f8811 100644 (file)
@@ -617,6 +617,10 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr,
        err = kstrtoul(buf, 10, &val);
        if (err)
                return err;
+
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
index d2060e245ff589f206fbf57c12e62d0038cae0fa..cfaf70b9cba72951e670f16b8700dd4a5ca152cc 100644 (file)
@@ -74,12 +74,9 @@ static inline int TEMP_FROM_REG(s16 reg)
        return reg / 8 * 625 / 10;
 }
 
-static inline s16 TEMP_TO_REG(int val)
+static inline s16 TEMP_TO_REG(long val)
 {
-       if (val <= -60000)
-               return -60000 * 10 / 625 * 8;
-       if (val >= 160000)
-               return 160000 * 10 / 625 * 8;
+       val = clamp_val(val, -60000, 160000);
        return val * 10 / 625 * 8;
 }
 
@@ -206,10 +203,12 @@ static ssize_t set_temp_hyst(struct device *dev,
        if (err)
                return err;
 
+       val = clamp_val(val, -120000, 220000);
        mutex_lock(&data->update_lock);
-       data->temp[t_hyst] = TEMP_FROM_REG(data->temp[attr->index]) - val;
+        data->temp[t_hyst] =
+               TEMP_TO_REG(TEMP_FROM_REG(data->temp[attr->index]) - val);
        i2c_smbus_write_word_swapped(client, LM92_REG_TEMP_HYST,
-                                    TEMP_TO_REG(data->temp[t_hyst]));
+                                    data->temp[t_hyst]);
        mutex_unlock(&data->update_lock);
        return count;
 }
index 988181e4cfcdc8ab7e40f02ab2977b38ee231120..145f674c1d8722afecf3e125245eafd3b96c6e34 100644 (file)
@@ -615,6 +615,9 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr,
        if (err)
                return err;
 
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
index c74d2da389d960ad13b77e1a7c583381ca7efda2..e42964f07f67ddcb3cba6b8d091bb22d61488609 100644 (file)
@@ -131,13 +131,6 @@ static int tmp103_probe(struct i2c_client *client,
        struct regmap *regmap;
        int ret;
 
-       if (!i2c_check_functionality(client->adapter,
-                                    I2C_FUNC_SMBUS_BYTE_DATA)) {
-               dev_err(&client->dev,
-                       "adapter doesn't support SMBus byte transactions\n");
-               return -ENODEV;
-       }
-
        regmap = devm_regmap_init_i2c(client, &tmp103_regmap_config);
        if (IS_ERR(regmap)) {
                dev_err(dev, "failed to allocate register map\n");
index 344b22ec25533e0e39770d36aa0c87af29c815fd..3ea57c3504e24cf005c200bb5c093e1a80cef80b 100644 (file)
@@ -879,6 +879,9 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr,
        if (err)
                return err;
 
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
 
        return count;
index c1726be3654c156ede9bdb965bb04e1d2ad6c926..2f55973a8c4c37989750905d1fd2b209b0bbd902 100644 (file)
@@ -820,6 +820,9 @@ store_vrm_reg(struct device *dev, struct device_attribute *attr, const char *buf
        err = kstrtoul(buf, 10, &val);
        if (err)
                return err;
+
+       if (val > 255)
+               return -EINVAL;
        data->vrm = val;
 
        return count;
index cb3765fec98c079e121472830bd576e37e17fab9..001df856913feba93790bf9383b2023115defad3 100644 (file)
@@ -1181,6 +1181,9 @@ static ssize_t store_vrm_reg(struct device *dev,
        if (err)
                return err;
 
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
index 9d63d71214cade1cddad0b8bfbbbbae726c4dda5..816aa6caf5d553ef2538eb9ebdcddab40f96c588 100644 (file)
@@ -353,6 +353,9 @@ store_vrm(struct device *dev, struct device_attribute *attr,
        if (err)
                return err;
 
+       if (val > 255)
+               return -EINVAL;
+
        data->vrm = val;
        return count;
 }
index 70637d23b1f92ad6688300060dadeb4f892ccff9..3612cb5b30b206a066c75d835402654cacf7d002 100644 (file)
@@ -10,7 +10,7 @@ menu "Hardware Spinlock drivers"
 
 config HWSPINLOCK_OMAP
        tristate "OMAP Hardware Spinlock device"
-       depends on ARCH_OMAP4 || SOC_OMAP5
+       depends on ARCH_OMAP4 || SOC_OMAP5 || SOC_DRA7XX || SOC_AM33XX || SOC_AM43XX
        select HWSPINLOCK
        help
          Say y here to support the OMAP Hardware Spinlock device (firstly
index 292869cc90343fd330ce29e9bcdf3c8575a2b7f7..c1e2cd4d85fe3f98034bfc70bc9ebd420cdcca30 100644 (file)
@@ -98,10 +98,29 @@ static int omap_hwspinlock_probe(struct platform_device *pdev)
        if (!io_base)
                return -ENOMEM;
 
+       /*
+        * make sure the module is enabled and clocked before reading
+        * the module SYSSTATUS register
+        */
+       pm_runtime_enable(&pdev->dev);
+       ret = pm_runtime_get_sync(&pdev->dev);
+       if (ret < 0) {
+               pm_runtime_put_noidle(&pdev->dev);
+               goto iounmap_base;
+       }
+
        /* Determine number of locks */
        i = readl(io_base + SYSSTATUS_OFFSET);
        i >>= SPINLOCK_NUMLOCKS_BIT_OFFSET;
 
+       /*
+        * runtime PM will make sure the clock of this module is
+        * enabled again iff at least one lock is requested
+        */
+       ret = pm_runtime_put(&pdev->dev);
+       if (ret < 0)
+               goto iounmap_base;
+
        /* one of the four lsb's must be set, and nothing else */
        if (hweight_long(i & 0xf) != 1 || i > 8) {
                ret = -EINVAL;
@@ -121,12 +140,6 @@ static int omap_hwspinlock_probe(struct platform_device *pdev)
        for (i = 0, hwlock = &bank->lock[0]; i < num_locks; i++, hwlock++)
                hwlock->priv = io_base + LOCK_BASE_OFFSET + sizeof(u32) * i;
 
-       /*
-        * runtime PM will make sure the clock of this module is
-        * enabled iff at least one lock is requested
-        */
-       pm_runtime_enable(&pdev->dev);
-
        ret = hwspin_lock_register(bank, &pdev->dev, &omap_hwspinlock_ops,
                                                pdata->base_id, num_locks);
        if (ret)
@@ -135,9 +148,9 @@ static int omap_hwspinlock_probe(struct platform_device *pdev)
        return 0;
 
 reg_fail:
-       pm_runtime_disable(&pdev->dev);
        kfree(bank);
 iounmap_base:
+       pm_runtime_disable(&pdev->dev);
        iounmap(io_base);
        return ret;
 }
index 2bc7f5af64f4270b3f84d866822babaca34417dd..f6d29614cb016e79e57a76c50452e16260709ca3 100644 (file)
@@ -94,14 +94,14 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
                port_priv = ib_get_agent_port(device, port_num);
 
        if (!port_priv) {
-               printk(KERN_ERR SPFX "Unable to find port agent\n");
+               dev_err(&device->dev, "Unable to find port agent\n");
                return;
        }
 
        agent = port_priv->agent[qpn];
        ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num);
        if (IS_ERR(ah)) {
-               printk(KERN_ERR SPFX "ib_create_ah_from_wc error %ld\n",
+               dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n",
                        PTR_ERR(ah));
                return;
        }
@@ -110,7 +110,7 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
                                      IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
                                      GFP_KERNEL);
        if (IS_ERR(send_buf)) {
-               printk(KERN_ERR SPFX "ib_create_send_mad error\n");
+               dev_err(&device->dev, "ib_create_send_mad error\n");
                goto err1;
        }
 
@@ -125,7 +125,7 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
        }
 
        if (ib_post_send_mad(send_buf, NULL)) {
-               printk(KERN_ERR SPFX "ib_post_send_mad error\n");
+               dev_err(&device->dev, "ib_post_send_mad error\n");
                goto err2;
        }
        return;
@@ -151,7 +151,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num)
        /* Create new device info */
        port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
        if (!port_priv) {
-               printk(KERN_ERR SPFX "No memory for ib_agent_port_private\n");
+               dev_err(&device->dev, "No memory for ib_agent_port_private\n");
                ret = -ENOMEM;
                goto error1;
        }
@@ -161,7 +161,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num)
                port_priv->agent[0] = ib_register_mad_agent(device, port_num,
                                                            IB_QPT_SMI, NULL, 0,
                                                            &agent_send_handler,
-                                                           NULL, NULL);
+                                                           NULL, NULL, 0);
                if (IS_ERR(port_priv->agent[0])) {
                        ret = PTR_ERR(port_priv->agent[0]);
                        goto error2;
@@ -172,7 +172,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num)
        port_priv->agent[1] = ib_register_mad_agent(device, port_num,
                                                    IB_QPT_GSI, NULL, 0,
                                                    &agent_send_handler,
-                                                   NULL, NULL);
+                                                   NULL, NULL, 0);
        if (IS_ERR(port_priv->agent[1])) {
                ret = PTR_ERR(port_priv->agent[1]);
                goto error3;
@@ -202,7 +202,7 @@ int ib_agent_port_close(struct ib_device *device, int port_num)
        port_priv = __ib_get_agent_port(device, port_num);
        if (port_priv == NULL) {
                spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
-               printk(KERN_ERR SPFX "Port %d not found\n", port_num);
+               dev_err(&device->dev, "Port %d not found\n", port_num);
                return -ENODEV;
        }
        list_del(&port_priv->port_list);
index c3239170d8b789e98233b3cb0e4ead65673de125..e28a494e2a3a0f72b41af479b269262c6472cb77 100644 (file)
@@ -3753,7 +3753,7 @@ static void cm_add_one(struct ib_device *ib_device)
        struct cm_port *port;
        struct ib_mad_reg_req reg_req = {
                .mgmt_class = IB_MGMT_CLASS_CM,
-               .mgmt_class_version = IB_CM_CLASS_VERSION
+               .mgmt_class_version = IB_CM_CLASS_VERSION,
        };
        struct ib_port_modify port_modify = {
                .set_port_cap_mask = IB_PORT_CM_SUP
@@ -3801,7 +3801,8 @@ static void cm_add_one(struct ib_device *ib_device)
                                                        0,
                                                        cm_send_handler,
                                                        cm_recv_handler,
-                                                       port);
+                                                       port,
+                                                       0);
                if (IS_ERR(port->mad_agent))
                        goto error2;
 
index 3d2e489ab732e81a2921d002229d539bfa024c18..ff9163dc159614fe76ca7c476ff80b64de9c4e54 100644 (file)
@@ -46,6 +46,7 @@
 #include <linux/completion.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/sysctl.h>
 
 #include <rdma/iw_cm.h>
 #include <rdma/ib_addr.h>
@@ -65,6 +66,20 @@ struct iwcm_work {
        struct list_head free_list;
 };
 
+static unsigned int default_backlog = 256;
+
+static struct ctl_table_header *iwcm_ctl_table_hdr;
+static struct ctl_table iwcm_ctl_table[] = {
+       {
+               .procname       = "default_backlog",
+               .data           = &default_backlog,
+               .maxlen         = sizeof(default_backlog),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       { }
+};
+
 /*
  * The following services provide a mechanism for pre-allocating iwcm_work
  * elements.  The design pre-allocates them  based on the cm_id type:
@@ -425,6 +440,9 @@ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
 
        cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
 
+       if (!backlog)
+               backlog = default_backlog;
+
        ret = alloc_work_entries(cm_id_priv, backlog);
        if (ret)
                return ret;
@@ -1030,11 +1048,20 @@ static int __init iw_cm_init(void)
        if (!iwcm_wq)
                return -ENOMEM;
 
+       iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm",
+                                                iwcm_ctl_table);
+       if (!iwcm_ctl_table_hdr) {
+               pr_err("iw_cm: couldn't register sysctl paths\n");
+               destroy_workqueue(iwcm_wq);
+               return -ENOMEM;
+       }
+
        return 0;
 }
 
 static void __exit iw_cm_cleanup(void)
 {
+       unregister_net_sysctl_table(iwcm_ctl_table_hdr);
        destroy_workqueue(iwcm_wq);
 }
 
index ab31f136d04b00a322a5e31f5c715deae9f09b4b..74c30f4c557e015df74ec153417e09d626f8da2e 100644 (file)
@@ -33,6 +33,9 @@
  * SOFTWARE.
  *
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/dma-mapping.h>
 #include <linux/slab.h>
 #include <linux/module.h>
@@ -195,7 +198,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
                                           u8 rmpp_version,
                                           ib_mad_send_handler send_handler,
                                           ib_mad_recv_handler recv_handler,
-                                          void *context)
+                                          void *context,
+                                          u32 registration_flags)
 {
        struct ib_mad_port_private *port_priv;
        struct ib_mad_agent *ret = ERR_PTR(-EINVAL);
@@ -211,68 +215,109 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
 
        /* Validate parameters */
        qpn = get_spl_qp_index(qp_type);
-       if (qpn == -1)
+       if (qpn == -1) {
+               dev_notice(&device->dev,
+                          "ib_register_mad_agent: invalid QP Type %d\n",
+                          qp_type);
                goto error1;
+       }
 
-       if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION)
+       if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) {
+               dev_notice(&device->dev,
+                          "ib_register_mad_agent: invalid RMPP Version %u\n",
+                          rmpp_version);
                goto error1;
+       }
 
        /* Validate MAD registration request if supplied */
        if (mad_reg_req) {
-               if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION)
+               if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) {
+                       dev_notice(&device->dev,
+                                  "ib_register_mad_agent: invalid Class Version %u\n",
+                                  mad_reg_req->mgmt_class_version);
                        goto error1;
-               if (!recv_handler)
+               }
+               if (!recv_handler) {
+                       dev_notice(&device->dev,
+                                  "ib_register_mad_agent: no recv_handler\n");
                        goto error1;
+               }
                if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) {
                        /*
                         * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only
                         * one in this range currently allowed
                         */
                        if (mad_reg_req->mgmt_class !=
-                           IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+                           IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+                               dev_notice(&device->dev,
+                                          "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n",
+                                          mad_reg_req->mgmt_class);
                                goto error1;
+                       }
                } else if (mad_reg_req->mgmt_class == 0) {
                        /*
                         * Class 0 is reserved in IBA and is used for
                         * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
                         */
+                       dev_notice(&device->dev,
+                                  "ib_register_mad_agent: Invalid Mgmt Class 0\n");
                        goto error1;
                } else if (is_vendor_class(mad_reg_req->mgmt_class)) {
                        /*
                         * If class is in "new" vendor range,
                         * ensure supplied OUI is not zero
                         */
-                       if (!is_vendor_oui(mad_reg_req->oui))
+                       if (!is_vendor_oui(mad_reg_req->oui)) {
+                               dev_notice(&device->dev,
+                                          "ib_register_mad_agent: No OUI specified for class 0x%x\n",
+                                          mad_reg_req->mgmt_class);
                                goto error1;
+                       }
                }
                /* Make sure class supplied is consistent with RMPP */
                if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) {
-                       if (rmpp_version)
+                       if (rmpp_version) {
+                               dev_notice(&device->dev,
+                                          "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n",
+                                          mad_reg_req->mgmt_class);
                                goto error1;
+                       }
                }
+
                /* Make sure class supplied is consistent with QP type */
                if (qp_type == IB_QPT_SMI) {
                        if ((mad_reg_req->mgmt_class !=
                                        IB_MGMT_CLASS_SUBN_LID_ROUTED) &&
                            (mad_reg_req->mgmt_class !=
-                                       IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE))
+                                       IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+                               dev_notice(&device->dev,
+                                          "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n",
+                                          mad_reg_req->mgmt_class);
                                goto error1;
+                       }
                } else {
                        if ((mad_reg_req->mgmt_class ==
                                        IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
                            (mad_reg_req->mgmt_class ==
-                                       IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE))
+                                       IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+                               dev_notice(&device->dev,
+                                          "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n",
+                                          mad_reg_req->mgmt_class);
                                goto error1;
+                       }
                }
        } else {
                /* No registration request supplied */
                if (!send_handler)
                        goto error1;
+               if (registration_flags & IB_MAD_USER_RMPP)
+                       goto error1;
        }
 
        /* Validate device and port */
        port_priv = ib_get_mad_port(device, port_num);
        if (!port_priv) {
+               dev_notice(&device->dev, "ib_register_mad_agent: Invalid port\n");
                ret = ERR_PTR(-ENODEV);
                goto error1;
        }
@@ -280,6 +325,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
        /* Verify the QP requested is supported.  For example, Ethernet devices
         * will not have QP0 */
        if (!port_priv->qp_info[qpn].qp) {
+               dev_notice(&device->dev,
+                          "ib_register_mad_agent: QP %d not supported\n", qpn);
                ret = ERR_PTR(-EPROTONOSUPPORT);
                goto error1;
        }
@@ -316,6 +363,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
        mad_agent_priv->agent.context = context;
        mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp;
        mad_agent_priv->agent.port_num = port_num;
+       mad_agent_priv->agent.flags = registration_flags;
        spin_lock_init(&mad_agent_priv->lock);
        INIT_LIST_HEAD(&mad_agent_priv->send_list);
        INIT_LIST_HEAD(&mad_agent_priv->wait_list);
@@ -706,7 +754,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
             smi_handle_dr_smp_send(smp, device->node_type, port_num) ==
             IB_SMI_DISCARD) {
                ret = -EINVAL;
-               printk(KERN_ERR PFX "Invalid directed route\n");
+               dev_err(&device->dev, "Invalid directed route\n");
                goto out;
        }
 
@@ -718,7 +766,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
        local = kmalloc(sizeof *local, GFP_ATOMIC);
        if (!local) {
                ret = -ENOMEM;
-               printk(KERN_ERR PFX "No memory for ib_mad_local_private\n");
+               dev_err(&device->dev, "No memory for ib_mad_local_private\n");
                goto out;
        }
        local->mad_priv = NULL;
@@ -726,7 +774,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
        mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC);
        if (!mad_priv) {
                ret = -ENOMEM;
-               printk(KERN_ERR PFX "No memory for local response MAD\n");
+               dev_err(&device->dev, "No memory for local response MAD\n");
                kfree(local);
                goto out;
        }
@@ -837,9 +885,9 @@ static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
        for (left = send_buf->data_len + pad; left > 0; left -= seg_size) {
                seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask);
                if (!seg) {
-                       printk(KERN_ERR "alloc_send_rmpp_segs: RMPP mem "
-                              "alloc failed for len %zd, gfp %#x\n",
-                              sizeof (*seg) + seg_size, gfp_mask);
+                       dev_err(&send_buf->mad_agent->device->dev,
+                               "alloc_send_rmpp_segs: RMPP mem alloc failed for len %zd, gfp %#x\n",
+                               sizeof (*seg) + seg_size, gfp_mask);
                        free_send_rmpp_list(send_wr);
                        return -ENOMEM;
                }
@@ -862,6 +910,12 @@ static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
        return 0;
 }
 
+int ib_mad_kernel_rmpp_agent(struct ib_mad_agent *agent)
+{
+       return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP);
+}
+EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent);
+
 struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
                                            u32 remote_qpn, u16 pkey_index,
                                            int rmpp_active,
@@ -878,10 +932,12 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
        pad = get_pad_size(hdr_len, data_len);
        message_size = hdr_len + data_len + pad;
 
-       if ((!mad_agent->rmpp_version &&
-            (rmpp_active || message_size > sizeof(struct ib_mad))) ||
-           (!rmpp_active && message_size > sizeof(struct ib_mad)))
-               return ERR_PTR(-EINVAL);
+       if (ib_mad_kernel_rmpp_agent(mad_agent)) {
+               if (!rmpp_active && message_size > sizeof(struct ib_mad))
+                       return ERR_PTR(-EINVAL);
+       } else
+               if (rmpp_active || message_size > sizeof(struct ib_mad))
+                       return ERR_PTR(-EINVAL);
 
        size = rmpp_active ? hdr_len : sizeof(struct ib_mad);
        buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask);
@@ -1135,7 +1191,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
                              &mad_agent_priv->send_list);
                spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 
-               if (mad_agent_priv->agent.rmpp_version) {
+               if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
                        ret = ib_send_rmpp_mad(mad_send_wr);
                        if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED)
                                ret = ib_send_mad(mad_send_wr);
@@ -1199,7 +1255,8 @@ EXPORT_SYMBOL(ib_redirect_mad_qp);
 int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
                      struct ib_wc *wc)
 {
-       printk(KERN_ERR PFX "ib_process_mad_wc() not implemented yet\n");
+       dev_err(&mad_agent->device->dev,
+               "ib_process_mad_wc() not implemented yet\n");
        return 0;
 }
 EXPORT_SYMBOL(ib_process_mad_wc);
@@ -1211,7 +1268,7 @@ static int method_in_use(struct ib_mad_mgmt_method_table **method,
 
        for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) {
                if ((*method)->agent[i]) {
-                       printk(KERN_ERR PFX "Method %d already in use\n", i);
+                       pr_err("Method %d already in use\n", i);
                        return -EINVAL;
                }
        }
@@ -1223,8 +1280,7 @@ static int allocate_method_table(struct ib_mad_mgmt_method_table **method)
        /* Allocate management method table */
        *method = kzalloc(sizeof **method, GFP_ATOMIC);
        if (!*method) {
-               printk(KERN_ERR PFX "No memory for "
-                      "ib_mad_mgmt_method_table\n");
+               pr_err("No memory for ib_mad_mgmt_method_table\n");
                return -ENOMEM;
        }
 
@@ -1319,8 +1375,8 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
                /* Allocate management class table for "new" class version */
                *class = kzalloc(sizeof **class, GFP_ATOMIC);
                if (!*class) {
-                       printk(KERN_ERR PFX "No memory for "
-                              "ib_mad_mgmt_class_table\n");
+                       dev_err(&agent_priv->agent.device->dev,
+                               "No memory for ib_mad_mgmt_class_table\n");
                        ret = -ENOMEM;
                        goto error1;
                }
@@ -1386,8 +1442,8 @@ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
                /* Allocate mgmt vendor class table for "new" class version */
                vendor = kzalloc(sizeof *vendor, GFP_ATOMIC);
                if (!vendor) {
-                       printk(KERN_ERR PFX "No memory for "
-                              "ib_mad_mgmt_vendor_class_table\n");
+                       dev_err(&agent_priv->agent.device->dev,
+                               "No memory for ib_mad_mgmt_vendor_class_table\n");
                        goto error1;
                }
 
@@ -1397,8 +1453,8 @@ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
                /* Allocate table for this management vendor class */
                vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC);
                if (!vendor_class) {
-                       printk(KERN_ERR PFX "No memory for "
-                              "ib_mad_mgmt_vendor_class\n");
+                       dev_err(&agent_priv->agent.device->dev,
+                               "No memory for ib_mad_mgmt_vendor_class\n");
                        goto error2;
                }
 
@@ -1429,7 +1485,7 @@ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
                        goto check_in_use;
                }
        }
-       printk(KERN_ERR PFX "All OUI slots in use\n");
+       dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n");
        goto error3;
 
 check_in_use:
@@ -1640,9 +1696,9 @@ find_mad_agent(struct ib_mad_port_private *port_priv,
                if (mad_agent->agent.recv_handler)
                        atomic_inc(&mad_agent->refcount);
                else {
-                       printk(KERN_NOTICE PFX "No receive handler for client "
-                              "%p on port %d\n",
-                              &mad_agent->agent, port_priv->port_num);
+                       dev_notice(&port_priv->device->dev,
+                                  "No receive handler for client %p on port %d\n",
+                                  &mad_agent->agent, port_priv->port_num);
                        mad_agent = NULL;
                }
        }
@@ -1658,8 +1714,8 @@ static int validate_mad(struct ib_mad *mad, u32 qp_num)
 
        /* Make sure MAD base version is understood */
        if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) {
-               printk(KERN_ERR PFX "MAD received with unsupported base "
-                      "version %d\n", mad->mad_hdr.base_version);
+               pr_err("MAD received with unsupported base version %d\n",
+                       mad->mad_hdr.base_version);
                goto out;
        }
 
@@ -1685,6 +1741,7 @@ static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv,
 
        rmpp_mad = (struct ib_rmpp_mad *)mad_hdr;
        return !mad_agent_priv->agent.rmpp_version ||
+               !ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) ||
                !(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
                                    IB_MGMT_RMPP_FLAG_ACTIVE) ||
                (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA);
@@ -1812,7 +1869,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
 
        INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
        list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
-       if (mad_agent_priv->agent.rmpp_version) {
+       if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
                mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
                                                      mad_recv_wc);
                if (!mad_recv_wc) {
@@ -1827,23 +1884,39 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
                mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
                if (!mad_send_wr) {
                        spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
-                       ib_free_recv_mad(mad_recv_wc);
-                       deref_mad_agent(mad_agent_priv);
-                       return;
-               }
-               ib_mark_mad_done(mad_send_wr);
-               spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+                       if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)
+                          && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class)
+                          && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr)
+                                       & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+                               /* user rmpp is in effect
+                                * and this is an active RMPP MAD
+                                */
+                               mad_recv_wc->wc->wr_id = 0;
+                               mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+                                                                  mad_recv_wc);
+                               atomic_dec(&mad_agent_priv->refcount);
+                       } else {
+                               /* not user rmpp, revert to normal behavior and
+                                * drop the mad */
+                               ib_free_recv_mad(mad_recv_wc);
+                               deref_mad_agent(mad_agent_priv);
+                               return;
+                       }
+               } else {
+                       ib_mark_mad_done(mad_send_wr);
+                       spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 
-               /* Defined behavior is to complete response before request */
-               mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf;
-               mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
-                                                  mad_recv_wc);
-               atomic_dec(&mad_agent_priv->refcount);
+                       /* Defined behavior is to complete response before request */
+                       mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf;
+                       mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+                                                          mad_recv_wc);
+                       atomic_dec(&mad_agent_priv->refcount);
 
-               mad_send_wc.status = IB_WC_SUCCESS;
-               mad_send_wc.vendor_err = 0;
-               mad_send_wc.send_buf = &mad_send_wr->send_buf;
-               ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+                       mad_send_wc.status = IB_WC_SUCCESS;
+                       mad_send_wc.vendor_err = 0;
+                       mad_send_wc.send_buf = &mad_send_wr->send_buf;
+                       ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+               }
        } else {
                mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
                                                   mad_recv_wc);
@@ -1911,8 +1984,8 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
 
        response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
        if (!response) {
-               printk(KERN_ERR PFX "ib_mad_recv_done_handler no memory "
-                      "for response buffer\n");
+               dev_err(&port_priv->device->dev,
+                       "ib_mad_recv_done_handler no memory for response buffer\n");
                goto out;
        }
 
@@ -2083,7 +2156,7 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
 
        mad_agent_priv = mad_send_wr->mad_agent_priv;
        spin_lock_irqsave(&mad_agent_priv->lock, flags);
-       if (mad_agent_priv->agent.rmpp_version) {
+       if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
                ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc);
                if (ret == IB_RMPP_RESULT_CONSUMED)
                        goto done;
@@ -2176,7 +2249,8 @@ retry:
                ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr,
                                   &bad_send_wr);
                if (ret) {
-                       printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret);
+                       dev_err(&port_priv->device->dev,
+                               "ib_post_send failed: %d\n", ret);
                        mad_send_wr = queued_send_wr;
                        wc->status = IB_WC_LOC_QP_OP_ERR;
                        goto retry;
@@ -2248,8 +2322,9 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv,
                                           IB_QP_STATE | IB_QP_CUR_STATE);
                        kfree(attr);
                        if (ret)
-                               printk(KERN_ERR PFX "mad_error_handler - "
-                                      "ib_modify_qp to RTS : %d\n", ret);
+                               dev_err(&port_priv->device->dev,
+                                       "mad_error_handler - ib_modify_qp to RTS : %d\n",
+                                       ret);
                        else
                                mark_sends_for_retry(qp_info);
                }
@@ -2408,7 +2483,8 @@ static void local_completions(struct work_struct *work)
                if (local->mad_priv) {
                        recv_mad_agent = local->recv_mad_agent;
                        if (!recv_mad_agent) {
-                               printk(KERN_ERR PFX "No receive MAD agent for local completion\n");
+                               dev_err(&mad_agent_priv->agent.device->dev,
+                                       "No receive MAD agent for local completion\n");
                                free_mad = 1;
                                goto local_send_completion;
                        }
@@ -2476,7 +2552,7 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
 
        mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
 
-       if (mad_send_wr->mad_agent_priv->agent.rmpp_version) {
+       if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) {
                ret = ib_retry_rmpp(mad_send_wr);
                switch (ret) {
                case IB_RMPP_RESULT_UNHANDLED:
@@ -2589,7 +2665,8 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
                } else {
                        mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
                        if (!mad_priv) {
-                               printk(KERN_ERR PFX "No memory for receive buffer\n");
+                               dev_err(&qp_info->port_priv->device->dev,
+                                       "No memory for receive buffer\n");
                                ret = -ENOMEM;
                                break;
                        }
@@ -2625,7 +2702,8 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
                                              sizeof mad_priv->header,
                                            DMA_FROM_DEVICE);
                        kmem_cache_free(ib_mad_cache, mad_priv);
-                       printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
+                       dev_err(&qp_info->port_priv->device->dev,
+                               "ib_post_recv failed: %d\n", ret);
                        break;
                }
        } while (post);
@@ -2681,7 +2759,8 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
 
        attr = kmalloc(sizeof *attr, GFP_KERNEL);
        if (!attr) {
-               printk(KERN_ERR PFX "Couldn't kmalloc ib_qp_attr\n");
+               dev_err(&port_priv->device->dev,
+                       "Couldn't kmalloc ib_qp_attr\n");
                return -ENOMEM;
        }
 
@@ -2705,16 +2784,18 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
                ret = ib_modify_qp(qp, attr, IB_QP_STATE |
                                             IB_QP_PKEY_INDEX | IB_QP_QKEY);
                if (ret) {
-                       printk(KERN_ERR PFX "Couldn't change QP%d state to "
-                              "INIT: %d\n", i, ret);
+                       dev_err(&port_priv->device->dev,
+                               "Couldn't change QP%d state to INIT: %d\n",
+                               i, ret);
                        goto out;
                }
 
                attr->qp_state = IB_QPS_RTR;
                ret = ib_modify_qp(qp, attr, IB_QP_STATE);
                if (ret) {
-                       printk(KERN_ERR PFX "Couldn't change QP%d state to "
-                              "RTR: %d\n", i, ret);
+                       dev_err(&port_priv->device->dev,
+                               "Couldn't change QP%d state to RTR: %d\n",
+                               i, ret);
                        goto out;
                }
 
@@ -2722,16 +2803,18 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
                attr->sq_psn = IB_MAD_SEND_Q_PSN;
                ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN);
                if (ret) {
-                       printk(KERN_ERR PFX "Couldn't change QP%d state to "
-                              "RTS: %d\n", i, ret);
+                       dev_err(&port_priv->device->dev,
+                               "Couldn't change QP%d state to RTS: %d\n",
+                               i, ret);
                        goto out;
                }
        }
 
        ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
        if (ret) {
-               printk(KERN_ERR PFX "Failed to request completion "
-                      "notification: %d\n", ret);
+               dev_err(&port_priv->device->dev,
+                       "Failed to request completion notification: %d\n",
+                       ret);
                goto out;
        }
 
@@ -2741,7 +2824,8 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
 
                ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL);
                if (ret) {
-                       printk(KERN_ERR PFX "Couldn't post receive WRs\n");
+                       dev_err(&port_priv->device->dev,
+                               "Couldn't post receive WRs\n");
                        goto out;
                }
        }
@@ -2755,7 +2839,8 @@ static void qp_event_handler(struct ib_event *event, void *qp_context)
        struct ib_mad_qp_info   *qp_info = qp_context;
 
        /* It's worse than that! He's dead, Jim! */
-       printk(KERN_ERR PFX "Fatal error (%d) on MAD QP (%d)\n",
+       dev_err(&qp_info->port_priv->device->dev,
+               "Fatal error (%d) on MAD QP (%d)\n",
                event->event, qp_info->qp->qp_num);
 }
 
@@ -2801,8 +2886,9 @@ static int create_mad_qp(struct ib_mad_qp_info *qp_info,
        qp_init_attr.event_handler = qp_event_handler;
        qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr);
        if (IS_ERR(qp_info->qp)) {
-               printk(KERN_ERR PFX "Couldn't create ib_mad QP%d\n",
-                      get_spl_qp_index(qp_type));
+               dev_err(&qp_info->port_priv->device->dev,
+                       "Couldn't create ib_mad QP%d\n",
+                       get_spl_qp_index(qp_type));
                ret = PTR_ERR(qp_info->qp);
                goto error;
        }
@@ -2840,7 +2926,7 @@ static int ib_mad_port_open(struct ib_device *device,
        /* Create new device info */
        port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
        if (!port_priv) {
-               printk(KERN_ERR PFX "No memory for ib_mad_port_private\n");
+               dev_err(&device->dev, "No memory for ib_mad_port_private\n");
                return -ENOMEM;
        }
 
@@ -2860,21 +2946,21 @@ static int ib_mad_port_open(struct ib_device *device,
                                     ib_mad_thread_completion_handler,
                                     NULL, port_priv, cq_size, 0);
        if (IS_ERR(port_priv->cq)) {
-               printk(KERN_ERR PFX "Couldn't create ib_mad CQ\n");
+               dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
                ret = PTR_ERR(port_priv->cq);
                goto error3;
        }
 
        port_priv->pd = ib_alloc_pd(device);
        if (IS_ERR(port_priv->pd)) {
-               printk(KERN_ERR PFX "Couldn't create ib_mad PD\n");
+               dev_err(&device->dev, "Couldn't create ib_mad PD\n");
                ret = PTR_ERR(port_priv->pd);
                goto error4;
        }
 
        port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
        if (IS_ERR(port_priv->mr)) {
-               printk(KERN_ERR PFX "Couldn't get ib_mad DMA MR\n");
+               dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n");
                ret = PTR_ERR(port_priv->mr);
                goto error5;
        }
@@ -2902,7 +2988,7 @@ static int ib_mad_port_open(struct ib_device *device,
 
        ret = ib_mad_port_start(port_priv);
        if (ret) {
-               printk(KERN_ERR PFX "Couldn't start port\n");
+               dev_err(&device->dev, "Couldn't start port\n");
                goto error9;
        }
 
@@ -2946,7 +3032,7 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
        port_priv = __ib_get_mad_port(device, port_num);
        if (port_priv == NULL) {
                spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
-               printk(KERN_ERR PFX "Port %d not found\n", port_num);
+               dev_err(&device->dev, "Port %d not found\n", port_num);
                return -ENODEV;
        }
        list_del_init(&port_priv->port_list);
@@ -2984,14 +3070,12 @@ static void ib_mad_init_device(struct ib_device *device)
 
        for (i = start; i <= end; i++) {
                if (ib_mad_port_open(device, i)) {
-                       printk(KERN_ERR PFX "Couldn't open %s port %d\n",
-                              device->name, i);
+                       dev_err(&device->dev, "Couldn't open port %d\n", i);
                        goto error;
                }
                if (ib_agent_port_open(device, i)) {
-                       printk(KERN_ERR PFX "Couldn't open %s port %d "
-                              "for agents\n",
-                              device->name, i);
+                       dev_err(&device->dev,
+                               "Couldn't open port %d for agents\n", i);
                        goto error_agent;
                }
        }
@@ -2999,20 +3083,17 @@ static void ib_mad_init_device(struct ib_device *device)
 
 error_agent:
        if (ib_mad_port_close(device, i))
-               printk(KERN_ERR PFX "Couldn't close %s port %d\n",
-                      device->name, i);
+               dev_err(&device->dev, "Couldn't close port %d\n", i);
 
 error:
        i--;
 
        while (i >= start) {
                if (ib_agent_port_close(device, i))
-                       printk(KERN_ERR PFX "Couldn't close %s port %d "
-                              "for agents\n",
-                              device->name, i);
+                       dev_err(&device->dev,
+                               "Couldn't close port %d for agents\n", i);
                if (ib_mad_port_close(device, i))
-                       printk(KERN_ERR PFX "Couldn't close %s port %d\n",
-                              device->name, i);
+                       dev_err(&device->dev, "Couldn't close port %d\n", i);
                i--;
        }
 }
@@ -3033,12 +3114,12 @@ static void ib_mad_remove_device(struct ib_device *device)
        }
        for (i = 0; i < num_ports; i++, cur_port++) {
                if (ib_agent_port_close(device, cur_port))
-                       printk(KERN_ERR PFX "Couldn't close %s port %d "
-                              "for agents\n",
-                              device->name, cur_port);
+                       dev_err(&device->dev,
+                               "Couldn't close port %d for agents\n",
+                               cur_port);
                if (ib_mad_port_close(device, cur_port))
-                       printk(KERN_ERR PFX "Couldn't close %s port %d\n",
-                              device->name, cur_port);
+                       dev_err(&device->dev, "Couldn't close port %d\n",
+                               cur_port);
        }
 }
 
@@ -3064,7 +3145,7 @@ static int __init ib_mad_init_module(void)
                                         SLAB_HWCACHE_ALIGN,
                                         NULL);
        if (!ib_mad_cache) {
-               printk(KERN_ERR PFX "Couldn't create ib_mad cache\n");
+               pr_err("Couldn't create ib_mad cache\n");
                ret = -ENOMEM;
                goto error1;
        }
@@ -3072,7 +3153,7 @@ static int __init ib_mad_init_module(void)
        INIT_LIST_HEAD(&ib_mad_port_list);
 
        if (ib_register_client(&mad_client)) {
-               printk(KERN_ERR PFX "Couldn't register ib_mad client\n");
+               pr_err("Couldn't register ib_mad client\n");
                ret = -EINVAL;
                goto error2;
        }
index 9430ab4969c55505d0cbf59ddafeda741a8892a2..d1a0b0ee9444ccc35bc063b8accc3f8330107127 100644 (file)
@@ -42,9 +42,6 @@
 #include <rdma/ib_mad.h>
 #include <rdma/ib_smi.h>
 
-
-#define PFX "ib_mad: "
-
 #define IB_MAD_QPS_CORE                2 /* Always QP0 and QP1 as a minimum */
 
 /* QP and CQ parameters */
index 233eaf541f55121a3213f55f6771cda15f7e40d2..c38f030f0dc994d7e3edd6b4bd5af893c03d8214 100644 (file)
@@ -1184,7 +1184,7 @@ static void ib_sa_add_one(struct ib_device *device)
                sa_dev->port[i].agent =
                        ib_register_mad_agent(device, i + s, IB_QPT_GSI,
                                              NULL, 0, send_handler,
-                                             recv_handler, sa_dev);
+                                             recv_handler, sa_dev, 0);
                if (IS_ERR(sa_dev->port[i].agent))
                        goto err;
 
index 1acb99100556944ef499b9d3cdc26da30d2f17c5..928cdd20e2d11a1abd7c0afb2125f5e7cb6bcc6d 100644 (file)
@@ -33,6 +33,8 @@
  * SOFTWARE.
  */
 
+#define pr_fmt(fmt) "user_mad: " fmt
+
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/device.h>
@@ -504,13 +506,15 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
 
        rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data;
        hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
-       if (!ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)) {
-               copy_offset = IB_MGMT_MAD_HDR;
-               rmpp_active = 0;
-       } else {
+
+       if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+           && ib_mad_kernel_rmpp_agent(agent)) {
                copy_offset = IB_MGMT_RMPP_HDR;
                rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
-                             IB_MGMT_RMPP_FLAG_ACTIVE;
+                                               IB_MGMT_RMPP_FLAG_ACTIVE;
+       } else {
+               copy_offset = IB_MGMT_MAD_HDR;
+               rmpp_active = 0;
        }
 
        data_len = count - hdr_size(file) - hdr_len;
@@ -556,14 +560,22 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
                rmpp_mad->mad_hdr.tid = *tid;
        }
 
-       spin_lock_irq(&file->send_lock);
-       ret = is_duplicate(file, packet);
-       if (!ret)
+       if (!ib_mad_kernel_rmpp_agent(agent)
+          && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+          && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+               spin_lock_irq(&file->send_lock);
                list_add_tail(&packet->list, &file->send_list);
-       spin_unlock_irq(&file->send_lock);
-       if (ret) {
-               ret = -EINVAL;
-               goto err_msg;
+               spin_unlock_irq(&file->send_lock);
+       } else {
+               spin_lock_irq(&file->send_lock);
+               ret = is_duplicate(file, packet);
+               if (!ret)
+                       list_add_tail(&packet->list, &file->send_list);
+               spin_unlock_irq(&file->send_lock);
+               if (ret) {
+                       ret = -EINVAL;
+                       goto err_msg;
+               }
        }
 
        ret = ib_post_send_mad(packet->msg, NULL);
@@ -614,6 +626,8 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
        mutex_lock(&file->mutex);
 
        if (!file->port->ib_dev) {
+               dev_notice(file->port->dev,
+                          "ib_umad_reg_agent: invalid device\n");
                ret = -EPIPE;
                goto out;
        }
@@ -624,6 +638,9 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
        }
 
        if (ureq.qpn != 0 && ureq.qpn != 1) {
+               dev_notice(file->port->dev,
+                          "ib_umad_reg_agent: invalid QPN %d specified\n",
+                          ureq.qpn);
                ret = -EINVAL;
                goto out;
        }
@@ -632,11 +649,15 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
                if (!__get_agent(file, agent_id))
                        goto found;
 
+       dev_notice(file->port->dev,
+                  "ib_umad_reg_agent: Max Agents (%u) reached\n",
+                  IB_UMAD_MAX_AGENTS);
        ret = -ENOMEM;
        goto out;
 
 found:
        if (ureq.mgmt_class) {
+               memset(&req, 0, sizeof(req));
                req.mgmt_class         = ureq.mgmt_class;
                req.mgmt_class_version = ureq.mgmt_class_version;
                memcpy(req.oui, ureq.oui, sizeof req.oui);
@@ -657,7 +678,7 @@ found:
                                      ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
                                      ureq.mgmt_class ? &req : NULL,
                                      ureq.rmpp_version,
-                                     send_handler, recv_handler, file);
+                                     send_handler, recv_handler, file, 0);
        if (IS_ERR(agent)) {
                ret = PTR_ERR(agent);
                agent = NULL;
@@ -673,10 +694,11 @@ found:
        if (!file->already_used) {
                file->already_used = 1;
                if (!file->use_pkey_index) {
-                       printk(KERN_WARNING "user_mad: process %s did not enable "
-                              "P_Key index support.\n", current->comm);
-                       printk(KERN_WARNING "user_mad:   Documentation/infiniband/user_mad.txt "
-                              "has info on the new ABI.\n");
+                       dev_warn(file->port->dev,
+                               "process %s did not enable P_Key index support.\n",
+                               current->comm);
+                       dev_warn(file->port->dev,
+                               "   Documentation/infiniband/user_mad.txt has info on the new ABI.\n");
                }
        }
 
@@ -694,6 +716,119 @@ out:
        return ret;
 }
 
+/*
+ * ib_umad_reg_agent2 - register a MAD agent from an ib_user_mad_reg_req2
+ * @file: umad file the new agent is attached to
+ * @arg:  userspace pointer to a struct ib_user_mad_reg_req2
+ *
+ * Copies the request from userspace, validates the QPN (0 or 1), the
+ * registration flags and the OUI, takes the first agent_id for which
+ * __get_agent() returns NULL and registers the agent with
+ * ib_register_mad_agent().  On success the chosen agent_id is written
+ * back to the request's id field and stored in file->agent[].
+ *
+ * Returns 0 on success, -EPIPE if the port has no ib_dev, -EFAULT if the
+ * request cannot be read or a reply field cannot be written, -EINVAL for
+ * a bad QPN, flags or OUI, -ENOMEM when no agent_id below
+ * IB_UMAD_MAX_AGENTS is free, or the error from ib_register_mad_agent().
+ */
+static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg)
+{
+       struct ib_user_mad_reg_req2 ureq;
+       struct ib_mad_reg_req req;
+       struct ib_mad_agent *agent = NULL;
+       int agent_id;
+       int ret;
+
+       /* Lock order: the port's file_mutex first, then the file's mutex. */
+       mutex_lock(&file->port->file_mutex);
+       mutex_lock(&file->mutex);
+
+       if (!file->port->ib_dev) {
+               dev_notice(file->port->dev,
+                          "ib_umad_reg_agent2: invalid device\n");
+               ret = -EPIPE;
+               goto out;
+       }
+
+       if (copy_from_user(&ureq, arg, sizeof(ureq))) {
+               ret = -EFAULT;
+               goto out;
+       }
+
+       if (ureq.qpn != 0 && ureq.qpn != 1) {
+               dev_notice(file->port->dev,
+                          "ib_umad_reg_agent2: invalid QPN %d specified\n",
+                          ureq.qpn);
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * Unknown flags are rejected, but the supported mask is written
+        * back into the request's flags field so the caller can see it.
+        */
+       if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) {
+               dev_notice(file->port->dev,
+                          "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n",
+                          ureq.flags, IB_USER_MAD_REG_FLAGS_CAP);
+               ret = -EINVAL;
+
+               if (put_user((u32)IB_USER_MAD_REG_FLAGS_CAP,
+                               (u32 __user *) (arg + offsetof(struct
+                               ib_user_mad_reg_req2, flags))))
+                       ret = -EFAULT;
+
+               goto out;
+       }
+
+       for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+               if (!__get_agent(file, agent_id))
+                       goto found;
+
+       dev_notice(file->port->dev,
+                  "ib_umad_reg_agent2: Max Agents (%u) reached\n",
+                  IB_UMAD_MAX_AGENTS);
+       ret = -ENOMEM;
+       goto out;
+
+found:
+       /* A registration request is only built when a mgmt_class is given. */
+       if (ureq.mgmt_class) {
+               memset(&req, 0, sizeof(req));
+               req.mgmt_class         = ureq.mgmt_class;
+               req.mgmt_class_version = ureq.mgmt_class_version;
+               /* Only the low 24 bits of the OUI may be set. */
+               if (ureq.oui & 0xff000000) {
+                       dev_notice(file->port->dev,
+                                  "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n",
+                                  ureq.oui);
+                       ret = -EINVAL;
+                       goto out;
+               }
+               /* Unpack the OUI with its most significant byte in oui[0]. */
+               req.oui[2] =  ureq.oui & 0x0000ff;
+               req.oui[1] = (ureq.oui & 0x00ff00) >> 8;
+               req.oui[0] = (ureq.oui & 0xff0000) >> 16;
+               memcpy(req.method_mask, ureq.method_mask,
+                       sizeof(req.method_mask));
+       }
+
+       agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+                                     ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+                                     ureq.mgmt_class ? &req : NULL,
+                                     ureq.rmpp_version,
+                                     send_handler, recv_handler, file,
+                                     ureq.flags);
+       if (IS_ERR(agent)) {
+               ret = PTR_ERR(agent);
+               /* Keep the unwind at "out" from touching an error pointer. */
+               agent = NULL;
+               goto out;
+       }
+
+       /* Report the chosen agent_id through the request's id field. */
+       if (put_user(agent_id,
+                    (u32 __user *)(arg +
+                               offsetof(struct ib_user_mad_reg_req2, id)))) {
+               ret = -EFAULT;
+               goto out;
+       }
+
+       /*
+        * If the file has not been marked already_used yet, mark it and
+        * turn on use_pkey_index.
+        */
+       if (!file->already_used) {
+               file->already_used = 1;
+               file->use_pkey_index = 1;
+       }
+
+       file->agent[agent_id] = agent;
+       ret = 0;
+
+out:
+       mutex_unlock(&file->mutex);
+
+       /* Undo the registration if a later step failed. */
+       if (ret && agent)
+               ib_unregister_mad_agent(agent);
+
+       mutex_unlock(&file->port->file_mutex);
+
+       return ret;
+}
+
+
 static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
 {
        struct ib_mad_agent *agent = NULL;
@@ -749,6 +884,8 @@ static long ib_umad_ioctl(struct file *filp, unsigned int cmd,
                return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg);
        case IB_USER_MAD_ENABLE_PKEY:
                return ib_umad_enable_pkey(filp->private_data);
+       case IB_USER_MAD_REGISTER_AGENT2:
+               return ib_umad_reg_agent2(filp->private_data, (void __user *) arg);
        default:
                return -ENOIOCTLCMD;
        }
@@ -765,6 +902,8 @@ static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd,
                return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg));
        case IB_USER_MAD_ENABLE_PKEY:
                return ib_umad_enable_pkey(filp->private_data);
+       case IB_USER_MAD_REGISTER_AGENT2:
+               return ib_umad_reg_agent2(filp->private_data, compat_ptr(arg));
        default:
                return -ENOIOCTLCMD;
        }
@@ -983,7 +1122,7 @@ static CLASS_ATTR_STRING(abi_version, S_IRUGO,
 
 static dev_t overflow_maj;
 static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS);
-static int find_overflow_devnum(void)
+static int find_overflow_devnum(struct ib_device *device)
 {
        int ret;
 
@@ -991,7 +1130,8 @@ static int find_overflow_devnum(void)
                ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2,
                                          "infiniband_mad");
                if (ret) {
-                       printk(KERN_ERR "user_mad: couldn't register dynamic device number\n");
+                       dev_err(&device->dev,
+                               "couldn't register dynamic device number\n");
                        return ret;
                }
        }
@@ -1014,7 +1154,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
        devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
        if (devnum >= IB_UMAD_MAX_PORTS) {
                spin_unlock(&port_lock);
-               devnum = find_overflow_devnum();
+               devnum = find_overflow_devnum(device);
                if (devnum < 0)
                        return -1;
 
@@ -1200,14 +1340,14 @@ static int __init ib_umad_init(void)
        ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2,
                                     "infiniband_mad");
        if (ret) {
-               printk(KERN_ERR "user_mad: couldn't register device number\n");
+               pr_err("couldn't register device number\n");
                goto out;
        }
 
        umad_class = class_create(THIS_MODULE, "infiniband_mad");
        if (IS_ERR(umad_class)) {
                ret = PTR_ERR(umad_class);
-               printk(KERN_ERR "user_mad: couldn't create class infiniband_mad\n");
+               pr_err("couldn't create class infiniband_mad\n");
                goto out_chrdev;
        }
 
@@ -1215,13 +1355,13 @@ static int __init ib_umad_init(void)
 
        ret = class_create_file(umad_class, &class_attr_abi_version.attr);
        if (ret) {
-               printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n");
+               pr_err("couldn't create abi_version attribute\n");
                goto out_class;
        }
 
        ret = ib_register_client(&umad_client);
        if (ret) {
-               printk(KERN_ERR "user_mad: couldn't register ib_umad client\n");
+               pr_err("couldn't register ib_umad client\n");
                goto out_class;
        }
 
index a283274a5a09fa17b0dede8f5819ef9dfaf4458b..643c08a025a52d015431b8a27be1ddcacbd36845 100644 (file)
@@ -221,6 +221,7 @@ IB_UVERBS_DECLARE_CMD(query_port);
 IB_UVERBS_DECLARE_CMD(alloc_pd);
 IB_UVERBS_DECLARE_CMD(dealloc_pd);
 IB_UVERBS_DECLARE_CMD(reg_mr);
+IB_UVERBS_DECLARE_CMD(rereg_mr);
 IB_UVERBS_DECLARE_CMD(dereg_mr);
 IB_UVERBS_DECLARE_CMD(alloc_mw);
 IB_UVERBS_DECLARE_CMD(dealloc_mw);
index ea6203ee7bccb23279a70a67c740c4fd61a85701..0600c50e62151246e163751bd41a081b328bba38 100644 (file)
@@ -1002,6 +1002,99 @@ err_free:
        return ret;
 }
 
+/*
+ * ib_uverbs_rereg_mr - uverbs handler for IB_USER_VERBS_CMD_REREG_MR
+ * @file:    uverbs file issuing the command
+ * @buf:     userspace command buffer (struct ib_uverbs_rereg_mr followed
+ *           by driver-private data)
+ * @in_len:  length of @buf
+ * @out_len: space available for the response
+ *
+ * Validates the command, looks up the MR (and, with IB_MR_REREG_PD, the
+ * new PD), calls the device's rereg_user_mr() and copies the resulting
+ * lkey/rkey back to userspace.
+ *
+ * Returns @in_len on success, -ENOSPC if the response does not fit,
+ * -EFAULT on a failed user copy, -EINVAL for bad flags, range or handles,
+ * -EBUSY if the MR's usecnt is non-zero, or the error returned by
+ * ib_check_mr_access() / rereg_user_mr().
+ */
+ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
+                          const char __user *buf, int in_len,
+                          int out_len)
+{
+       struct ib_uverbs_rereg_mr_resp resp;
+       struct ib_uverbs_rereg_mr cmd;
+       struct ib_uobject *uobj;
+       struct ib_udata udata;
+       struct ib_pd *new_pd = NULL;
+       struct ib_pd *prev_pd;
+       struct ib_mr *mr;
+       int ret;
+
+       if (out_len < sizeof(resp))
+               return -ENOSPC;
+
+       if (copy_from_user(&cmd, buf, sizeof(cmd)))
+               return -EFAULT;
+
+       INIT_UDATA(&udata, buf + sizeof(cmd),
+                  (unsigned long) cmd.response + sizeof(resp),
+                  in_len - sizeof(cmd), out_len - sizeof(resp));
+
+       /* At least one flag must be given, and only supported ones. */
+       if (!cmd.flags)
+               return -EINVAL;
+       if (cmd.flags & ~IB_MR_REREG_SUPPORTED)
+               return -EINVAL;
+
+       /*
+        * A new translation needs a non-empty range, and the user address
+        * and HCA address must share the same offset within a page.
+        */
+       if (cmd.flags & IB_MR_REREG_TRANS) {
+               if (!cmd.start || !cmd.hca_va || cmd.length <= 0)
+                       return -EINVAL;
+               if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
+                       return -EINVAL;
+       }
+
+       uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle,
+                             file->ucontext);
+       if (!uobj)
+               return -EINVAL;
+
+       mr = uobj->object;
+
+       if (cmd.flags & IB_MR_REREG_ACCESS) {
+               ret = ib_check_mr_access(cmd.access_flags);
+               if (ret)
+                       goto out_put_mr;
+       }
+
+       if (cmd.flags & IB_MR_REREG_PD) {
+               new_pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+               if (!new_pd) {
+                       ret = -EINVAL;
+                       goto out_put_mr;
+               }
+       }
+
+       if (atomic_read(&mr->usecnt)) {
+               ret = -EBUSY;
+               goto out_put_pd;
+       }
+
+       prev_pd = mr->pd;
+       ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start,
+                                       cmd.length, cmd.hca_va,
+                                       cmd.access_flags, new_pd, &udata);
+       if (ret)
+               goto out_put_pd;
+
+       /* Move the MR's usecnt reference from the old PD to the new one. */
+       if (cmd.flags & IB_MR_REREG_PD) {
+               atomic_inc(&new_pd->usecnt);
+               mr->pd = new_pd;
+               atomic_dec(&prev_pd->usecnt);
+       }
+
+       memset(&resp, 0, sizeof(resp));
+       resp.lkey = mr->lkey;
+       resp.rkey = mr->rkey;
+
+       ret = copy_to_user((void __user *)(unsigned long)cmd.response,
+                          &resp, sizeof(resp)) ? -EFAULT : in_len;
+
+out_put_pd:
+       if (cmd.flags & IB_MR_REREG_PD)
+               put_pd_read(new_pd);
+
+out_put_mr:
+       put_uobj_write(mr->uobject);
+
+       return ret;
+}
+
 ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
                           const char __user *buf, int in_len,
                           int out_len)
index 08219fb3338b0652f350da1af864b4cfb355ff3c..c73b22a257fe3c92c9398e5318acbc4ffa569b90 100644 (file)
@@ -87,6 +87,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
        [IB_USER_VERBS_CMD_ALLOC_PD]            = ib_uverbs_alloc_pd,
        [IB_USER_VERBS_CMD_DEALLOC_PD]          = ib_uverbs_dealloc_pd,
        [IB_USER_VERBS_CMD_REG_MR]              = ib_uverbs_reg_mr,
+       [IB_USER_VERBS_CMD_REREG_MR]            = ib_uverbs_rereg_mr,
        [IB_USER_VERBS_CMD_DEREG_MR]            = ib_uverbs_dereg_mr,
        [IB_USER_VERBS_CMD_ALLOC_MW]            = ib_uverbs_alloc_mw,
        [IB_USER_VERBS_CMD_DEALLOC_MW]          = ib_uverbs_dealloc_mw,
index 49e0e8533f748d8dc3bc8ccec234a22b0bfc242c..1b63185b4ad4fa977bc4f211fe159cd32c83aabb 100644 (file)
@@ -260,11 +260,14 @@ static void c2_free_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq)
                          mq->msg_pool.host, dma_unmap_addr(mq, mapping));
 }
 
-static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq, int q_size,
-                          int msg_size)
+static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq,
+                          size_t q_size, size_t msg_size)
 {
        u8 *pool_start;
 
+       if (q_size > SIZE_MAX / msg_size)
+               return -EINVAL;
+
        pool_start = dma_alloc_coherent(&c2dev->pcidev->dev, q_size * msg_size,
                                        &mq->host_dma, GFP_KERNEL);
        if (!pool_start)
index fbe6051af254bac612de62dd36e1fa66d3150396..c9df0549f51dc0921eede052f8cf0753ce83e0df 100644 (file)
@@ -227,6 +227,7 @@ int c4iw_ev_handler(struct c4iw_dev *dev, u32 qid)
 
        chp = get_chp(dev, qid);
        if (chp) {
+               t4_clear_cq_armed(&chp->cq);
                spin_lock_irqsave(&chp->comp_handler_lock, flag);
                (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
                spin_unlock_irqrestore(&chp->comp_handler_lock, flag);
index c158fcc02bca2b252d8d78b71599697fee361438..41cd6882b648128f06476e948c56b94792e807bb 100644 (file)
@@ -1105,7 +1105,7 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp,
                       struct c4iw_cq *schp)
 {
        int count;
-       int flushed;
+       int rq_flushed, sq_flushed;
        unsigned long flag;
 
        PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp);
@@ -1123,27 +1123,40 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp,
 
        c4iw_flush_hw_cq(rchp);
        c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count);
-       flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count);
+       rq_flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count);
        spin_unlock(&qhp->lock);
        spin_unlock_irqrestore(&rchp->lock, flag);
-       if (flushed) {
-               spin_lock_irqsave(&rchp->comp_handler_lock, flag);
-               (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
-               spin_unlock_irqrestore(&rchp->comp_handler_lock, flag);
-       }
 
        /* locking hierarchy: cq lock first, then qp lock. */
        spin_lock_irqsave(&schp->lock, flag);
        spin_lock(&qhp->lock);
        if (schp != rchp)
                c4iw_flush_hw_cq(schp);
-       flushed = c4iw_flush_sq(qhp);
+       sq_flushed = c4iw_flush_sq(qhp);
        spin_unlock(&qhp->lock);
        spin_unlock_irqrestore(&schp->lock, flag);
-       if (flushed) {
-               spin_lock_irqsave(&schp->comp_handler_lock, flag);
-               (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context);
-               spin_unlock_irqrestore(&schp->comp_handler_lock, flag);
+
+       if (schp == rchp) {
+               if (t4_clear_cq_armed(&rchp->cq) &&
+                   (rq_flushed || sq_flushed)) {
+                       spin_lock_irqsave(&rchp->comp_handler_lock, flag);
+                       (*rchp->ibcq.comp_handler)(&rchp->ibcq,
+                                                  rchp->ibcq.cq_context);
+                       spin_unlock_irqrestore(&rchp->comp_handler_lock, flag);
+               }
+       } else {
+               if (t4_clear_cq_armed(&rchp->cq) && rq_flushed) {
+                       spin_lock_irqsave(&rchp->comp_handler_lock, flag);
+                       (*rchp->ibcq.comp_handler)(&rchp->ibcq,
+                                                  rchp->ibcq.cq_context);
+                       spin_unlock_irqrestore(&rchp->comp_handler_lock, flag);
+               }
+               if (t4_clear_cq_armed(&schp->cq) && sq_flushed) {
+                       spin_lock_irqsave(&schp->comp_handler_lock, flag);
+                       (*schp->ibcq.comp_handler)(&schp->ibcq,
+                                                  schp->ibcq.cq_context);
+                       spin_unlock_irqrestore(&schp->comp_handler_lock, flag);
+               }
        }
 }
 
index df5edfa31a8fe118b950d500fc4bed3c76da5d47..c04e5134b30cb27055740f572403b25cd4c1dc6a 100644 (file)
@@ -524,6 +524,10 @@ static inline int t4_wq_db_enabled(struct t4_wq *wq)
        return !wq->rq.queue[wq->rq.size].status.db_off;
 }
 
+/*
+ * Bit numbers (not masks) within t4_cq.flags.  CQ_ARMED is set with
+ * set_bit() in t4_arm_cq() and consumed with test_and_clear_bit() in
+ * t4_clear_cq_armed().
+ */
+enum t4_cq_flags {
+       CQ_ARMED        = 1,
+};
+
 struct t4_cq {
        struct t4_cqe *queue;
        dma_addr_t dma_addr;
@@ -544,12 +548,19 @@ struct t4_cq {
        u16 cidx_inc;
        u8 gen;
        u8 error;
+       unsigned long flags;
 };
 
+/*
+ * Clear the CQ_ARMED bit in cq->flags and report whether it was set:
+ * non-zero if the CQ had been armed, 0 otherwise.
+ */
+static inline int t4_clear_cq_armed(struct t4_cq *cq)
+{
+       int was_armed = test_and_clear_bit(CQ_ARMED, &cq->flags);
+
+       return was_armed;
+}
+
 static inline int t4_arm_cq(struct t4_cq *cq, int se)
 {
        u32 val;
 
+       set_bit(CQ_ARMED, &cq->flags);
        while (cq->cidx_inc > CIDXINC_MASK) {
                val = SEINTARM(0) | CIDXINC(CIDXINC_MASK) | TIMERREG(7) |
                      INGRESSQID(cq->cqid);
index 43f2d0424d4fb60e95c17a8d96cdd69ba6767f07..e890e5ba0e011b550d98442c192477fea19d4a34 100644 (file)
@@ -726,7 +726,7 @@ bail:
  * @dd: the infinipath device
  * @pkeys: the PKEY table
  */
-static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys)
+static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys, u8 port)
 {
        struct ipath_portdata *pd;
        int i;
@@ -759,6 +759,7 @@ static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys)
        }
        if (changed) {
                u64 pkey;
+               struct ib_event event;
 
                pkey = (u64) dd->ipath_pkeys[0] |
                        ((u64) dd->ipath_pkeys[1] << 16) |
@@ -768,12 +769,17 @@ static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys)
                           (unsigned long long) pkey);
                ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
                                 pkey);
+
+               event.event = IB_EVENT_PKEY_CHANGE;
+               event.device = &dd->verbs_dev->ibdev;
+               event.element.port_num = port;
+               ib_dispatch_event(&event);
        }
        return 0;
 }
 
 static int recv_subn_set_pkeytable(struct ib_smp *smp,
-                                  struct ib_device *ibdev)
+                                  struct ib_device *ibdev, u8 port)
 {
        u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
        __be16 *p = (__be16 *) smp->data;
@@ -784,7 +790,7 @@ static int recv_subn_set_pkeytable(struct ib_smp *smp,
        for (i = 0; i < n; i++)
                q[i] = be16_to_cpu(p[i]);
 
-       if (startpx != 0 || set_pkeys(dev->dd, q) != 0)
+       if (startpx != 0 || set_pkeys(dev->dd, q, port) != 0)
                smp->status |= IB_SMP_INVALID_FIELD;
 
        return recv_subn_get_pkeytable(smp, ibdev);
@@ -1342,7 +1348,7 @@ static int process_subn(struct ib_device *ibdev, int mad_flags,
                        ret = recv_subn_set_portinfo(smp, ibdev, port_num);
                        goto bail;
                case IB_SMP_ATTR_PKEY_TABLE:
-                       ret = recv_subn_set_pkeytable(smp, ibdev);
+                       ret = recv_subn_set_pkeytable(smp, ibdev, port_num);
                        goto bail;
                case IB_SMP_ATTR_SM_INFO:
                        if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
index 287ad0564acdfa5444c1a05d8dec2d9edd91416e..82a7dd87089b66efa0e925116b4d72053c341a3f 100644 (file)
@@ -891,7 +891,7 @@ int mlx4_ib_mad_init(struct mlx4_ib_dev *dev)
                                agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
                                                              q ? IB_QPT_GSI : IB_QPT_SMI,
                                                              NULL, 0, send_handler,
-                                                             NULL, NULL);
+                                                             NULL, NULL, 0);
                                if (IS_ERR(agent)) {
                                        ret = PTR_ERR(agent);
                                        goto err;
index 0f7027e7db138f248dd46ab65a25a44fe06c2b9f..e1e558a3d692bbd8b907a84efe222481b0ae1a8e 100644 (file)
@@ -910,8 +910,7 @@ static int __mlx4_ib_default_rules_match(struct ib_qp *qp,
        const struct default_rules *pdefault_rules = default_table;
        u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port);
 
-       for (i = 0; i < sizeof(default_table)/sizeof(default_table[0]); i++,
-            pdefault_rules++) {
+       for (i = 0; i < ARRAY_SIZE(default_table); i++, pdefault_rules++) {
                __u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS];
                memset(&field_types, 0, sizeof(field_types));
 
@@ -965,8 +964,7 @@ static int __mlx4_ib_create_default_rules(
        int size = 0;
        int i;
 
-       for (i = 0; i < sizeof(pdefault_rules->rules_create_list)/
-                       sizeof(pdefault_rules->rules_create_list[0]); i++) {
+       for (i = 0; i < ARRAY_SIZE(pdefault_rules->rules_create_list); i++) {
                int ret;
                union ib_flow_spec ib_spec;
                switch (pdefault_rules->rules_create_list[i]) {
@@ -2007,6 +2005,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
                (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
                (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
                (1ull << IB_USER_VERBS_CMD_REG_MR)              |
+               (1ull << IB_USER_VERBS_CMD_REREG_MR)            |
                (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
                (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
                (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
@@ -2059,6 +2058,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
        ibdev->ib_dev.req_notify_cq     = mlx4_ib_arm_cq;
        ibdev->ib_dev.get_dma_mr        = mlx4_ib_get_dma_mr;
        ibdev->ib_dev.reg_user_mr       = mlx4_ib_reg_user_mr;
+       ibdev->ib_dev.rereg_user_mr     = mlx4_ib_rereg_user_mr;
        ibdev->ib_dev.dereg_mr          = mlx4_ib_dereg_mr;
        ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr;
        ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list;
index 369da3ca5d6484878e7b2af86e772cb94c1fc327..e8cad3926bfc350ba3d505327a29d19136e6ff10 100644 (file)
@@ -788,5 +788,9 @@ int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn);
 void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count);
 int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
                         int is_attach);
+int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
+                         u64 start, u64 length, u64 virt_addr,
+                         int mr_access_flags, struct ib_pd *pd,
+                         struct ib_udata *udata);
 
 #endif /* MLX4_IB_H */
index cb2a8727f3fb1160c64ceefe1f4ef99414de0dd6..9b0e80e59b087af9cac810977fbcefc7a939ffc6 100644 (file)
@@ -144,8 +144,10 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        if (!mr)
                return ERR_PTR(-ENOMEM);
 
+       /* Force registering the memory as writable. */
+       /* Used for memory re-registration. HCA protects the access */
        mr->umem = ib_umem_get(pd->uobject->context, start, length,
-                              access_flags, 0);
+                              access_flags | IB_ACCESS_LOCAL_WRITE, 0);
        if (IS_ERR(mr->umem)) {
                err = PTR_ERR(mr->umem);
                goto err_free;
@@ -183,6 +185,90 @@ err_free:
        return ERR_PTR(err);
 }
 
+/*
+ * mlx4_ib_rereg_user_mr - change attributes of an existing user MR
+ * @mr:              memory region to modify
+ * @flags:           IB_MR_REREG_PD / IB_MR_REREG_ACCESS / IB_MR_REREG_TRANS
+ *                   bits selecting what to change
+ * @start:           user address of the new range (IB_MR_REREG_TRANS only)
+ * @length:          length of the new range (IB_MR_REREG_TRANS only)
+ * @virt_addr:       new iova of the MR (IB_MR_REREG_TRANS only)
+ * @mr_access_flags: new access flags (IB_MR_REREG_ACCESS), also passed to
+ *                   ib_umem_get() when the translation changes
+ * @pd:              new protection domain (IB_MR_REREG_PD only)
+ * @udata:           not used by this function
+ *
+ * Fetches the MR's MPT entry, applies the requested changes to it and
+ * writes it back to the HCA.  Returns 0 on success or a negative errno
+ * from the first step that failed.
+ */
+int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
+                         u64 start, u64 length, u64 virt_addr,
+                         int mr_access_flags, struct ib_pd *pd,
+                         struct ib_udata *udata)
+{
+       struct mlx4_ib_dev *dev = to_mdev(mr->device);
+       struct mlx4_ib_mr *mmr = to_mmr(mr);
+       struct mlx4_mpt_entry *mpt_entry;
+       struct mlx4_mpt_entry **pmpt_entry = &mpt_entry;
+       int err;
+
+       /* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs,
+        * we assume that the calls can't run concurrently. Otherwise, a
+        * race exists.
+        */
+       err =  mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry);
+
+       if (err)
+               return err;
+
+       if (flags & IB_MR_REREG_PD) {
+               err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry,
+                                          to_mpd(pd)->pdn);
+
+               if (err)
+                       goto release_mpt_entry;
+       }
+
+       if (flags & IB_MR_REREG_ACCESS) {
+               err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry,
+                                              convert_access(mr_access_flags));
+
+               if (err)
+                       goto release_mpt_entry;
+       }
+
+       if (flags & IB_MR_REREG_TRANS) {
+               int shift;
+               int n;
+
+               /*
+                * No local "err" here: the function-level variable must
+                * carry failures from this branch to the return below.
+                * A shadowing declaration made them return 0.
+                */
+               mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
+               ib_umem_release(mmr->umem);
+               /* Pin the new range writable; the HCA enforces the access. */
+               mmr->umem = ib_umem_get(mr->uobject->context, start, length,
+                                       mr_access_flags |
+                                       IB_ACCESS_LOCAL_WRITE,
+                                       0);
+               if (IS_ERR(mmr->umem)) {
+                       err = PTR_ERR(mmr->umem);
+                       mmr->umem = NULL;
+                       goto release_mpt_entry;
+               }
+               n = ib_umem_page_count(mmr->umem);
+               shift = ilog2(mmr->umem->page_size);
+
+               mmr->mmr.iova       = virt_addr;
+               mmr->mmr.size       = length;
+               err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr,
+                                             virt_addr, length, n, shift,
+                                             *pmpt_entry);
+               if (err) {
+                       ib_umem_release(mmr->umem);
+                       /*
+                        * Do not leave a released umem behind, matching
+                        * the IS_ERR() path above.
+                        * NOTE(review): assumes dereg skips a NULL umem.
+                        */
+                       mmr->umem = NULL;
+                       goto release_mpt_entry;
+               }
+
+               err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem);
+               if (err) {
+                       mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
+                       ib_umem_release(mmr->umem);
+                       mmr->umem = NULL;
+                       goto release_mpt_entry;
+               }
+       }
+
+       /* If we couldn't transfer the MR to the HCA, just remember to
+        * return a failure. But dereg_mr will free the resources.
+        */
+       err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry);
+
+release_mpt_entry:
+       mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry);
+
+       return err;
+}
+
 int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
 {
        struct mlx4_ib_mr *mr = to_mmr(ibmr);
index 7efe6e3f3542f39b79a69fdb4f5ffb55c6c93ea1..8c574b63d77b900768a71ffe334d5cf046b7d83d 100644 (file)
@@ -2501,7 +2501,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
        spin_lock_irqsave(&qp->sq.lock, flags);
 
        for (nreq = 0; wr; nreq++, wr = wr->next) {
-               if (unlikely(wr->opcode >= sizeof(mlx5_ib_opcode) / sizeof(mlx5_ib_opcode[0]))) {
+               if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) {
                        mlx5_ib_warn(dev, "\n");
                        err = -EINVAL;
                        *bad_wr = wr;
index b6f7f457fc550b38fd4ee17aa861aff8aab80519..8881fa376e06fa6e87b9bde29a1e1581727a0341 100644 (file)
@@ -294,7 +294,7 @@ int mthca_create_agents(struct mthca_dev *dev)
                        agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
                                                      q ? IB_QPT_GSI : IB_QPT_SMI,
                                                      NULL, 0, send_handler,
-                                                     NULL, NULL);
+                                                     NULL, NULL, 0);
                        if (IS_ERR(agent)) {
                                ret = PTR_ERR(agent);
                                goto err;
index 19011dbb930fb38d899c6f365bd981ce94a70174..b43456ae124bccb99cfe78446b1e1ce347c8f2a6 100644 (file)
@@ -40,7 +40,7 @@
 #include <be_roce.h>
 #include "ocrdma_sli.h"
 
-#define OCRDMA_ROCE_DRV_VERSION "10.2.145.0u"
+#define OCRDMA_ROCE_DRV_VERSION "10.2.287.0u"
 
 #define OCRDMA_ROCE_DRV_DESC "Emulex OneConnect RoCE Driver"
 #define OCRDMA_NODE_DESC "Emulex OneConnect RoCE HCA"
@@ -137,6 +137,7 @@ struct mqe_ctx {
        u16 cqe_status;
        u16 ext_status;
        bool cmd_done;
+       bool fw_error_state;
 };
 
 struct ocrdma_hw_mr {
@@ -235,7 +236,10 @@ struct ocrdma_dev {
        struct list_head entry;
        struct rcu_head rcu;
        int id;
-       u64 stag_arr[OCRDMA_MAX_STAG];
+       u64 *stag_arr;
+       u8 sl; /* service level */
+       bool pfc_state;
+       atomic_t update_sl;
        u16 pvid;
        u32 asic_id;
 
@@ -518,4 +522,22 @@ static inline u8 ocrdma_get_asic_type(struct ocrdma_dev *dev)
                                OCRDMA_SLI_ASIC_GEN_NUM_SHIFT;
 }
 
+/* Return the byte stored at index @prio of the @pfc table. */
+static inline u8 ocrdma_get_pfc_prio(u8 *pfc, u8 prio)
+{
+       return pfc[prio];
+}
+
+/* Return the byte stored at index @prio of the @app_prio table. */
+static inline u8 ocrdma_get_app_prio(u8 *app_prio, u8 prio)
+{
+       return app_prio[prio];
+}
+
+/*
+ * Return 1 when both OCRDMA_STATE_FLAG_ENABLED and OCRDMA_STATE_FLAG_SYNC
+ * are set in @state, 0 otherwise.
+ *
+ * May also be used to interpret TC-state, QCN-state, Appl-state and
+ * Logical-link-state in future.
+ */
+static inline u8 ocrdma_is_enabled_and_synced(u32 state)
+{
+       if (!(state & OCRDMA_STATE_FLAG_ENABLED))
+               return 0;
+
+       return (state & OCRDMA_STATE_FLAG_SYNC) ? 1 : 0;
+}
+
 #endif
index d4cc01f10c015654f966000ef28c6b7a82435232..40f8536c10b00e60db462ab9eaef26fd7a46945c 100644 (file)
@@ -35,6 +35,8 @@
 #include "ocrdma_ah.h"
 #include "ocrdma_hw.h"
 
+#define OCRDMA_VID_PCP_SHIFT   0xD
+
 static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
                                struct ib_ah_attr *attr, int pdid)
 {
@@ -55,7 +57,7 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
        if (vlan_tag && (vlan_tag < 0x1000)) {
                eth.eth_type = cpu_to_be16(0x8100);
                eth.roce_eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE);
-               vlan_tag |= (attr->sl & 7) << 13;
+               vlan_tag |= (dev->sl & 0x07) << OCRDMA_VID_PCP_SHIFT;
                eth.vlan_tag = cpu_to_be16(vlan_tag);
                eth_sz = sizeof(struct ocrdma_eth_vlan);
                vlan_enabled = true;
@@ -100,6 +102,8 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
        if (!(attr->ah_flags & IB_AH_GRH))
                return ERR_PTR(-EINVAL);
 
+       if (atomic_cmpxchg(&dev->update_sl, 1, 0))
+               ocrdma_init_service_level(dev);
        ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
        if (!ah)
                return ERR_PTR(-ENOMEM);
index 3bbf2010a82180e1f178e2af168dfd9aa20b195c..dd35ae558ae1ce1fe25e46679e7932d383493881 100644 (file)
@@ -525,7 +525,7 @@ static int ocrdma_mbx_mq_cq_create(struct ocrdma_dev *dev,
 
        cmd->ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS;
        cmd->eqn = eq->id;
-       cmd->cqe_count = cq->size / sizeof(struct ocrdma_mcqe);
+       cmd->pdid_cqecnt = cq->size / sizeof(struct ocrdma_mcqe);
 
        ocrdma_build_q_pages(&cmd->pa[0], cq->size / OCRDMA_MIN_Q_PAGE_SIZE,
                             cq->dma, PAGE_SIZE_4K);
@@ -661,7 +661,7 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev,
 {
        struct ocrdma_qp *qp = NULL;
        struct ocrdma_cq *cq = NULL;
-       struct ib_event ib_evt = { 0 };
+       struct ib_event ib_evt;
        int cq_event = 0;
        int qp_event = 1;
        int srq_event = 0;
@@ -674,6 +674,8 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev,
        if (cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQVALID)
                cq = dev->cq_tbl[cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQID_MASK];
 
+       memset(&ib_evt, 0, sizeof(ib_evt));
+
        ib_evt.device = &dev->ibdev;
 
        switch (type) {
@@ -771,6 +773,10 @@ static void ocrdma_process_grp5_aync(struct ocrdma_dev *dev,
                                        OCRDMA_AE_PVID_MCQE_TAG_MASK) >>
                                        OCRDMA_AE_PVID_MCQE_TAG_SHIFT);
                break;
+
+       case OCRDMA_ASYNC_EVENT_COS_VALUE:
+               atomic_set(&dev->update_sl, 1);
+               break;
        default:
                /* Not interested evts. */
                break;
@@ -962,8 +968,12 @@ static int ocrdma_wait_mqe_cmpl(struct ocrdma_dev *dev)
                                    msecs_to_jiffies(30000));
        if (status)
                return 0;
-       else
+       else {
+               dev->mqe_ctx.fw_error_state = true;
+               pr_err("%s(%d) mailbox timeout: fw not responding\n",
+                      __func__, dev->id);
                return -1;
+       }
 }
 
 /* issue a mailbox command on the MQ */
@@ -975,6 +985,8 @@ static int ocrdma_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe)
        struct ocrdma_mbx_rsp *rsp = NULL;
 
        mutex_lock(&dev->mqe_ctx.lock);
+       if (dev->mqe_ctx.fw_error_state)
+               goto mbx_err;
        ocrdma_post_mqe(dev, mqe);
        status = ocrdma_wait_mqe_cmpl(dev);
        if (status)
@@ -1078,7 +1090,8 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev,
            OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT;
        attr->max_mw = rsp->max_mw;
        attr->max_mr = rsp->max_mr;
-       attr->max_mr_size = ~0ull;
+       attr->max_mr_size = ((u64)rsp->max_mr_size_hi << 32) |
+                             rsp->max_mr_size_lo;
        attr->max_fmr = 0;
        attr->max_pages_per_frmr = rsp->max_pages_per_frmr;
        attr->max_num_mr_pbl = rsp->max_num_mr_pbl;
@@ -1252,7 +1265,9 @@ static int ocrdma_mbx_get_ctrl_attribs(struct ocrdma_dev *dev)
                ctrl_attr_rsp = (struct ocrdma_get_ctrl_attribs_rsp *)dma.va;
                hba_attribs = &ctrl_attr_rsp->ctrl_attribs.hba_attribs;
 
-               dev->hba_port_num = hba_attribs->phy_port;
+               dev->hba_port_num = (hba_attribs->ptpnum_maxdoms_hbast_cv &
+                                       OCRDMA_HBA_ATTRB_PTNUM_MASK)
+                                       >> OCRDMA_HBA_ATTRB_PTNUM_SHIFT;
                strncpy(dev->model_number,
                        hba_attribs->controller_model_number, 31);
        }
@@ -1302,7 +1317,8 @@ int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed)
                goto mbx_err;
 
        rsp = (struct ocrdma_get_link_speed_rsp *)cmd;
-       *lnk_speed = rsp->phys_port_speed;
+       *lnk_speed = (rsp->pflt_pps_ld_pnum & OCRDMA_PHY_PS_MASK)
+                       >> OCRDMA_PHY_PS_SHIFT;
 
 mbx_err:
        kfree(cmd);
@@ -1328,11 +1344,16 @@ static int ocrdma_mbx_get_phy_info(struct ocrdma_dev *dev)
                goto mbx_err;
 
        rsp = (struct ocrdma_get_phy_info_rsp *)cmd;
-       dev->phy.phy_type = le16_to_cpu(rsp->phy_type);
+       dev->phy.phy_type =
+                       (rsp->ityp_ptyp & OCRDMA_PHY_TYPE_MASK);
+       dev->phy.interface_type =
+                       (rsp->ityp_ptyp & OCRDMA_IF_TYPE_MASK)
+                               >> OCRDMA_IF_TYPE_SHIFT;
        dev->phy.auto_speeds_supported  =
-                       le16_to_cpu(rsp->auto_speeds_supported);
+                       (rsp->fspeed_aspeed & OCRDMA_ASPEED_SUPP_MASK);
        dev->phy.fixed_speeds_supported =
-                       le16_to_cpu(rsp->fixed_speeds_supported);
+                       (rsp->fspeed_aspeed & OCRDMA_FSPEED_SUPP_MASK)
+                               >> OCRDMA_FSPEED_SUPP_SHIFT;
 mbx_err:
        kfree(cmd);
        return status;
@@ -1457,8 +1478,8 @@ static int ocrdma_mbx_create_ah_tbl(struct ocrdma_dev *dev)
 
        pbes = (struct ocrdma_pbe *)dev->av_tbl.pbl.va;
        for (i = 0; i < dev->av_tbl.size / OCRDMA_MIN_Q_PAGE_SIZE; i++) {
-               pbes[i].pa_lo = (u32) (pa & 0xffffffff);
-               pbes[i].pa_hi = (u32) upper_32_bits(pa);
+               pbes[i].pa_lo = (u32)cpu_to_le32(pa & 0xffffffff);
+               pbes[i].pa_hi = (u32)cpu_to_le32(upper_32_bits(pa));
                pa += PAGE_SIZE;
        }
        cmd->tbl_addr[0].lo = (u32)(dev->av_tbl.pbl.pa & 0xFFFFFFFF);
@@ -1501,6 +1522,7 @@ static void ocrdma_mbx_delete_ah_tbl(struct ocrdma_dev *dev)
        ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
        dma_free_coherent(&pdev->dev, dev->av_tbl.size, dev->av_tbl.va,
                          dev->av_tbl.pa);
+       dev->av_tbl.va = NULL;
        dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->av_tbl.pbl.va,
                          dev->av_tbl.pbl.pa);
        kfree(cmd);
@@ -1624,14 +1646,16 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq,
                        cmd->cmd.pgsz_pgcnt |= OCRDMA_CREATE_CQ_DPP <<
                                OCRDMA_CREATE_CQ_TYPE_SHIFT;
                cq->phase_change = false;
-               cmd->cmd.cqe_count = (cq->len / cqe_size);
+               cmd->cmd.pdid_cqecnt = (cq->len / cqe_size);
        } else {
-               cmd->cmd.cqe_count = (cq->len / cqe_size) - 1;
+               cmd->cmd.pdid_cqecnt = (cq->len / cqe_size) - 1;
                cmd->cmd.ev_cnt_flags |= OCRDMA_CREATE_CQ_FLAGS_AUTO_VALID;
                cq->phase_change = true;
        }
 
-       cmd->cmd.pd_id = pd_id; /* valid only for v3 */
+       /* pd_id valid only for v3 */
+       cmd->cmd.pdid_cqecnt |= (pd_id <<
+               OCRDMA_CREATE_CQ_CMD_PDID_SHIFT);
        ocrdma_build_q_pages(&cmd->cmd.pa[0], hw_pages, cq->pa, page_size);
        status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
        if (status)
@@ -2206,7 +2230,8 @@ int ocrdma_mbx_create_qp(struct ocrdma_qp *qp, struct ib_qp_init_attr *attrs,
                                OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK;
        qp->rq_cq = cq;
 
-       if (pd->dpp_enabled && pd->num_dpp_qp) {
+       if (pd->dpp_enabled && attrs->cap.max_inline_data && pd->num_dpp_qp &&
+           (attrs->cap.max_inline_data <= dev->attr.max_inline_data)) {
                ocrdma_set_create_qp_dpp_cmd(cmd, pd, qp, enable_dpp_cq,
                                             dpp_cq_id);
        }
@@ -2264,6 +2289,8 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp,
 
        if ((ah_attr->ah_flags & IB_AH_GRH) == 0)
                return -EINVAL;
+       if (atomic_cmpxchg(&qp->dev->update_sl, 1, 0))
+               ocrdma_init_service_level(qp->dev);
        cmd->params.tclass_sq_psn |=
            (ah_attr->grh.traffic_class << OCRDMA_QP_PARAMS_TCLASS_SHIFT);
        cmd->params.rnt_rc_sl_fl |=
@@ -2297,6 +2324,8 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp,
                cmd->params.vlan_dmac_b4_to_b5 |=
                    vlan_id << OCRDMA_QP_PARAMS_VLAN_SHIFT;
                cmd->flags |= OCRDMA_QP_PARA_VLAN_EN_VALID;
+               cmd->params.rnt_rc_sl_fl |=
+                       (qp->dev->sl & 0x07) << OCRDMA_QP_PARAMS_SL_SHIFT;
        }
        return 0;
 }
@@ -2604,6 +2633,168 @@ int ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq)
        return status;
 }
 
+static int ocrdma_mbx_get_dcbx_config(struct ocrdma_dev *dev, u32 ptype,
+                                     struct ocrdma_dcbx_cfg *dcbxcfg)
+{
+       int status = 0;
+       dma_addr_t pa;
+       struct ocrdma_mqe cmd;
+
+       struct ocrdma_get_dcbx_cfg_req *req = NULL;
+       struct ocrdma_get_dcbx_cfg_rsp *rsp = NULL;
+       struct pci_dev *pdev = dev->nic_info.pdev;
+       struct ocrdma_mqe_sge *mqe_sge = cmd.u.nonemb_req.sge;
+
+       memset(&cmd, 0, sizeof(struct ocrdma_mqe));
+       cmd.hdr.pyld_len = max_t (u32, sizeof(struct ocrdma_get_dcbx_cfg_rsp),
+                                       sizeof(struct ocrdma_get_dcbx_cfg_req));
+       req = dma_alloc_coherent(&pdev->dev, cmd.hdr.pyld_len, &pa, GFP_KERNEL);
+       if (!req) {
+               status = -ENOMEM;
+               goto mem_err;
+       }
+
+       cmd.hdr.spcl_sge_cnt_emb |= (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) &
+                                       OCRDMA_MQE_HDR_SGE_CNT_MASK;
+       mqe_sge->pa_lo = (u32) (pa & 0xFFFFFFFFUL);
+       mqe_sge->pa_hi = (u32) upper_32_bits(pa);
+       mqe_sge->len = cmd.hdr.pyld_len;
+
+       memset(req, 0, sizeof(struct ocrdma_get_dcbx_cfg_req));
+       ocrdma_init_mch(&req->hdr, OCRDMA_CMD_GET_DCBX_CONFIG,
+                       OCRDMA_SUBSYS_DCBX, cmd.hdr.pyld_len);
+       req->param_type = ptype;
+
+       status = ocrdma_mbx_cmd(dev, &cmd);
+       if (status)
+               goto mbx_err;
+
+       rsp = (struct ocrdma_get_dcbx_cfg_rsp *)req;
+       ocrdma_le32_to_cpu(rsp, sizeof(struct ocrdma_get_dcbx_cfg_rsp));
+       memcpy(dcbxcfg, &rsp->cfg, sizeof(struct ocrdma_dcbx_cfg));
+
+mbx_err:
+       dma_free_coherent(&pdev->dev, cmd.hdr.pyld_len, req, pa);
+mem_err:
+       return status;
+}
+
+#define OCRDMA_MAX_SERVICE_LEVEL_INDEX 0x08
+#define OCRDMA_DEFAULT_SERVICE_LEVEL   0x05
+
+static int ocrdma_parse_dcbxcfg_rsp(struct ocrdma_dev *dev, int ptype,
+                                   struct ocrdma_dcbx_cfg *dcbxcfg,
+                                   u8 *srvc_lvl)
+{
+       int status = -EINVAL, indx, slindx;
+       int ventry_cnt;
+       struct ocrdma_app_parameter *app_param;
+       u8 valid, proto_sel;
+       u8 app_prio, pfc_prio;
+       u16 proto;
+
+       if (!(dcbxcfg->tcv_aev_opv_st & OCRDMA_DCBX_STATE_MASK)) {
+               pr_info("%s ocrdma%d DCBX is disabled\n",
+                       dev_name(&dev->nic_info.pdev->dev), dev->id);
+               goto out;
+       }
+
+       if (!ocrdma_is_enabled_and_synced(dcbxcfg->pfc_state)) {
+               pr_info("%s ocrdma%d priority flow control(%s) is %s%s\n",
+                       dev_name(&dev->nic_info.pdev->dev), dev->id,
+                       (ptype > 0 ? "operational" : "admin"),
+                       (dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_ENABLED) ?
+                       "enabled" : "disabled",
+                       (dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_SYNC) ?
+                       "" : ", not sync'ed");
+               goto out;
+       } else {
+               pr_info("%s ocrdma%d priority flow control is enabled and sync'ed\n",
+                       dev_name(&dev->nic_info.pdev->dev), dev->id);
+       }
+
+       ventry_cnt = (dcbxcfg->tcv_aev_opv_st >>
+                               OCRDMA_DCBX_APP_ENTRY_SHIFT)
+                               & OCRDMA_DCBX_STATE_MASK;
+
+       for (indx = 0; indx < ventry_cnt; indx++) {
+               app_param = &dcbxcfg->app_param[indx];
+               valid = (app_param->valid_proto_app >>
+                               OCRDMA_APP_PARAM_VALID_SHIFT)
+                               & OCRDMA_APP_PARAM_VALID_MASK;
+               proto_sel = (app_param->valid_proto_app
+                               >>  OCRDMA_APP_PARAM_PROTO_SEL_SHIFT)
+                               & OCRDMA_APP_PARAM_PROTO_SEL_MASK;
+               proto = app_param->valid_proto_app &
+                               OCRDMA_APP_PARAM_APP_PROTO_MASK;
+
+               if (
+                       valid && proto == OCRDMA_APP_PROTO_ROCE &&
+                       proto_sel == OCRDMA_PROTO_SELECT_L2) {
+                       for (slindx = 0; slindx <
+                               OCRDMA_MAX_SERVICE_LEVEL_INDEX; slindx++) {
+                               app_prio = ocrdma_get_app_prio(
+                                               (u8 *)app_param->app_prio,
+                                               slindx);
+                               pfc_prio = ocrdma_get_pfc_prio(
+                                               (u8 *)dcbxcfg->pfc_prio,
+                                               slindx);
+
+                               if (app_prio && pfc_prio) {
+                                       *srvc_lvl = slindx;
+                                       status = 0;
+                                       goto out;
+                               }
+                       }
+                       if (slindx == OCRDMA_MAX_SERVICE_LEVEL_INDEX) {
+                               pr_info("%s ocrdma%d application priority not set for 0x%x protocol\n",
+                                       dev_name(&dev->nic_info.pdev->dev),
+                                       dev->id, proto);
+                       }
+               }
+       }
+
+out:
+       return status;
+}
+
+void ocrdma_init_service_level(struct ocrdma_dev *dev)
+{
+       int status = 0, indx;
+       struct ocrdma_dcbx_cfg dcbxcfg;
+       u8 srvc_lvl = OCRDMA_DEFAULT_SERVICE_LEVEL;
+       int ptype = OCRDMA_PARAMETER_TYPE_OPER;
+
+       for (indx = 0; indx < 2; indx++) {
+               status = ocrdma_mbx_get_dcbx_config(dev, ptype, &dcbxcfg);
+               if (status) {
+                       pr_err("%s(): status=%d\n", __func__, status);
+                       ptype = OCRDMA_PARAMETER_TYPE_ADMIN;
+                       continue;
+               }
+
+               status = ocrdma_parse_dcbxcfg_rsp(dev, ptype,
+                                                 &dcbxcfg, &srvc_lvl);
+               if (status) {
+                       ptype = OCRDMA_PARAMETER_TYPE_ADMIN;
+                       continue;
+               }
+
+               break;
+       }
+
+       if (status)
+               pr_info("%s ocrdma%d service level default\n",
+                       dev_name(&dev->nic_info.pdev->dev), dev->id);
+       else
+               pr_info("%s ocrdma%d service level %d\n",
+                       dev_name(&dev->nic_info.pdev->dev), dev->id,
+                       srvc_lvl);
+
+       dev->pfc_state = ocrdma_is_enabled_and_synced(dcbxcfg.pfc_state);
+       dev->sl = srvc_lvl;
+}
+
 int ocrdma_alloc_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah)
 {
        int i;
@@ -2709,13 +2900,15 @@ int ocrdma_init_hw(struct ocrdma_dev *dev)
                goto conf_err;
        status = ocrdma_mbx_get_phy_info(dev);
        if (status)
-               goto conf_err;
+               goto info_attrb_err;
        status = ocrdma_mbx_get_ctrl_attribs(dev);
        if (status)
-               goto conf_err;
+               goto info_attrb_err;
 
        return 0;
 
+info_attrb_err:
+       ocrdma_mbx_delete_ah_tbl(dev);
 conf_err:
        ocrdma_destroy_mq(dev);
 mq_err:
index e513f7293142e036b7872424add7b961c5caaf77..6eed8f191322a134fc0dcd1438cf771525c06a18 100644 (file)
@@ -135,4 +135,6 @@ int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq);
 
 int ocrdma_mbx_rdma_stats(struct ocrdma_dev *, bool reset);
 char *port_speed_string(struct ocrdma_dev *dev);
+void ocrdma_init_service_level(struct ocrdma_dev *);
+
 #endif                         /* __OCRDMA_HW_H__ */
index 7c504e079744f44425a9bba90d347412ef192bf7..256a06bc0b68478fee187302a0467330fadf3cc9 100644 (file)
@@ -324,6 +324,11 @@ static int ocrdma_alloc_resources(struct ocrdma_dev *dev)
                if (!dev->qp_tbl)
                        goto alloc_err;
        }
+
+       dev->stag_arr = kzalloc(sizeof(u64) * OCRDMA_MAX_STAG, GFP_KERNEL);
+       if (dev->stag_arr == NULL)
+               goto alloc_err;
+
        spin_lock_init(&dev->av_tbl.lock);
        spin_lock_init(&dev->flush_q_lock);
        return 0;
@@ -334,6 +339,7 @@ alloc_err:
 
 static void ocrdma_free_resources(struct ocrdma_dev *dev)
 {
+       kfree(dev->stag_arr);
        kfree(dev->qp_tbl);
        kfree(dev->cq_tbl);
        kfree(dev->sgid_tbl);
@@ -353,15 +359,25 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
 {
        struct ocrdma_dev *dev = dev_get_drvdata(device);
 
-       return scnprintf(buf, PAGE_SIZE, "%s", &dev->attr.fw_ver[0]);
+       return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->attr.fw_ver[0]);
+}
+
+static ssize_t show_hca_type(struct device *device,
+                            struct device_attribute *attr, char *buf)
+{
+       struct ocrdma_dev *dev = dev_get_drvdata(device);
+
+       return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]);
 }
 
 static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
 static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL);
 
 static struct device_attribute *ocrdma_attributes[] = {
        &dev_attr_hw_rev,
-       &dev_attr_fw_ver
+       &dev_attr_fw_ver,
+       &dev_attr_hca_type
 };
 
 static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev)
@@ -372,6 +388,58 @@ static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev)
                device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]);
 }
 
+static void ocrdma_init_ipv4_gids(struct ocrdma_dev *dev,
+                                 struct net_device *net)
+{
+       struct in_device *in_dev;
+       union ib_gid gid;
+       in_dev = in_dev_get(net);
+       if (in_dev) {
+               for_ifa(in_dev) {
+                       ipv6_addr_set_v4mapped(ifa->ifa_address,
+                                              (struct in6_addr *)&gid);
+                       ocrdma_add_sgid(dev, &gid);
+               }
+               endfor_ifa(in_dev);
+               in_dev_put(in_dev);
+       }
+}
+
+static void ocrdma_init_ipv6_gids(struct ocrdma_dev *dev,
+                                 struct net_device *net)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+       struct inet6_dev *in6_dev;
+       union ib_gid  *pgid;
+       struct inet6_ifaddr *ifp;
+       in6_dev = in6_dev_get(net);
+       if (in6_dev) {
+               read_lock_bh(&in6_dev->lock);
+               list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+                       pgid = (union ib_gid *)&ifp->addr;
+                       ocrdma_add_sgid(dev, pgid);
+               }
+               read_unlock_bh(&in6_dev->lock);
+               in6_dev_put(in6_dev);
+       }
+#endif
+}
+
+static void ocrdma_init_gid_table(struct ocrdma_dev *dev)
+{
+       struct  net_device *net_dev;
+
+       for_each_netdev(&init_net, net_dev) {
+               struct net_device *real_dev = rdma_vlan_dev_real_dev(net_dev) ?
+                               rdma_vlan_dev_real_dev(net_dev) : net_dev;
+
+               if (real_dev == dev->nic_info.netdev) {
+                       ocrdma_init_ipv4_gids(dev, net_dev);
+                       ocrdma_init_ipv6_gids(dev, net_dev);
+               }
+       }
+}
+
 static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
 {
        int status = 0, i;
@@ -399,6 +467,8 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
        if (status)
                goto alloc_err;
 
+       ocrdma_init_service_level(dev);
+       ocrdma_init_gid_table(dev);
        status = ocrdma_register_device(dev);
        if (status)
                goto alloc_err;
@@ -508,6 +578,12 @@ static int ocrdma_close(struct ocrdma_dev *dev)
        return 0;
 }
 
+static void ocrdma_shutdown(struct ocrdma_dev *dev)
+{
+       ocrdma_close(dev);
+       ocrdma_remove(dev);
+}
+
 /* event handling via NIC driver ensures that all the NIC specific
  * initialization done before RoCE driver notifies
  * event to stack.
@@ -521,6 +597,9 @@ static void ocrdma_event_handler(struct ocrdma_dev *dev, u32 event)
        case BE_DEV_DOWN:
                ocrdma_close(dev);
                break;
+       case BE_DEV_SHUTDOWN:
+               ocrdma_shutdown(dev);
+               break;
        }
 }
 
index 96c9ee602ba49bead98ba33ef2fbd37478abe830..904989ec5eaa67b796abcf221292c5f70246a642 100644 (file)
@@ -44,35 +44,39 @@ enum {
 #define OCRDMA_SUBSYS_ROCE 10
 enum {
        OCRDMA_CMD_QUERY_CONFIG = 1,
-       OCRDMA_CMD_ALLOC_PD,
-       OCRDMA_CMD_DEALLOC_PD,
-
-       OCRDMA_CMD_CREATE_AH_TBL,
-       OCRDMA_CMD_DELETE_AH_TBL,
-
-       OCRDMA_CMD_CREATE_QP,
-       OCRDMA_CMD_QUERY_QP,
-       OCRDMA_CMD_MODIFY_QP,
-       OCRDMA_CMD_DELETE_QP,
-
-       OCRDMA_CMD_RSVD1,
-       OCRDMA_CMD_ALLOC_LKEY,
-       OCRDMA_CMD_DEALLOC_LKEY,
-       OCRDMA_CMD_REGISTER_NSMR,
-       OCRDMA_CMD_REREGISTER_NSMR,
-       OCRDMA_CMD_REGISTER_NSMR_CONT,
-       OCRDMA_CMD_QUERY_NSMR,
-       OCRDMA_CMD_ALLOC_MW,
-       OCRDMA_CMD_QUERY_MW,
-
-       OCRDMA_CMD_CREATE_SRQ,
-       OCRDMA_CMD_QUERY_SRQ,
-       OCRDMA_CMD_MODIFY_SRQ,
-       OCRDMA_CMD_DELETE_SRQ,
-
-       OCRDMA_CMD_ATTACH_MCAST,
-       OCRDMA_CMD_DETACH_MCAST,
-       OCRDMA_CMD_GET_RDMA_STATS,
+       OCRDMA_CMD_ALLOC_PD = 2,
+       OCRDMA_CMD_DEALLOC_PD = 3,
+
+       OCRDMA_CMD_CREATE_AH_TBL = 4,
+       OCRDMA_CMD_DELETE_AH_TBL = 5,
+
+       OCRDMA_CMD_CREATE_QP = 6,
+       OCRDMA_CMD_QUERY_QP = 7,
+       OCRDMA_CMD_MODIFY_QP = 8 ,
+       OCRDMA_CMD_DELETE_QP = 9,
+
+       OCRDMA_CMD_RSVD1 = 10,
+       OCRDMA_CMD_ALLOC_LKEY = 11,
+       OCRDMA_CMD_DEALLOC_LKEY = 12,
+       OCRDMA_CMD_REGISTER_NSMR = 13,
+       OCRDMA_CMD_REREGISTER_NSMR = 14,
+       OCRDMA_CMD_REGISTER_NSMR_CONT = 15,
+       OCRDMA_CMD_QUERY_NSMR = 16,
+       OCRDMA_CMD_ALLOC_MW = 17,
+       OCRDMA_CMD_QUERY_MW = 18,
+
+       OCRDMA_CMD_CREATE_SRQ = 19,
+       OCRDMA_CMD_QUERY_SRQ = 20,
+       OCRDMA_CMD_MODIFY_SRQ = 21,
+       OCRDMA_CMD_DELETE_SRQ = 22,
+
+       OCRDMA_CMD_ATTACH_MCAST = 23,
+       OCRDMA_CMD_DETACH_MCAST = 24,
+
+       OCRDMA_CMD_CREATE_RBQ = 25,
+       OCRDMA_CMD_DESTROY_RBQ = 26,
+
+       OCRDMA_CMD_GET_RDMA_STATS = 27,
 
        OCRDMA_CMD_MAX
 };
@@ -103,7 +107,7 @@ enum {
 
 #define OCRDMA_MAX_QP    2048
 #define OCRDMA_MAX_CQ    2048
-#define OCRDMA_MAX_STAG  8192
+#define OCRDMA_MAX_STAG 16384
 
 enum {
        OCRDMA_DB_RQ_OFFSET             = 0xE0,
@@ -422,7 +426,12 @@ struct ocrdma_ae_qp_mcqe {
 
 #define OCRDMA_ASYNC_RDMA_EVE_CODE 0x14
 #define OCRDMA_ASYNC_GRP5_EVE_CODE 0x5
-#define OCRDMA_ASYNC_EVENT_PVID_STATE 0x3
+
+enum ocrdma_async_grp5_events {
+       OCRDMA_ASYNC_EVENT_QOS_VALUE    = 0x01,
+       OCRDMA_ASYNC_EVENT_COS_VALUE    = 0x02,
+       OCRDMA_ASYNC_EVENT_PVID_STATE   = 0x03
+};
 
 enum OCRDMA_ASYNC_EVENT_TYPE {
        OCRDMA_CQ_ERROR                 = 0x00,
@@ -525,8 +534,8 @@ struct ocrdma_mbx_query_config {
        u32 max_ird_ord_per_qp;
        u32 max_shared_ird_ord;
        u32 max_mr;
-       u32 max_mr_size_lo;
        u32 max_mr_size_hi;
+       u32 max_mr_size_lo;
        u32 max_num_mr_pbl;
        u32 max_mw;
        u32 max_fmr;
@@ -580,17 +589,26 @@ enum {
        OCRDMA_FN_MODE_RDMA     = 0x4
 };
 
+enum {
+       OCRDMA_IF_TYPE_MASK             = 0xFFFF0000,
+       OCRDMA_IF_TYPE_SHIFT            = 0x10,
+       OCRDMA_PHY_TYPE_MASK            = 0x0000FFFF,
+       OCRDMA_FUTURE_DETAILS_MASK      = 0xFFFF0000,
+       OCRDMA_FUTURE_DETAILS_SHIFT     = 0x10,
+       OCRDMA_EX_PHY_DETAILS_MASK      = 0x0000FFFF,
+       OCRDMA_FSPEED_SUPP_MASK         = 0xFFFF0000,
+       OCRDMA_FSPEED_SUPP_SHIFT        = 0x10,
+       OCRDMA_ASPEED_SUPP_MASK         = 0x0000FFFF
+};
+
 struct ocrdma_get_phy_info_rsp {
        struct ocrdma_mqe_hdr hdr;
        struct ocrdma_mbx_rsp rsp;
 
-       u16 phy_type;
-       u16 interface_type;
+       u32 ityp_ptyp;
        u32 misc_params;
-       u16 ext_phy_details;
-       u16 rsvd;
-       u16 auto_speeds_supported;
-       u16 fixed_speeds_supported;
+       u32 ftrdtl_exphydtl;
+       u32 fspeed_aspeed;
        u32 future_use[2];
 };
 
@@ -603,19 +621,34 @@ enum {
        OCRDMA_PHY_SPEED_40GBPS = 0x20
 };
 
+enum {
+       OCRDMA_PORT_NUM_MASK    = 0x3F,
+       OCRDMA_PT_MASK          = 0xC0,
+       OCRDMA_PT_SHIFT         = 0x6,
+       OCRDMA_LINK_DUP_MASK    = 0x0000FF00,
+       OCRDMA_LINK_DUP_SHIFT   = 0x8,
+       OCRDMA_PHY_PS_MASK      = 0x00FF0000,
+       OCRDMA_PHY_PS_SHIFT     = 0x10,
+       OCRDMA_PHY_PFLT_MASK    = 0xFF000000,
+       OCRDMA_PHY_PFLT_SHIFT   = 0x18,
+       OCRDMA_QOS_LNKSP_MASK   = 0xFFFF0000,
+       OCRDMA_QOS_LNKSP_SHIFT  = 0x10,
+       OCRDMA_LLST_MASK        = 0xFF,
+       OCRDMA_PLFC_MASK        = 0x00000400,
+       OCRDMA_PLFC_SHIFT       = 0x8,
+       OCRDMA_PLRFC_MASK       = 0x00000200,
+       OCRDMA_PLRFC_SHIFT      = 0x8,
+       OCRDMA_PLTFC_MASK       = 0x00000100,
+       OCRDMA_PLTFC_SHIFT      = 0x8
+};
 
 struct ocrdma_get_link_speed_rsp {
        struct ocrdma_mqe_hdr hdr;
        struct ocrdma_mbx_rsp rsp;
 
-       u8 pt_port_num;
-       u8 link_duplex;
-       u8 phys_port_speed;
-       u8 phys_port_fault;
-       u16 rsvd1;
-       u16 qos_lnk_speed;
-       u8 logical_lnk_status;
-       u8 rsvd2[3];
+       u32 pflt_pps_ld_pnum;
+       u32 qos_lsp;
+       u32 res_lls;
 };
 
 enum {
@@ -666,8 +699,7 @@ struct ocrdma_create_cq_cmd {
        u32 pgsz_pgcnt;
        u32 ev_cnt_flags;
        u32 eqn;
-       u16 cqe_count;
-       u16 pd_id;
+       u32 pdid_cqecnt;
        u32 rsvd6;
        struct ocrdma_pa pa[OCRDMA_CREATE_CQ_MAX_PAGES];
 };
@@ -677,6 +709,10 @@ struct ocrdma_create_cq {
        struct ocrdma_create_cq_cmd cmd;
 };
 
+enum {
+       OCRDMA_CREATE_CQ_CMD_PDID_SHIFT = 0x10
+};
+
 enum {
        OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK = 0xFFFF
 };
@@ -1231,7 +1267,6 @@ struct ocrdma_destroy_srq {
 
 enum {
        OCRDMA_ALLOC_PD_ENABLE_DPP      = BIT(16),
-       OCRDMA_PD_MAX_DPP_ENABLED_QP    = 8,
        OCRDMA_DPP_PAGE_SIZE            = 4096
 };
 
@@ -1896,12 +1931,62 @@ struct ocrdma_rdma_stats_resp {
        struct ocrdma_rx_dbg_stats      rx_dbg_stats;
 } __packed;
 
+enum {
+       OCRDMA_HBA_ATTRB_EPROM_VER_LO_MASK      = 0xFF,
+       OCRDMA_HBA_ATTRB_EPROM_VER_HI_MASK      = 0xFF00,
+       OCRDMA_HBA_ATTRB_EPROM_VER_HI_SHIFT     = 0x08,
+       OCRDMA_HBA_ATTRB_CDBLEN_MASK            = 0xFFFF,
+       OCRDMA_HBA_ATTRB_ASIC_REV_MASK          = 0xFF0000,
+       OCRDMA_HBA_ATTRB_ASIC_REV_SHIFT         = 0x10,
+       OCRDMA_HBA_ATTRB_GUID0_MASK             = 0xFF000000,
+       OCRDMA_HBA_ATTRB_GUID0_SHIFT            = 0x18,
+       OCRDMA_HBA_ATTRB_GUID13_MASK            = 0xFF,
+       OCRDMA_HBA_ATTRB_GUID14_MASK            = 0xFF00,
+       OCRDMA_HBA_ATTRB_GUID14_SHIFT           = 0x08,
+       OCRDMA_HBA_ATTRB_GUID15_MASK            = 0xFF0000,
+       OCRDMA_HBA_ATTRB_GUID15_SHIFT           = 0x10,
+       OCRDMA_HBA_ATTRB_PCNT_MASK              = 0xFF000000,
+       OCRDMA_HBA_ATTRB_PCNT_SHIFT             = 0x18,
+       OCRDMA_HBA_ATTRB_LDTOUT_MASK            = 0xFFFF,
+       OCRDMA_HBA_ATTRB_ISCSI_VER_MASK         = 0xFF0000,
+       OCRDMA_HBA_ATTRB_ISCSI_VER_SHIFT        = 0x10,
+       OCRDMA_HBA_ATTRB_MFUNC_DEV_MASK         = 0xFF000000,
+       OCRDMA_HBA_ATTRB_MFUNC_DEV_SHIFT        = 0x18,
+       OCRDMA_HBA_ATTRB_CV_MASK                = 0xFF,
+       OCRDMA_HBA_ATTRB_HBA_ST_MASK            = 0xFF00,
+       OCRDMA_HBA_ATTRB_HBA_ST_SHIFT           = 0x08,
+       OCRDMA_HBA_ATTRB_MAX_DOMS_MASK          = 0xFF0000,
+       OCRDMA_HBA_ATTRB_MAX_DOMS_SHIFT         = 0x10,
+       OCRDMA_HBA_ATTRB_PTNUM_MASK             = 0x3F000000,
+       OCRDMA_HBA_ATTRB_PTNUM_SHIFT            = 0x18,
+       OCRDMA_HBA_ATTRB_PT_MASK                = 0xC0000000,
+       OCRDMA_HBA_ATTRB_PT_SHIFT               = 0x1E,
+       OCRDMA_HBA_ATTRB_ISCSI_FET_MASK         = 0xFF,
+       OCRDMA_HBA_ATTRB_ASIC_GEN_MASK          = 0xFF00,
+       OCRDMA_HBA_ATTRB_ASIC_GEN_SHIFT         = 0x08,
+       OCRDMA_HBA_ATTRB_PCI_VID_MASK           = 0xFFFF,
+       OCRDMA_HBA_ATTRB_PCI_DID_MASK           = 0xFFFF0000,
+       OCRDMA_HBA_ATTRB_PCI_DID_SHIFT          = 0x10,
+       OCRDMA_HBA_ATTRB_PCI_SVID_MASK          = 0xFFFF,
+       OCRDMA_HBA_ATTRB_PCI_SSID_MASK          = 0xFFFF0000,
+       OCRDMA_HBA_ATTRB_PCI_SSID_SHIFT         = 0x10,
+       OCRDMA_HBA_ATTRB_PCI_BUSNUM_MASK        = 0xFF,
+       OCRDMA_HBA_ATTRB_PCI_DEVNUM_MASK        = 0xFF00,
+       OCRDMA_HBA_ATTRB_PCI_DEVNUM_SHIFT       = 0x08,
+       OCRDMA_HBA_ATTRB_PCI_FUNCNUM_MASK       = 0xFF0000,
+       OCRDMA_HBA_ATTRB_PCI_FUNCNUM_SHIFT      = 0x10,
+       OCRDMA_HBA_ATTRB_IF_TYPE_MASK           = 0xFF000000,
+       OCRDMA_HBA_ATTRB_IF_TYPE_SHIFT          = 0x18,
+       OCRDMA_HBA_ATTRB_NETFIL_MASK            =0xFF
+};
 
 struct mgmt_hba_attribs {
        u8 flashrom_version_string[32];
        u8 manufacturer_name[32];
        u32 supported_modes;
-       u32 rsvd0[3];
+       u32 rsvd_eprom_verhi_verlo;
+       u32 mbx_ds_ver;
+       u32 epfw_ds_ver;
        u8 ncsi_ver_string[12];
        u32 default_extended_timeout;
        u8 controller_model_number[32];
@@ -1914,34 +1999,26 @@ struct mgmt_hba_attribs {
        u8 driver_version_string[32];
        u8 fw_on_flash_version_string[32];
        u32 functionalities_supported;
-       u16 max_cdblength;
-       u8 asic_revision;
-       u8 generational_guid[16];
-       u8 hba_port_count;
-       u16 default_link_down_timeout;
-       u8 iscsi_ver_min_max;
-       u8 multifunction_device;
-       u8 cache_valid;
-       u8 hba_status;
-       u8 max_domains_supported;
-       u8 phy_port;
+       u32 guid0_asicrev_cdblen;
+       u8 generational_guid[12];
+       u32 portcnt_guid15;
+       u32 mfuncdev_iscsi_ldtout;
+       u32 ptpnum_maxdoms_hbast_cv;
        u32 firmware_post_status;
        u32 hba_mtu[8];
-       u32 rsvd1[4];
+       u32 res_asicgen_iscsi_feaures;
+       u32 rsvd1[3];
 };
 
 struct mgmt_controller_attrib {
        struct mgmt_hba_attribs hba_attribs;
-       u16 pci_vendor_id;
-       u16 pci_device_id;
-       u16 pci_sub_vendor_id;
-       u16 pci_sub_system_id;
-       u8 pci_bus_number;
-       u8 pci_device_number;
-       u8 pci_function_number;
-       u8 interface_type;
-       u64 unique_identifier;
-       u32 rsvd0[5];
+       u32 pci_did_vid;
+       u32 pci_ssid_svid;
+       u32 ityp_fnum_devnum_bnum;
+       u32 uid_hi;
+       u32 uid_lo;
+       u32 res_nnetfil;
+       u32 rsvd0[4];
 };
 
 struct ocrdma_get_ctrl_attribs_rsp {
@@ -1949,5 +2026,79 @@ struct ocrdma_get_ctrl_attribs_rsp {
        struct mgmt_controller_attrib ctrl_attribs;
 };
 
+#define OCRDMA_SUBSYS_DCBX 0x10
+
+enum OCRDMA_DCBX_OPCODE {
+       OCRDMA_CMD_GET_DCBX_CONFIG = 0x01
+};
+
+enum OCRDMA_DCBX_PARAM_TYPE {
+       OCRDMA_PARAMETER_TYPE_ADMIN     = 0x00,
+       OCRDMA_PARAMETER_TYPE_OPER      = 0x01,
+       OCRDMA_PARAMETER_TYPE_PEER      = 0x02
+};
+
+enum OCRDMA_DCBX_APP_PROTO {
+       OCRDMA_APP_PROTO_ROCE   = 0x8915
+};
+
+enum OCRDMA_DCBX_PROTO {
+       OCRDMA_PROTO_SELECT_L2  = 0x00,
+       OCRDMA_PROTO_SELECT_L4  = 0x01
+};
+
+enum OCRDMA_DCBX_APP_PARAM {
+       OCRDMA_APP_PARAM_APP_PROTO_MASK = 0xFFFF,
+       OCRDMA_APP_PARAM_PROTO_SEL_MASK = 0xFF,
+       OCRDMA_APP_PARAM_PROTO_SEL_SHIFT = 0x10,
+       OCRDMA_APP_PARAM_VALID_MASK     = 0xFF,
+       OCRDMA_APP_PARAM_VALID_SHIFT    = 0x18
+};
+
+enum OCRDMA_DCBX_STATE_FLAGS {
+       OCRDMA_STATE_FLAG_ENABLED       = 0x01,
+       OCRDMA_STATE_FLAG_ADDVERTISED   = 0x02,
+       OCRDMA_STATE_FLAG_WILLING       = 0x04,
+       OCRDMA_STATE_FLAG_SYNC          = 0x08,
+       OCRDMA_STATE_FLAG_UNSUPPORTED   = 0x40000000,
+       OCRDMA_STATE_FLAG_NEG_FAILD     = 0x80000000
+};
+
+enum OCRDMA_TCV_AEV_OPV_ST {
+       OCRDMA_DCBX_TC_SUPPORT_MASK     = 0xFF,
+       OCRDMA_DCBX_TC_SUPPORT_SHIFT    = 0x18,
+       OCRDMA_DCBX_APP_ENTRY_SHIFT     = 0x10,
+       OCRDMA_DCBX_OP_PARAM_SHIFT      = 0x08,
+       OCRDMA_DCBX_STATE_MASK          = 0xFF
+};
+
+struct ocrdma_app_parameter {
+       u32 valid_proto_app;
+       u32 oui;
+       u32 app_prio[2];
+};
+
+struct ocrdma_dcbx_cfg {
+       u32 tcv_aev_opv_st;
+       u32 tc_state;
+       u32 pfc_state;
+       u32 qcn_state;
+       u32 appl_state;
+       u32 ll_state;
+       u32 tc_bw[2];
+       u32 tc_prio[8];
+       u32 pfc_prio[2];
+       struct ocrdma_app_parameter app_param[15];
+};
+
+struct ocrdma_get_dcbx_cfg_req {
+       struct ocrdma_mbx_hdr hdr;
+       u32 param_type;
+} __packed;
+
+struct ocrdma_get_dcbx_cfg_rsp {
+       struct ocrdma_mbx_rsp hdr;
+       struct ocrdma_dcbx_cfg cfg;
+} __packed;
 
 #endif                         /* __OCRDMA_SLI_H__ */
index edf6211d84b8ec5f7b9d22da5f7f6170f0c810ef..acb434d169036e0781eef89a10d6c72c32936554 100644 (file)
@@ -69,11 +69,11 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr)
        memcpy(&attr->fw_ver, &dev->attr.fw_ver[0],
               min(sizeof(dev->attr.fw_ver), sizeof(attr->fw_ver)));
        ocrdma_get_guid(dev, (u8 *)&attr->sys_image_guid);
-       attr->max_mr_size = ~0ull;
+       attr->max_mr_size = dev->attr.max_mr_size;
        attr->page_size_cap = 0xffff000;
        attr->vendor_id = dev->nic_info.pdev->vendor;
        attr->vendor_part_id = dev->nic_info.pdev->device;
-       attr->hw_ver = 0;
+       attr->hw_ver = dev->asic_id;
        attr->max_qp = dev->attr.max_qp;
        attr->max_ah = OCRDMA_MAX_AH;
        attr->max_qp_wr = dev->attr.max_wqe;
@@ -268,7 +268,8 @@ static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev,
                pd->dpp_enabled =
                        ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R;
                pd->num_dpp_qp =
-                       pd->dpp_enabled ? OCRDMA_PD_MAX_DPP_ENABLED_QP : 0;
+                       pd->dpp_enabled ? (dev->nic_info.db_page_size /
+                                          dev->attr.wqe_size) : 0;
        }
 
 retry:
@@ -328,7 +329,10 @@ static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx)
        struct ocrdma_pd *pd = uctx->cntxt_pd;
        struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device);
 
-       BUG_ON(uctx->pd_in_use);
+       if (uctx->pd_in_use) {
+               pr_err("%s(%d) Freeing in use pdid=0x%x.\n",
+                      __func__, dev->id, pd->id);
+       }
        uctx->cntxt_pd = NULL;
        status = _ocrdma_dealloc_pd(dev, pd);
        return status;
@@ -843,6 +847,13 @@ int ocrdma_dereg_mr(struct ib_mr *ib_mr)
        if (mr->umem)
                ib_umem_release(mr->umem);
        kfree(mr);
+
+       /* Don't stop cleanup, in case FW is unresponsive */
+       if (dev->mqe_ctx.fw_error_state) {
+               status = 0;
+               pr_err("%s(%d) fw not responding.\n",
+                      __func__, dev->id);
+       }
        return status;
 }
 
@@ -2054,6 +2065,13 @@ int ocrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
        }
 
        while (wr) {
+               if (qp->qp_type == IB_QPT_UD &&
+                   (wr->opcode != IB_WR_SEND &&
+                    wr->opcode != IB_WR_SEND_WITH_IMM)) {
+                       *bad_wr = wr;
+                       status = -EINVAL;
+                       break;
+               }
                if (ocrdma_hwq_free_cnt(&qp->sq) == 0 ||
                    wr->num_sge > qp->sq.max_sges) {
                        *bad_wr = wr;
@@ -2488,6 +2506,11 @@ static bool ocrdma_poll_err_scqe(struct ocrdma_qp *qp,
                        *stop = true;
                        expand = false;
                }
+       } else if (is_hw_sq_empty(qp)) {
+               /* Do nothing */
+               expand = false;
+               *polled = false;
+               *stop = false;
        } else {
                *polled = true;
                expand = ocrdma_update_err_scqe(ibwc, cqe, qp, status);
@@ -2593,6 +2616,11 @@ static bool ocrdma_poll_err_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe,
                        *stop = true;
                        expand = false;
                }
+       } else if (is_hw_rq_empty(qp)) {
+               /* Do nothing */
+               expand = false;
+               *polled = false;
+               *stop = false;
        } else {
                *polled = true;
                expand = ocrdma_update_err_rcqe(ibwc, cqe, qp, status);
index 22c720e5740d900d3d4f1a6de4913d95708a8f54..636be117b57859e690fdb0704be6fe10b429511f 100644 (file)
@@ -2476,7 +2476,7 @@ int qib_create_agents(struct qib_ibdev *dev)
                ibp = &dd->pport[p].ibport_data;
                agent = ib_register_mad_agent(&dev->ibdev, p + 1, IB_QPT_SMI,
                                              NULL, 0, send_handler,
-                                             NULL, NULL);
+                                             NULL, NULL, 0);
                if (IS_ERR(agent)) {
                        ret = PTR_ERR(agent);
                        goto err;
index c639f90cfda41709ce77c4051b8ed8086f4dd700..3edce617c31b22de7e4e7f5d2478c5bd71ba3f46 100644 (file)
@@ -86,7 +86,6 @@ enum {
        IPOIB_FLAG_INITIALIZED    = 1,
        IPOIB_FLAG_ADMIN_UP       = 2,
        IPOIB_PKEY_ASSIGNED       = 3,
-       IPOIB_PKEY_STOP           = 4,
        IPOIB_FLAG_SUBINTERFACE   = 5,
        IPOIB_MCAST_RUN           = 6,
        IPOIB_STOP_REAPER         = 7,
@@ -312,7 +311,6 @@ struct ipoib_dev_priv {
        struct list_head multicast_list;
        struct rb_root multicast_tree;
 
-       struct delayed_work pkey_poll_task;
        struct delayed_work mcast_task;
        struct work_struct carrier_on_task;
        struct work_struct flush_light;
@@ -473,10 +471,11 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work);
 void ipoib_pkey_event(struct work_struct *work);
 void ipoib_ib_dev_cleanup(struct net_device *dev);
 
-int ipoib_ib_dev_open(struct net_device *dev);
+int ipoib_ib_dev_open(struct net_device *dev, int flush);
 int ipoib_ib_dev_up(struct net_device *dev);
 int ipoib_ib_dev_down(struct net_device *dev, int flush);
 int ipoib_ib_dev_stop(struct net_device *dev, int flush);
+void ipoib_pkey_dev_check_presence(struct net_device *dev);
 
 int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
 void ipoib_dev_cleanup(struct net_device *dev);
@@ -532,8 +531,7 @@ int  ipoib_set_mode(struct net_device *dev, const char *buf);
 
 void ipoib_setup(struct net_device *dev);
 
-void ipoib_pkey_poll(struct work_struct *work);
-int ipoib_pkey_dev_delay_open(struct net_device *dev);
+void ipoib_pkey_open(struct ipoib_dev_priv *priv);
 void ipoib_drain_cq(struct net_device *dev);
 
 void ipoib_set_ethtool_ops(struct net_device *dev);
index 50061854616ecb9c5ad409e73bf20889a2eab680..6bd5740e26913df2662bad05a754ad6e88b3bb50 100644 (file)
@@ -281,10 +281,8 @@ void ipoib_delete_debug_files(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
 
-       if (priv->mcg_dentry)
-               debugfs_remove(priv->mcg_dentry);
-       if (priv->path_dentry)
-               debugfs_remove(priv->path_dentry);
+       debugfs_remove(priv->mcg_dentry);
+       debugfs_remove(priv->path_dentry);
 }
 
 int ipoib_register_debugfs(void)
index 6a7003ddb0be19e665220e843391bd148eb8ded4..72626c3481749b962fe96b79722d7c8e9c99c585 100644 (file)
@@ -664,17 +664,18 @@ static void ipoib_ib_tx_timer_func(unsigned long ctx)
        drain_tx_cq((struct net_device *)ctx);
 }
 
-int ipoib_ib_dev_open(struct net_device *dev)
+int ipoib_ib_dev_open(struct net_device *dev, int flush)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        int ret;
 
-       if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) {
-               ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey);
-               clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+       ipoib_pkey_dev_check_presence(dev);
+
+       if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+               ipoib_warn(priv, "P_Key 0x%04x is %s\n", priv->pkey,
+                          (!(priv->pkey & 0x7fff) ? "Invalid" : "not found"));
                return -1;
        }
-       set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 
        ret = ipoib_init_qp(dev);
        if (ret) {
@@ -705,16 +706,17 @@ int ipoib_ib_dev_open(struct net_device *dev)
 dev_stop:
        if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
                napi_enable(&priv->napi);
-       ipoib_ib_dev_stop(dev, 1);
+       ipoib_ib_dev_stop(dev, flush);
        return -1;
 }
 
-static void ipoib_pkey_dev_check_presence(struct net_device *dev)
+void ipoib_pkey_dev_check_presence(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
-       u16 pkey_index = 0;
 
-       if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index))
+       if (!(priv->pkey & 0x7fff) ||
+           ib_find_pkey(priv->ca, priv->port, priv->pkey,
+                        &priv->pkey_index))
                clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
        else
                set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
@@ -745,14 +747,6 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush)
        clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
        netif_carrier_off(dev);
 
-       /* Shutdown the P_Key thread if still active */
-       if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
-               mutex_lock(&pkey_mutex);
-               set_bit(IPOIB_PKEY_STOP, &priv->flags);
-               cancel_delayed_work_sync(&priv->pkey_poll_task);
-               mutex_unlock(&pkey_mutex);
-       }
-
        ipoib_mcast_stop_thread(dev, flush);
        ipoib_mcast_dev_flush(dev);
 
@@ -924,7 +918,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
                    (unsigned long) dev);
 
        if (dev->flags & IFF_UP) {
-               if (ipoib_ib_dev_open(dev)) {
+               if (ipoib_ib_dev_open(dev, 1)) {
                        ipoib_transport_dev_cleanup(dev);
                        return -ENODEV;
                }
@@ -966,13 +960,27 @@ static inline int update_parent_pkey(struct ipoib_dev_priv *priv)
 
        return 1;
 }
+/*
+ * Returns 1 if the P_Key is assigned at its old index; 0 if moved or unassigned.
+ */
+static inline int update_child_pkey(struct ipoib_dev_priv *priv)
+{
+       u16 old_index = priv->pkey_index;
+
+       priv->pkey_index = 0;
+       ipoib_pkey_dev_check_presence(priv->dev);
+
+       if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
+           (old_index == priv->pkey_index))
+               return 1;
+       return 0;
+}
 
 static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
                                enum ipoib_flush_level level)
 {
        struct ipoib_dev_priv *cpriv;
        struct net_device *dev = priv->dev;
-       u16 new_index;
        int result;
 
        down_read(&priv->vlan_rwsem);
@@ -986,16 +994,20 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
 
        up_read(&priv->vlan_rwsem);
 
-       if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
-               /* for non-child devices must check/update the pkey value here */
-               if (level == IPOIB_FLUSH_HEAVY &&
-                   !test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
-                       update_parent_pkey(priv);
+       if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) &&
+           level != IPOIB_FLUSH_HEAVY) {
                ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
                return;
        }
 
        if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
+               /* interface is down. update pkey and leave. */
+               if (level == IPOIB_FLUSH_HEAVY) {
+                       if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
+                               update_parent_pkey(priv);
+                       else
+                               update_child_pkey(priv);
+               }
                ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
                return;
        }
@@ -1005,20 +1017,13 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
                 * (parent) devices should always takes what present in pkey index 0
                 */
                if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
-                       if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
-                               clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
-                               ipoib_ib_dev_down(dev, 0);
-                               ipoib_ib_dev_stop(dev, 0);
-                               if (ipoib_pkey_dev_delay_open(dev))
-                                       return;
-                       }
-                       /* restart QP only if P_Key index is changed */
-                       if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
-                           new_index == priv->pkey_index) {
+                       result = update_child_pkey(priv);
+                       if (result) {
+                               /* restart QP only if P_Key index is changed */
                                ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
                                return;
                        }
-                       priv->pkey_index = new_index;
+
                } else {
                        result = update_parent_pkey(priv);
                        /* restart QP only if P_Key value changed */
@@ -1038,8 +1043,12 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
                ipoib_ib_dev_down(dev, 0);
 
        if (level == IPOIB_FLUSH_HEAVY) {
-               ipoib_ib_dev_stop(dev, 0);
-               ipoib_ib_dev_open(dev);
+               if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
+                       ipoib_ib_dev_stop(dev, 0);
+               if (ipoib_ib_dev_open(dev, 0) != 0)
+                       return;
+               if (netif_queue_stopped(dev))
+                       netif_start_queue(dev);
        }
 
        /*
@@ -1094,54 +1103,4 @@ void ipoib_ib_dev_cleanup(struct net_device *dev)
        ipoib_transport_dev_cleanup(dev);
 }
 
-/*
- * Delayed P_Key Assigment Interim Support
- *
- * The following is initial implementation of delayed P_Key assigment
- * mechanism. It is using the same approach implemented for the multicast
- * group join. The single goal of this implementation is to quickly address
- * Bug #2507. This implementation will probably be removed when the P_Key
- * change async notification is available.
- */
-
-void ipoib_pkey_poll(struct work_struct *work)
-{
-       struct ipoib_dev_priv *priv =
-               container_of(work, struct ipoib_dev_priv, pkey_poll_task.work);
-       struct net_device *dev = priv->dev;
-
-       ipoib_pkey_dev_check_presence(dev);
-
-       if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
-               ipoib_open(dev);
-       else {
-               mutex_lock(&pkey_mutex);
-               if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
-                       queue_delayed_work(ipoib_workqueue,
-                                          &priv->pkey_poll_task,
-                                          HZ);
-               mutex_unlock(&pkey_mutex);
-       }
-}
-
-int ipoib_pkey_dev_delay_open(struct net_device *dev)
-{
-       struct ipoib_dev_priv *priv = netdev_priv(dev);
-
-       /* Look for the interface pkey value in the IB Port P_Key table and */
-       /* set the interface pkey assigment flag                            */
-       ipoib_pkey_dev_check_presence(dev);
 
-       /* P_Key value not assigned yet - start polling */
-       if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
-               mutex_lock(&pkey_mutex);
-               clear_bit(IPOIB_PKEY_STOP, &priv->flags);
-               queue_delayed_work(ipoib_workqueue,
-                                  &priv->pkey_poll_task,
-                                  HZ);
-               mutex_unlock(&pkey_mutex);
-               return 1;
-       }
-
-       return 0;
-}
index 4e675f4fecc974447fde0171e9dfa94403d0f176..1310acf6bf923786a09692bed47b5fcb49ca7e45 100644 (file)
@@ -108,11 +108,11 @@ int ipoib_open(struct net_device *dev)
 
        set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 
-       if (ipoib_pkey_dev_delay_open(dev))
-               return 0;
-
-       if (ipoib_ib_dev_open(dev))
+       if (ipoib_ib_dev_open(dev, 1)) {
+               if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+                       return 0;
                goto err_disable;
+       }
 
        if (ipoib_ib_dev_up(dev))
                goto err_stop;
@@ -1379,7 +1379,6 @@ void ipoib_setup(struct net_device *dev)
        INIT_LIST_HEAD(&priv->dead_ahs);
        INIT_LIST_HEAD(&priv->multicast_list);
 
-       INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
        INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
        INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
        INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
index eb7973957a6ea35585a16086adc40e29df151cb9..61ee91d883806322f60c79eaae6ec1dde865ad54 100644 (file)
@@ -596,20 +596,28 @@ iscsi_iser_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr,
        struct iser_conn *ib_conn;
        struct iscsi_endpoint *ep;
 
-       ep = iscsi_create_endpoint(sizeof(*ib_conn));
+       ep = iscsi_create_endpoint(0);
        if (!ep)
                return ERR_PTR(-ENOMEM);
 
-       ib_conn = ep->dd_data;
+       ib_conn = kzalloc(sizeof(*ib_conn), GFP_KERNEL);
+       if (!ib_conn) {
+               err = -ENOMEM;
+               goto failure;
+       }
+
+       ep->dd_data = ib_conn;
        ib_conn->ep = ep;
        iser_conn_init(ib_conn);
 
-       err = iser_connect(ib_conn, NULL, (struct sockaddr_in *)dst_addr,
-                          non_blocking);
+       err = iser_connect(ib_conn, NULL, dst_addr, non_blocking);
        if (err)
-               return ERR_PTR(err);
+               goto failure;
 
        return ep;
+failure:
+       iscsi_destroy_endpoint(ep);
+       return ERR_PTR(err);
 }
 
 static int
@@ -619,15 +627,16 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
        int rc;
 
        ib_conn = ep->dd_data;
-       rc = wait_event_interruptible_timeout(ib_conn->wait,
-                            ib_conn->state == ISER_CONN_UP,
-                            msecs_to_jiffies(timeout_ms));
-
+       rc = wait_for_completion_interruptible_timeout(&ib_conn->up_completion,
+                                                      msecs_to_jiffies(timeout_ms));
        /* if conn establishment failed, return error code to iscsi */
-       if (!rc &&
-           (ib_conn->state == ISER_CONN_TERMINATING ||
-            ib_conn->state == ISER_CONN_DOWN))
-               rc = -1;
+       if (rc == 0) {
+               mutex_lock(&ib_conn->state_mutex);
+               if (ib_conn->state == ISER_CONN_TERMINATING ||
+                   ib_conn->state == ISER_CONN_DOWN)
+                       rc = -1;
+               mutex_unlock(&ib_conn->state_mutex);
+       }
 
        iser_info("ib conn %p rc = %d\n", ib_conn, rc);
 
@@ -646,19 +655,25 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep)
 
        ib_conn = ep->dd_data;
        iser_info("ep %p ib conn %p state %d\n", ep, ib_conn, ib_conn->state);
+       mutex_lock(&ib_conn->state_mutex);
        iser_conn_terminate(ib_conn);
 
        /*
-        * if iser_conn and iscsi_conn are bound, we must wait iscsi_conn_stop
-        * call and ISER_CONN_DOWN state before freeing the iser resources.
-        * otherwise we are safe to free resources immediately.
+        * if iser_conn and iscsi_conn are bound, we must wait for
+        * iscsi_conn_stop and flush errors completion before freeing
+        * the iser resources. Otherwise we are safe to free resources
+        * immediately.
         */
        if (ib_conn->iscsi_conn) {
                INIT_WORK(&ib_conn->release_work, iser_release_work);
                queue_work(release_wq, &ib_conn->release_work);
+               mutex_unlock(&ib_conn->state_mutex);
        } else {
+               ib_conn->state = ISER_CONN_DOWN;
+               mutex_unlock(&ib_conn->state_mutex);
                iser_conn_release(ib_conn);
        }
+       iscsi_destroy_endpoint(ep);
 }
 
 static umode_t iser_attr_is_visible(int param_type, int param)
index 97cd385bf7f72c6d0fdc664e30a949b343564888..c877dad381cb95acb1979e158536b281265f5573 100644 (file)
@@ -326,7 +326,6 @@ struct iser_conn {
        struct iser_device           *device;       /* device context          */
        struct rdma_cm_id            *cma_id;       /* CMA ID                  */
        struct ib_qp                 *qp;           /* QP                      */
-       wait_queue_head_t            wait;          /* waitq for conn/disconn  */
        unsigned                     qp_max_recv_dtos; /* num of rx buffers */
        unsigned                     qp_max_recv_dtos_mask; /* above minus 1 */
        unsigned                     min_posted_rx; /* qp_max_recv_dtos >> 2 */
@@ -335,6 +334,9 @@ struct iser_conn {
        char                         name[ISER_OBJECT_NAME_SIZE];
        struct work_struct           release_work;
        struct completion            stop_completion;
+       struct mutex                 state_mutex;
+       struct completion            flush_completion;
+       struct completion            up_completion;
        struct list_head             conn_list;       /* entry in ig conn list */
 
        char                         *login_buf;
@@ -448,8 +450,8 @@ int  iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *task,
                               enum iser_data_dir cmd_dir);
 
 int  iser_connect(struct iser_conn   *ib_conn,
-                 struct sockaddr_in *src_addr,
-                 struct sockaddr_in *dst_addr,
+                 struct sockaddr    *src_addr,
+                 struct sockaddr    *dst_addr,
                  int                non_blocking);
 
 int  iser_reg_page_vec(struct iser_conn     *ib_conn,
index ea01075f9f9b81b180ecfe85f02d4b18a77a494f..3ef167f97d6fd8468a266b3d45198ae72e30b198 100644 (file)
@@ -491,10 +491,9 @@ out_err:
 }
 
 /**
- * releases the QP objects, returns 0 on success,
- * -1 on failure
+ * releases the QP object
  */
-static int iser_free_ib_conn_res(struct iser_conn *ib_conn)
+static void iser_free_ib_conn_res(struct iser_conn *ib_conn)
 {
        int cq_index;
        BUG_ON(ib_conn == NULL);
@@ -513,8 +512,6 @@ static int iser_free_ib_conn_res(struct iser_conn *ib_conn)
        }
 
        ib_conn->qp       = NULL;
-
-       return 0;
 }
 
 /**
@@ -568,31 +565,40 @@ static void iser_device_try_release(struct iser_device *device)
        mutex_unlock(&ig.device_list_mutex);
 }
 
+/* Called with state mutex held. Moves state from comp to exch and returns
+ * nonzero when the current state equals comp; otherwise leaves the state
+ * unchanged and returns 0. */
 static int iser_conn_state_comp_exch(struct iser_conn *ib_conn,
                                     enum iser_ib_conn_state comp,
                                     enum iser_ib_conn_state exch)
 {
        int ret;
 
-       spin_lock_bh(&ib_conn->lock);
        if ((ret = (ib_conn->state == comp)))
                ib_conn->state = exch;
-       spin_unlock_bh(&ib_conn->lock);
        return ret;
 }
 
 void iser_release_work(struct work_struct *work)
 {
        struct iser_conn *ib_conn;
+       int rc;
 
        ib_conn = container_of(work, struct iser_conn, release_work);
 
        /* wait for .conn_stop callback */
-       wait_for_completion(&ib_conn->stop_completion);
+       rc = wait_for_completion_timeout(&ib_conn->stop_completion, 30 * HZ);
+       WARN_ON(rc == 0);
 
        /* wait for the qp`s post send and post receive buffers to empty */
-       wait_event_interruptible(ib_conn->wait,
-                                ib_conn->state == ISER_CONN_DOWN);
+       rc = wait_for_completion_timeout(&ib_conn->flush_completion, 30 * HZ);
+       WARN_ON(rc == 0);
+
+       ib_conn->state = ISER_CONN_DOWN;
+
+       mutex_lock(&ib_conn->state_mutex);
+       ib_conn->state = ISER_CONN_DOWN;
+       mutex_unlock(&ib_conn->state_mutex);
 
        iser_conn_release(ib_conn);
 }
@@ -604,23 +610,27 @@ void iser_conn_release(struct iser_conn *ib_conn)
 {
        struct iser_device  *device = ib_conn->device;
 
-       BUG_ON(ib_conn->state == ISER_CONN_UP);
-
        mutex_lock(&ig.connlist_mutex);
        list_del(&ib_conn->conn_list);
        mutex_unlock(&ig.connlist_mutex);
+
+       mutex_lock(&ib_conn->state_mutex);
+       BUG_ON(ib_conn->state != ISER_CONN_DOWN);
+
        iser_free_rx_descriptors(ib_conn);
        iser_free_ib_conn_res(ib_conn);
        ib_conn->device = NULL;
        /* on EVENT_ADDR_ERROR there's no device yet for this conn */
        if (device != NULL)
                iser_device_try_release(device);
+       mutex_unlock(&ib_conn->state_mutex);
+
        /* if cma handler context, the caller actually destroy the id */
        if (ib_conn->cma_id != NULL) {
                rdma_destroy_id(ib_conn->cma_id);
                ib_conn->cma_id = NULL;
        }
-       iscsi_destroy_endpoint(ib_conn->ep);
+       kfree(ib_conn);
 }
 
 /**
@@ -642,22 +652,31 @@ void iser_conn_terminate(struct iser_conn *ib_conn)
                         ib_conn,err);
 }
 
+/**
+ * Called with state mutex held
+ **/
 static void iser_connect_error(struct rdma_cm_id *cma_id)
 {
        struct iser_conn *ib_conn;
 
        ib_conn = (struct iser_conn *)cma_id->context;
-
        ib_conn->state = ISER_CONN_DOWN;
-       wake_up_interruptible(&ib_conn->wait);
 }
 
+/**
+ * Called with state mutex held
+ **/
 static void iser_addr_handler(struct rdma_cm_id *cma_id)
 {
        struct iser_device *device;
        struct iser_conn   *ib_conn;
        int    ret;
 
+       ib_conn = (struct iser_conn *)cma_id->context;
+       if (ib_conn->state != ISER_CONN_PENDING)
+               /* bailout */
+               return;
+
        device = iser_device_find_by_ib_device(cma_id);
        if (!device) {
                iser_err("device lookup/creation failed\n");
@@ -665,7 +684,6 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
                return;
        }
 
-       ib_conn = (struct iser_conn *)cma_id->context;
        ib_conn->device = device;
 
        /* connection T10-PI support */
@@ -689,18 +707,27 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
        }
 }
 
+/**
+ * Called with state mutex held
+ **/
 static void iser_route_handler(struct rdma_cm_id *cma_id)
 {
        struct rdma_conn_param conn_param;
        int    ret;
        struct iser_cm_hdr req_hdr;
+       struct iser_conn *ib_conn = (struct iser_conn *)cma_id->context;
+       struct iser_device *device = ib_conn->device;
+
+       if (ib_conn->state != ISER_CONN_PENDING)
+               /* bailout */
+               return;
 
        ret = iser_create_ib_conn_res((struct iser_conn *)cma_id->context);
        if (ret)
                goto failure;
 
        memset(&conn_param, 0, sizeof conn_param);
-       conn_param.responder_resources = 4;
+       conn_param.responder_resources = device->dev_attr.max_qp_rd_atom;
        conn_param.initiator_depth     = 1;
        conn_param.retry_count         = 7;
        conn_param.rnr_retry_count     = 6;
@@ -728,12 +755,16 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id)
        struct ib_qp_attr attr;
        struct ib_qp_init_attr init_attr;
 
+       ib_conn = (struct iser_conn *)cma_id->context;
+       if (ib_conn->state != ISER_CONN_PENDING)
+               /* bailout */
+               return;
+
        (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr);
        iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num);
 
-       ib_conn = (struct iser_conn *)cma_id->context;
-       if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_PENDING, ISER_CONN_UP))
-               wake_up_interruptible(&ib_conn->wait);
+       ib_conn->state = ISER_CONN_UP;
+       complete(&ib_conn->up_completion);
 }
 
 static void iser_disconnected_handler(struct rdma_cm_id *cma_id)
@@ -752,19 +783,25 @@ static void iser_disconnected_handler(struct rdma_cm_id *cma_id)
                        iser_err("iscsi_iser connection isn't bound\n");
        }
 
-       /* Complete the termination process if no posts are pending */
+       /* Complete the termination process if no posts are pending. This code
+        * block also exists in iser_handle_comp_error(), but it is needed here
+        * for cases of no flushes at all, e.g. discovery over rdma.
+        */
        if (ib_conn->post_recv_buf_count == 0 &&
            (atomic_read(&ib_conn->post_send_buf_count) == 0)) {
-               ib_conn->state = ISER_CONN_DOWN;
-               wake_up_interruptible(&ib_conn->wait);
+               complete(&ib_conn->flush_completion);
        }
 }
 
 static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
 {
+       struct iser_conn *ib_conn;
+
+       ib_conn = (struct iser_conn *)cma_id->context;
        iser_info("event %d status %d conn %p id %p\n",
                  event->event, event->status, cma_id->context, cma_id);
 
+       mutex_lock(&ib_conn->state_mutex);
        switch (event->event) {
        case RDMA_CM_EVENT_ADDR_RESOLVED:
                iser_addr_handler(cma_id);
@@ -785,24 +822,28 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve
        case RDMA_CM_EVENT_DISCONNECTED:
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
        case RDMA_CM_EVENT_ADDR_CHANGE:
+       case RDMA_CM_EVENT_TIMEWAIT_EXIT:
                iser_disconnected_handler(cma_id);
                break;
        default:
                iser_err("Unexpected RDMA CM event (%d)\n", event->event);
                break;
        }
+       mutex_unlock(&ib_conn->state_mutex);
        return 0;
 }
 
 void iser_conn_init(struct iser_conn *ib_conn)
 {
        ib_conn->state = ISER_CONN_INIT;
-       init_waitqueue_head(&ib_conn->wait);
        ib_conn->post_recv_buf_count = 0;
        atomic_set(&ib_conn->post_send_buf_count, 0);
        init_completion(&ib_conn->stop_completion);
+       init_completion(&ib_conn->flush_completion);
+       init_completion(&ib_conn->up_completion);
        INIT_LIST_HEAD(&ib_conn->conn_list);
        spin_lock_init(&ib_conn->lock);
+       mutex_init(&ib_conn->state_mutex);
 }
 
  /**
@@ -810,22 +851,21 @@ void iser_conn_init(struct iser_conn *ib_conn)
  * sleeps until the connection is established or rejected
  */
 int iser_connect(struct iser_conn   *ib_conn,
-                struct sockaddr_in *src_addr,
-                struct sockaddr_in *dst_addr,
+                struct sockaddr    *src_addr,
+                struct sockaddr    *dst_addr,
                 int                 non_blocking)
 {
-       struct sockaddr *src, *dst;
        int err = 0;
 
-       sprintf(ib_conn->name, "%pI4:%d",
-               &dst_addr->sin_addr.s_addr, dst_addr->sin_port);
+       mutex_lock(&ib_conn->state_mutex);
+
+       sprintf(ib_conn->name, "%pISp", dst_addr);
+
+       iser_info("connecting to: %s\n", ib_conn->name);
 
        /* the device is known only --after-- address resolution */
        ib_conn->device = NULL;
 
-       iser_info("connecting to: %pI4, port 0x%x\n",
-                 &dst_addr->sin_addr, dst_addr->sin_port);
-
        ib_conn->state = ISER_CONN_PENDING;
 
        ib_conn->cma_id = rdma_create_id(iser_cma_handler,
@@ -837,23 +877,21 @@ int iser_connect(struct iser_conn   *ib_conn,
                goto id_failure;
        }
 
-       src = (struct sockaddr *)src_addr;
-       dst = (struct sockaddr *)dst_addr;
-       err = rdma_resolve_addr(ib_conn->cma_id, src, dst, 1000);
+       err = rdma_resolve_addr(ib_conn->cma_id, src_addr, dst_addr, 1000);
        if (err) {
                iser_err("rdma_resolve_addr failed: %d\n", err);
                goto addr_failure;
        }
 
        if (!non_blocking) {
-               wait_event_interruptible(ib_conn->wait,
-                                        (ib_conn->state != ISER_CONN_PENDING));
+               wait_for_completion_interruptible(&ib_conn->up_completion);
 
                if (ib_conn->state != ISER_CONN_UP) {
                        err =  -EIO;
                        goto connect_failure;
                }
        }
+       mutex_unlock(&ib_conn->state_mutex);
 
        mutex_lock(&ig.connlist_mutex);
        list_add(&ib_conn->conn_list, &ig.connlist);
@@ -865,6 +903,7 @@ id_failure:
 addr_failure:
        ib_conn->state = ISER_CONN_DOWN;
 connect_failure:
+       mutex_unlock(&ib_conn->state_mutex);
        iser_conn_release(ib_conn);
        return err;
 }
@@ -1049,18 +1088,19 @@ static void iser_handle_comp_error(struct iser_tx_desc *desc,
 
        if (ib_conn->post_recv_buf_count == 0 &&
            atomic_read(&ib_conn->post_send_buf_count) == 0) {
-               /* getting here when the state is UP means that the conn is *
-                * being terminated asynchronously from the iSCSI layer's   *
-                * perspective.                                             */
-               if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP,
-                   ISER_CONN_TERMINATING))
+               /**
+                * getting here when the state is UP means that the conn is
+                * being terminated asynchronously from the iSCSI layer's
+                * perspective. It is safe to peek at the connection state
+                * since iscsi_conn_failure is allowed to be called twice.
+                **/
+               if (ib_conn->state == ISER_CONN_UP)
                        iscsi_conn_failure(ib_conn->iscsi_conn,
                                           ISCSI_ERR_CONN_FAILED);
 
                /* no more non completed posts to the QP, complete the
                 * termination process w.o worrying on disconnect event */
-               ib_conn->state = ISER_CONN_DOWN;
-               wake_up_interruptible(&ib_conn->wait);
+               complete(&ib_conn->flush_completion);
        }
 }
 
index e3c2c5b4297f69d033629717a87dea1f1fb6fdf9..62d2a18e1b419225b312326690ec624563f5aa7b 100644 (file)
@@ -130,6 +130,7 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr);
 static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
 
 static struct scsi_transport_template *ib_srp_transport_template;
+static struct workqueue_struct *srp_remove_wq;
 
 static struct ib_client srp_client = {
        .name   = "srp",
@@ -731,7 +732,7 @@ static bool srp_queue_remove_work(struct srp_target_port *target)
        spin_unlock_irq(&target->lock);
 
        if (changed)
-               queue_work(system_long_wq, &target->remove_work);
+               queue_work(srp_remove_wq, &target->remove_work);
 
        return changed;
 }
@@ -1643,10 +1644,14 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp)
                                     SCSI_SENSE_BUFFERSIZE));
                }
 
-               if (rsp->flags & (SRP_RSP_FLAG_DOOVER | SRP_RSP_FLAG_DOUNDER))
-                       scsi_set_resid(scmnd, be32_to_cpu(rsp->data_out_res_cnt));
-               else if (rsp->flags & (SRP_RSP_FLAG_DIOVER | SRP_RSP_FLAG_DIUNDER))
+               if (unlikely(rsp->flags & SRP_RSP_FLAG_DIUNDER))
                        scsi_set_resid(scmnd, be32_to_cpu(rsp->data_in_res_cnt));
+               else if (unlikely(rsp->flags & SRP_RSP_FLAG_DIOVER))
+                       scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_in_res_cnt));
+               else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOUNDER))
+                       scsi_set_resid(scmnd, be32_to_cpu(rsp->data_out_res_cnt));
+               else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOOVER))
+                       scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_out_res_cnt));
 
                srp_free_req(target, req, scmnd,
                             be32_to_cpu(rsp->req_lim_delta));
@@ -3261,9 +3266,10 @@ static void srp_remove_one(struct ib_device *device)
                spin_unlock(&host->target_lock);
 
                /*
-                * Wait for target port removal tasks.
+                * Wait for tl_err and target port removal tasks.
                 */
                flush_workqueue(system_long_wq);
+               flush_workqueue(srp_remove_wq);
 
                kfree(host);
        }
@@ -3313,16 +3319,22 @@ static int __init srp_init_module(void)
                indirect_sg_entries = cmd_sg_entries;
        }
 
+       srp_remove_wq = create_workqueue("srp_remove");
+       if (!srp_remove_wq) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ret = -ENOMEM;
        ib_srp_transport_template =
                srp_attach_transport(&ib_srp_transport_functions);
        if (!ib_srp_transport_template)
-               return -ENOMEM;
+               goto destroy_wq;
 
        ret = class_register(&srp_class);
        if (ret) {
                pr_err("couldn't register class infiniband_srp\n");
-               srp_release_transport(ib_srp_transport_template);
-               return ret;
+               goto release_tr;
        }
 
        ib_sa_register_client(&srp_sa_client);
@@ -3330,13 +3342,22 @@ static int __init srp_init_module(void)
        ret = ib_register_client(&srp_client);
        if (ret) {
                pr_err("couldn't register IB client\n");
-               srp_release_transport(ib_srp_transport_template);
-               ib_sa_unregister_client(&srp_sa_client);
-               class_unregister(&srp_class);
-               return ret;
+               goto unreg_sa;
        }
 
-       return 0;
+out:
+       return ret;
+
+unreg_sa:
+       ib_sa_unregister_client(&srp_sa_client);
+       class_unregister(&srp_class);
+
+release_tr:
+       srp_release_transport(ib_srp_transport_template);
+
+destroy_wq:
+       destroy_workqueue(srp_remove_wq);
+       goto out;
 }
 
 static void __exit srp_cleanup_module(void)
@@ -3345,6 +3366,7 @@ static void __exit srp_cleanup_module(void)
        ib_sa_unregister_client(&srp_sa_client);
        class_unregister(&srp_class);
        srp_release_transport(ib_srp_transport_template);
+       destroy_workqueue(srp_remove_wq);
 }
 
 module_init(srp_init_module);
index fe09f2788b15e7e42b1c076f67e17b3b2ca2f268..d28a8c284da90d57092577f648964d7bdb5731d1 100644 (file)
@@ -198,6 +198,7 @@ static void srpt_event_handler(struct ib_event_handler *handler,
        case IB_EVENT_PKEY_CHANGE:
        case IB_EVENT_SM_CHANGE:
        case IB_EVENT_CLIENT_REREGISTER:
+       case IB_EVENT_GID_CHANGE:
                /* Refresh port data asynchronously. */
                if (event->element.port_num <= sdev->device->phys_port_cnt) {
                        sport = &sdev->port[event->element.port_num - 1];
@@ -563,7 +564,7 @@ static int srpt_refresh_port(struct srpt_port *sport)
                                                         &reg_req, 0,
                                                         srpt_mad_send_handler,
                                                         srpt_mad_recv_handler,
-                                                        sport);
+                                                        sport, 0);
                if (IS_ERR(sport->mad_agent)) {
                        ret = PTR_ERR(sport->mad_agent);
                        sport->mad_agent = NULL;
index 443d03fbac4705bd97f5acf77f622e0562b7fc55..8eeab72b93e2c4fcf6573e6a63d768f40f8e5320 100644 (file)
@@ -331,7 +331,7 @@ static int bch_allocator_thread(void *arg)
                                mutex_unlock(&ca->set->bucket_lock);
                                blkdev_issue_discard(ca->bdev,
                                        bucket_to_sector(ca->set, bucket),
-                                       ca->sb.block_size, GFP_KERNEL, 0);
+                                       ca->sb.bucket_size, GFP_KERNEL, 0);
                                mutex_lock(&ca->set->bucket_lock);
                        }
 
index d2ebcf3230942ab7872025c9fbf177ae515aded4..04f7bc28ef832b6dded6d10e810ddbfbfada4fca 100644 (file)
@@ -477,9 +477,13 @@ struct gc_stat {
  * CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
  * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e.
  * flushing dirty data).
+ *
+ * CACHE_SET_RUNNING means all cache devices have been registered and journal
+ * replay is complete.
  */
 #define CACHE_SET_UNREGISTERING                0
 #define        CACHE_SET_STOPPING              1
+#define        CACHE_SET_RUNNING               2
 
 struct cache_set {
        struct closure          cl;
index 54541641530569c442f7113b687428fad4bb18d6..646fe85261c17bcfb43ff89b54586838a00c39e7 100644 (file)
@@ -1182,7 +1182,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
 {
        uint64_t start_time;
        bool used_mempool = false;
-       struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
+       struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT,
                                                     order);
        if (!out) {
                struct page *outp;
index 5f6728d5d4ddb0f406b0b7296025f412f193793b..ae964624efb248d59f9af530c44b68175ac4324e 100644 (file)
@@ -453,7 +453,7 @@ static inline bool bch_bkey_equal_header(const struct bkey *l,
 {
        return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
                KEY_PTRS(l) == KEY_PTRS(r) &&
-               KEY_CSUM(l) == KEY_CSUM(l));
+               KEY_CSUM(l) == KEY_CSUM(r));
 }
 
 /* Keylists */
index 7347b61009615089e307fba6b5964fcb83ad2040..00cde40db57269bb173104ab3632b8a99e95c4f6 100644 (file)
 ({                                                                     \
        int _r, l = (b)->level - 1;                                     \
        bool _w = l <= (op)->lock;                                      \
-       struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
+       struct btree *_child = bch_btree_node_get((b)->c, op, key, l,   \
+                                                 _w, b);               \
        if (!IS_ERR(_child)) {                                          \
-               _child->parent = (b);                                   \
                _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);       \
                rw_unlock(_w, _child);                                  \
        } else                                                          \
                rw_lock(_w, _b, _b->level);                             \
                if (_b == (c)->root &&                                  \
                    _w == insert_lock(op, _b)) {                        \
-                       _b->parent = NULL;                              \
                        _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);   \
                }                                                       \
                rw_unlock(_w, _b);                                      \
@@ -202,7 +201,7 @@ void bch_btree_node_read_done(struct btree *b)
        struct bset *i = btree_bset_first(b);
        struct btree_iter *iter;
 
-       iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
+       iter = mempool_alloc(b->c->fill_iter, GFP_NOIO);
        iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
        iter->used = 0;
 
@@ -421,7 +420,7 @@ static void do_btree_node_write(struct btree *b)
        SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
                       bset_sector_offset(&b->keys, i));
 
-       if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
+       if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
                int j;
                struct bio_vec *bv;
                void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -967,7 +966,8 @@ err:
  * level and op->lock.
  */
 struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
-                                struct bkey *k, int level, bool write)
+                                struct bkey *k, int level, bool write,
+                                struct btree *parent)
 {
        int i = 0;
        struct btree *b;
@@ -1002,6 +1002,7 @@ retry:
                BUG_ON(b->level != level);
        }
 
+       b->parent = parent;
        b->accessed = 1;
 
        for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
@@ -1022,15 +1023,16 @@ retry:
        return b;
 }
 
-static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
+static void btree_node_prefetch(struct btree *parent, struct bkey *k)
 {
        struct btree *b;
 
-       mutex_lock(&c->bucket_lock);
-       b = mca_alloc(c, NULL, k, level);
-       mutex_unlock(&c->bucket_lock);
+       mutex_lock(&parent->c->bucket_lock);
+       b = mca_alloc(parent->c, NULL, k, parent->level - 1);
+       mutex_unlock(&parent->c->bucket_lock);
 
        if (!IS_ERR_OR_NULL(b)) {
+               b->parent = parent;
                bch_btree_node_read(b);
                rw_unlock(true, b);
        }
@@ -1060,15 +1062,16 @@ static void btree_node_free(struct btree *b)
        mutex_unlock(&b->c->bucket_lock);
 }
 
-struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
-                                  int level)
+struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+                                    int level, bool wait,
+                                    struct btree *parent)
 {
        BKEY_PADDED(key) k;
        struct btree *b = ERR_PTR(-EAGAIN);
 
        mutex_lock(&c->bucket_lock);
 retry:
-       if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL))
+       if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
                goto err;
 
        bkey_put(c, &k.key);
@@ -1085,6 +1088,7 @@ retry:
        }
 
        b->accessed = 1;
+       b->parent = parent;
        bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
 
        mutex_unlock(&c->bucket_lock);
@@ -1096,14 +1100,21 @@ err_free:
 err:
        mutex_unlock(&c->bucket_lock);
 
-       trace_bcache_btree_node_alloc_fail(b);
+       trace_bcache_btree_node_alloc_fail(c);
        return b;
 }
 
+static struct btree *bch_btree_node_alloc(struct cache_set *c,
+                                         struct btree_op *op, int level,
+                                         struct btree *parent)
+{
+       return __bch_btree_node_alloc(c, op, level, op != NULL, parent);
+}
+
 static struct btree *btree_node_alloc_replacement(struct btree *b,
                                                  struct btree_op *op)
 {
-       struct btree *n = bch_btree_node_alloc(b->c, op, b->level);
+       struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
        if (!IS_ERR_OR_NULL(n)) {
                mutex_lock(&n->write_lock);
                bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
@@ -1403,6 +1414,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
        BUG_ON(btree_bset_first(new_nodes[0])->keys);
        btree_node_free(new_nodes[0]);
        rw_unlock(true, new_nodes[0]);
+       new_nodes[0] = NULL;
 
        for (i = 0; i < nodes; i++) {
                if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
@@ -1516,7 +1528,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
                k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
                if (k) {
                        r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
-                                                 true);
+                                                 true, b);
                        if (IS_ERR(r->b)) {
                                ret = PTR_ERR(r->b);
                                break;
@@ -1811,7 +1823,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
                        k = bch_btree_iter_next_filter(&iter, &b->keys,
                                                       bch_ptr_bad);
                        if (k)
-                               btree_node_prefetch(b->c, k, b->level - 1);
+                               btree_node_prefetch(b, k);
 
                        if (p)
                                ret = btree(check_recurse, p, b, op);
@@ -1976,12 +1988,12 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
                trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
 
-               n2 = bch_btree_node_alloc(b->c, op, b->level);
+               n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent);
                if (IS_ERR(n2))
                        goto err_free1;
 
                if (!b->parent) {
-                       n3 = bch_btree_node_alloc(b->c, op, b->level + 1);
+                       n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL);
                        if (IS_ERR(n3))
                                goto err_free2;
                }
index 91dfa5e696857ded36b3de7b66a8f862d9540cea..5c391fa01bedbfba3f1dea062605460ccadc1c6a 100644 (file)
@@ -242,9 +242,10 @@ void __bch_btree_node_write(struct btree *, struct closure *);
 void bch_btree_node_write(struct btree *, struct closure *);
 
 void bch_btree_set_root(struct btree *);
-struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int);
+struct btree *__bch_btree_node_alloc(struct cache_set *, struct btree_op *,
+                                    int, bool, struct btree *);
 struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
-                                struct bkey *, int, bool);
+                                struct bkey *, int, bool, struct btree *);
 
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
                               struct bkey *);
index 3a0de4cf9771031e9d4707fc49f356aee48b50e7..243de0bf15cdbf359965dff55cac4762b804362f 100644 (file)
@@ -474,9 +474,8 @@ out:
        return false;
 }
 
-static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
+bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k)
 {
-       struct btree *b = container_of(bk, struct btree, keys);
        char buf[80];
 
        if (!KEY_SIZE(k))
@@ -485,16 +484,22 @@ static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
        if (KEY_SIZE(k) > KEY_OFFSET(k))
                goto bad;
 
-       if (__ptr_invalid(b->c, k))
+       if (__ptr_invalid(c, k))
                goto bad;
 
        return false;
 bad:
        bch_extent_to_text(buf, sizeof(buf), k);
-       cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
+       cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
        return true;
 }
 
+static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
+{
+       struct btree *b = container_of(bk, struct btree, keys);
+       return __bch_extent_invalid(b->c, k);
+}
+
 static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
                                     unsigned ptr)
 {
index e4e23409782d1957baacc8022e525d65aba7f094..e2ed54054e7a9106b50d0d2a3f6eb7967ce4e494 100644 (file)
@@ -9,5 +9,6 @@ struct cache_set;
 
 void bch_extent_to_text(char *, size_t, const struct bkey *);
 bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
+bool __bch_extent_invalid(struct cache_set *, const struct bkey *);
 
 #endif /* _BCACHE_EXTENTS_H */
index 59e82021b5bb320d9c03606e35e0d5347417f1ff..fe080ad0e55841e5c95bfcb3dcf3a0f1a703b76c 100644 (file)
@@ -7,6 +7,7 @@
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
+#include "extents.h"
 
 #include <trace/events/bcache.h>
 
@@ -189,11 +190,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
                        if (read_bucket(l))
                                goto bsearch;
 
-               if (list_empty(list))
+               /* no journal entries on this device? */
+               if (l == ca->sb.njournal_buckets)
                        continue;
 bsearch:
+               BUG_ON(list_empty(list));
+
                /* Binary search */
-               m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
+               m = l;
+               r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
                pr_debug("starting binary search, l %u r %u", l, r);
 
                while (l + 1 < r) {
@@ -291,15 +296,16 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
 
                for (k = i->j.start;
                     k < bset_bkey_last(&i->j);
-                    k = bkey_next(k)) {
-                       unsigned j;
+                    k = bkey_next(k))
+                       if (!__bch_extent_invalid(c, k)) {
+                               unsigned j;
 
-                       for (j = 0; j < KEY_PTRS(k); j++)
-                               if (ptr_available(c, k, j))
-                                       atomic_inc(&PTR_BUCKET(c, k, j)->pin);
+                               for (j = 0; j < KEY_PTRS(k); j++)
+                                       if (ptr_available(c, k, j))
+                                               atomic_inc(&PTR_BUCKET(c, k, j)->pin);
 
-                       bch_initial_mark_key(c, 0, k);
-               }
+                               bch_initial_mark_key(c, 0, k);
+                       }
        }
 }
 
index 15fff4f68a7ce75f441a1e429d961eac2d2b0d6e..62e6e98186b5cd536d75a9bec14a2c374102abbe 100644 (file)
@@ -311,7 +311,8 @@ void bch_data_insert(struct closure *cl)
 {
        struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 
-       trace_bcache_write(op->bio, op->writeback, op->bypass);
+       trace_bcache_write(op->c, op->inode, op->bio,
+                          op->writeback, op->bypass);
 
        bch_keylist_init(&op->insert_keys);
        bio_get(op->bio);
index 926ded8ccbf58c39788a471dffb4ed7f876028bc..d4713d098a397c2f1b124f9fdd240d9907fa16d6 100644 (file)
@@ -733,8 +733,6 @@ static void bcache_device_detach(struct bcache_device *d)
 static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
                                 unsigned id)
 {
-       BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags));
-
        d->id = id;
        d->c = c;
        c->devices[id] = d;
@@ -927,6 +925,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
        list_move(&dc->list, &uncached_devices);
 
        clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
+       clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
 
        mutex_unlock(&bch_register_lock);
 
@@ -1041,6 +1040,9 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
         */
        atomic_set(&dc->count, 1);
 
+       if (bch_cached_dev_writeback_start(dc))
+               return -ENOMEM;
+
        if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
                bch_sectors_dirty_init(dc);
                atomic_set(&dc->has_dirty, 1);
@@ -1070,7 +1072,8 @@ static void cached_dev_free(struct closure *cl)
        struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
 
        cancel_delayed_work_sync(&dc->writeback_rate_update);
-       kthread_stop(dc->writeback_thread);
+       if (!IS_ERR_OR_NULL(dc->writeback_thread))
+               kthread_stop(dc->writeback_thread);
 
        mutex_lock(&bch_register_lock);
 
@@ -1081,12 +1084,8 @@ static void cached_dev_free(struct closure *cl)
 
        mutex_unlock(&bch_register_lock);
 
-       if (!IS_ERR_OR_NULL(dc->bdev)) {
-               if (dc->bdev->bd_disk)
-                       blk_sync_queue(bdev_get_queue(dc->bdev));
-
+       if (!IS_ERR_OR_NULL(dc->bdev))
                blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-       }
 
        wake_up(&unregister_wait);
 
@@ -1213,7 +1212,9 @@ void bch_flash_dev_release(struct kobject *kobj)
 static void flash_dev_free(struct closure *cl)
 {
        struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+       mutex_lock(&bch_register_lock);
        bcache_device_free(d);
+       mutex_unlock(&bch_register_lock);
        kobject_put(&d->kobj);
 }
 
@@ -1221,7 +1222,9 @@ static void flash_dev_flush(struct closure *cl)
 {
        struct bcache_device *d = container_of(cl, struct bcache_device, cl);
 
+       mutex_lock(&bch_register_lock);
        bcache_device_unlink(d);
+       mutex_unlock(&bch_register_lock);
        kobject_del(&d->kobj);
        continue_at(cl, flash_dev_free, system_wq);
 }
@@ -1277,6 +1280,9 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
        if (test_bit(CACHE_SET_STOPPING, &c->flags))
                return -EINTR;
 
+       if (!test_bit(CACHE_SET_RUNNING, &c->flags))
+               return -EPERM;
+
        u = uuid_find_empty(c);
        if (!u) {
                pr_err("Can't create volume, no room for UUID");
@@ -1346,8 +1352,11 @@ static void cache_set_free(struct closure *cl)
        bch_journal_free(c);
 
        for_each_cache(ca, c, i)
-               if (ca)
+               if (ca) {
+                       ca->set = NULL;
+                       c->cache[ca->sb.nr_this_dev] = NULL;
                        kobject_put(&ca->kobj);
+               }
 
        bch_bset_sort_state_free(&c->sort);
        free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
@@ -1405,9 +1414,11 @@ static void cache_set_flush(struct closure *cl)
                if (ca->alloc_thread)
                        kthread_stop(ca->alloc_thread);
 
-       cancel_delayed_work_sync(&c->journal.work);
-       /* flush last journal entry if needed */
-       c->journal.work.work.func(&c->journal.work.work);
+       if (c->journal.cur) {
+               cancel_delayed_work_sync(&c->journal.work);
+               /* flush last journal entry if needed */
+               c->journal.work.work.func(&c->journal.work.work);
+       }
 
        closure_return(cl);
 }
@@ -1586,7 +1597,7 @@ static void run_cache_set(struct cache_set *c)
                        goto err;
 
                err = "error reading btree root";
-               c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true);
+               c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL);
                if (IS_ERR_OR_NULL(c->root))
                        goto err;
 
@@ -1661,7 +1672,7 @@ static void run_cache_set(struct cache_set *c)
                        goto err;
 
                err = "cannot allocate new btree root";
-               c->root = bch_btree_node_alloc(c, NULL, 0);
+               c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
                if (IS_ERR_OR_NULL(c->root))
                        goto err;
 
@@ -1697,6 +1708,7 @@ static void run_cache_set(struct cache_set *c)
 
        flash_devs_run(c);
 
+       set_bit(CACHE_SET_RUNNING, &c->flags);
        return;
 err:
        closure_sync(&cl);
@@ -1760,6 +1772,7 @@ found:
                pr_debug("set version = %llu", c->sb.version);
        }
 
+       kobject_get(&ca->kobj);
        ca->set = c;
        ca->set->cache[ca->sb.nr_this_dev] = ca;
        c->cache_by_alloc[c->caches_loaded++] = ca;
@@ -1780,8 +1793,10 @@ void bch_cache_release(struct kobject *kobj)
        struct cache *ca = container_of(kobj, struct cache, kobj);
        unsigned i;
 
-       if (ca->set)
+       if (ca->set) {
+               BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
                ca->set->cache[ca->sb.nr_this_dev] = NULL;
+       }
 
        bio_split_pool_free(&ca->bio_split_hook);
 
@@ -1798,10 +1813,8 @@ void bch_cache_release(struct kobject *kobj)
        if (ca->sb_bio.bi_inline_vecs[0].bv_page)
                put_page(ca->sb_bio.bi_io_vec[0].bv_page);
 
-       if (!IS_ERR_OR_NULL(ca->bdev)) {
-               blk_sync_queue(bdev_get_queue(ca->bdev));
+       if (!IS_ERR_OR_NULL(ca->bdev))
                blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-       }
 
        kfree(ca);
        module_put(THIS_MODULE);
@@ -1844,7 +1857,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
 }
 
 static void register_cache(struct cache_sb *sb, struct page *sb_page,
-                                 struct block_device *bdev, struct cache *ca)
+                               struct block_device *bdev, struct cache *ca)
 {
        char name[BDEVNAME_SIZE];
        const char *err = "cannot allocate memory";
@@ -1877,10 +1890,12 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page,
                goto err;
 
        pr_info("registered cache device %s", bdevname(bdev, name));
+out:
+       kobject_put(&ca->kobj);
        return;
 err:
        pr_notice("error opening %s: %s", bdevname(bdev, name), err);
-       kobject_put(&ca->kobj);
+       goto out;
 }
 
 /* Global interfaces/init */
@@ -1945,10 +1960,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
        if (IS_ERR(bdev)) {
                if (bdev == ERR_PTR(-EBUSY)) {
                        bdev = lookup_bdev(strim(path));
+                       mutex_lock(&bch_register_lock);
                        if (!IS_ERR(bdev) && bch_is_open(bdev))
                                err = "device already registered";
                        else
                                err = "device busy";
+                       mutex_unlock(&bch_register_lock);
                }
                goto err;
        }
index ac7d0d1f70d7be9ae818a51c6d77c461d770eb4b..98df7572b5f7f82b9091965e199a301159374bee 100644 (file)
@@ -416,8 +416,8 @@ do {                                                                        \
                          average_frequency,    frequency_units);       \
        __print_time_stat(stats, name,                                  \
                          average_duration,     duration_units);        \
-       __print_time_stat(stats, name,                                  \
-                         max_duration,         duration_units);        \
+       sysfs_print(name ## _ ##max_duration ## _ ## duration_units,    \
+                       div_u64((stats)->max_duration, NSEC_PER_ ## duration_units));\
                                                                        \
        sysfs_print(name ## _last_ ## frequency_units, (stats)->last    \
                    ? div_s64(local_clock() - (stats)->last,            \
index f4300e4c0114a0cc1abc3b90f757a03666d2637b..f1986bcd1bf05e1058e26946c600a6e2c1f5991d 100644 (file)
@@ -239,7 +239,7 @@ static void read_dirty(struct cached_dev *dc)
                if (KEY_START(&w->key) != dc->last_read ||
                    jiffies_to_msecs(delay) > 50)
                        while (!kthread_should_stop() && delay)
-                               delay = schedule_timeout_uninterruptible(delay);
+                               delay = schedule_timeout_interruptible(delay);
 
                dc->last_read   = KEY_OFFSET(&w->key);
 
@@ -436,7 +436,7 @@ static int bch_writeback_thread(void *arg)
                        while (delay &&
                               !kthread_should_stop() &&
                               !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
-                               delay = schedule_timeout_uninterruptible(delay);
+                               delay = schedule_timeout_interruptible(delay);
                }
        }
 
@@ -478,7 +478,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
        dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
 }
 
-int bch_cached_dev_writeback_init(struct cached_dev *dc)
+void bch_cached_dev_writeback_init(struct cached_dev *dc)
 {
        sema_init(&dc->in_flight, 64);
        init_rwsem(&dc->writeback_lock);
@@ -494,14 +494,20 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc)
        dc->writeback_rate_d_term       = 30;
        dc->writeback_rate_p_term_inverse = 6000;
 
+       INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
+}
+
+int bch_cached_dev_writeback_start(struct cached_dev *dc)
+{
        dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
                                              "bcache_writeback");
        if (IS_ERR(dc->writeback_thread))
                return PTR_ERR(dc->writeback_thread);
 
-       INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
        schedule_delayed_work(&dc->writeback_rate_update,
                              dc->writeback_rate_update_seconds * HZ);
 
+       bch_writeback_queue(dc);
+
        return 0;
 }
index e2f8598937ac41ff5c7577bc5e65aeb39de95386..0a9dab187b79c7ef0a4429c4616a6985d320b964 100644 (file)
@@ -85,6 +85,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
 void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
 
 void bch_sectors_dirty_init(struct cached_dev *dc);
-int bch_cached_dev_writeback_init(struct cached_dev *);
+void bch_cached_dev_writeback_init(struct cached_dev *);
+int bch_cached_dev_writeback_start(struct cached_dev *);
 
 #endif
index d2899e7eb3aaf317a93d91978936dac5e3c7f132..06709257adde39e84c3ddd2aa0def729b974ba73 100644 (file)
@@ -330,7 +330,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
        disk_super->discard_root = cpu_to_le64(cmd->discard_root);
        disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
        disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
-       disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+       disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE);
        disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
        disk_super->cache_blocks = cpu_to_le32(0);
 
@@ -478,7 +478,7 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
                                            bool may_format_device)
 {
        int r;
-       cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE,
+       cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
                                          CACHE_METADATA_CACHE_SIZE,
                                          CACHE_MAX_CONCURRENT_LOCKS);
        if (IS_ERR(cmd->bm)) {
index cd70a78623a336956a6366a4de1b2a912821dd9a..7383c90ccdb809e72d904e4a14a642a3d34e1ed1 100644 (file)
@@ -9,19 +9,17 @@
 
 #include "dm-cache-block-types.h"
 #include "dm-cache-policy-internal.h"
+#include "persistent-data/dm-space-map-metadata.h"
 
 /*----------------------------------------------------------------*/
 
-#define DM_CACHE_METADATA_BLOCK_SIZE 4096
+#define DM_CACHE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
 
 /* FIXME: remove this restriction */
 /*
  * The metadata device is currently limited in size.
- *
- * We have one block of index, which can hold 255 index entries.  Each
- * index entry contains allocation info about 16k metadata blocks.
  */
-#define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
+#define DM_CACHE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
 
 /*
  * A metadata device larger than 16GB triggers a warning.
index 2c63326638b6d4d54af4499643ac10dd9d8ee33b..1af40ee209e2b9c0d46b9873f3bc32fd78234cd7 100644 (file)
@@ -718,6 +718,22 @@ static int bio_triggers_commit(struct cache *cache, struct bio *bio)
        return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
 }
 
+/*
+ * Increment the deferred set for 'bio' and save the entry in its per-bio data.
+ * Must be called whilst the prison cell is held, hence the 'cell' parameter.
+ */
+static void inc_ds(struct cache *cache, struct bio *bio,
+                  struct dm_bio_prison_cell *cell)
+{
+       size_t pb_data_size = get_per_bio_data_size(cache);
+       struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+       BUG_ON(!cell); /* a cell must have been supplied by the caller */
+       BUG_ON(pb->all_io_entry); /* the bio must not already carry an entry */
+
+       pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+}
+
 static void issue(struct cache *cache, struct bio *bio)
 {
        unsigned long flags;
@@ -737,6 +753,12 @@ static void issue(struct cache *cache, struct bio *bio)
        spin_unlock_irqrestore(&cache->lock, flags);
 }
 
+static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
+{
+       inc_ds(cache, bio, cell); /* take the deferred-set entry first (BUGs if cell is NULL) */
+       issue(cache, bio); /* then pass the bio on to issue() */
+}
+
 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
 {
        unsigned long flags;
@@ -1015,6 +1037,11 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
 
        dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
        remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
+
+       /*
+        * No need to inc_ds() here, since the cell will be held for the
+        * duration of the io.
+        */
        generic_make_request(bio);
 }
 
@@ -1115,8 +1142,7 @@ static void check_for_quiesced_migrations(struct cache *cache,
                return;
 
        INIT_LIST_HEAD(&work);
-       if (pb->all_io_entry)
-               dm_deferred_entry_dec(pb->all_io_entry, &work);
+       dm_deferred_entry_dec(pb->all_io_entry, &work);
 
        if (!list_empty(&work))
                queue_quiesced_migrations(cache, &work);
@@ -1252,6 +1278,11 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
        else
                remap_to_cache(cache, bio, 0);
 
+       /*
+        * REQ_FLUSH is not directed at any particular block so we don't
+        * need to inc_ds().  REQ_FUA's are split into a write + REQ_FLUSH
+        * by dm-core.
+        */
        issue(cache, bio);
 }
 
@@ -1301,15 +1332,6 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
                   &cache->stats.read_miss : &cache->stats.write_miss);
 }
 
-static void issue_cache_bio(struct cache *cache, struct bio *bio,
-                           struct per_bio_data *pb,
-                           dm_oblock_t oblock, dm_cblock_t cblock)
-{
-       pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-       remap_to_cache_dirty(cache, bio, oblock, cblock);
-       issue(cache, bio);
-}
-
 static void process_bio(struct cache *cache, struct prealloc *structs,
                        struct bio *bio)
 {
@@ -1318,8 +1340,6 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
        dm_oblock_t block = get_bio_block(cache, bio);
        struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
        struct policy_result lookup_result;
-       size_t pb_data_size = get_per_bio_data_size(cache);
-       struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
        bool discarded_block = is_discarded_oblock(cache, block);
        bool passthrough = passthrough_mode(&cache->features);
        bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
@@ -1359,9 +1379,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 
                        } else {
                                /* FIXME: factor out issue_origin() */
-                               pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
                                remap_to_origin_clear_discard(cache, bio, block);
-                               issue(cache, bio);
+                               inc_and_issue(cache, bio, new_ocell);
                        }
                } else {
                        inc_hit_counter(cache, bio);
@@ -1369,20 +1388,21 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
                        if (bio_data_dir(bio) == WRITE &&
                            writethrough_mode(&cache->features) &&
                            !is_dirty(cache, lookup_result.cblock)) {
-                               pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
                                remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
-                               issue(cache, bio);
-                       } else
-                               issue_cache_bio(cache, bio, pb, block, lookup_result.cblock);
+                               inc_and_issue(cache, bio, new_ocell);
+
+                       } else  {
+                               remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+                               inc_and_issue(cache, bio, new_ocell);
+                       }
                }
 
                break;
 
        case POLICY_MISS:
                inc_miss_counter(cache, bio);
-               pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
                remap_to_origin_clear_discard(cache, bio, block);
-               issue(cache, bio);
+               inc_and_issue(cache, bio, new_ocell);
                break;
 
        case POLICY_NEW:
@@ -1501,6 +1521,9 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
        bio_list_init(&cache->deferred_flush_bios);
        spin_unlock_irqrestore(&cache->lock, flags);
 
+       /*
+        * These bios have already been through inc_ds()
+        */
        while ((bio = bio_list_pop(&bios)))
                submit_bios ? generic_make_request(bio) : bio_io_error(bio);
 }
@@ -1518,6 +1541,9 @@ static void process_deferred_writethrough_bios(struct cache *cache)
        bio_list_init(&cache->deferred_writethrough_bios);
        spin_unlock_irqrestore(&cache->lock, flags);
 
+       /*
+        * These bios have already been through inc_ds()
+        */
        while ((bio = bio_list_pop(&bios)))
                generic_make_request(bio);
 }
@@ -1694,6 +1720,7 @@ static void do_worker(struct work_struct *ws)
 
                if (commit_if_needed(cache)) {
                        process_deferred_flush_bios(cache, false);
+                       process_migrations(cache, &cache->need_commit_migrations, migration_failure);
 
                        /*
                         * FIXME: rollback metadata or just go into a
@@ -2406,16 +2433,13 @@ out:
        return r;
 }
 
-static int cache_map(struct dm_target *ti, struct bio *bio)
+static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
 {
-       struct cache *cache = ti->private;
-
        int r;
        dm_oblock_t block = get_bio_block(cache, bio);
        size_t pb_data_size = get_per_bio_data_size(cache);
        bool can_migrate = false;
        bool discarded_block;
-       struct dm_bio_prison_cell *cell;
        struct policy_result lookup_result;
        struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
 
@@ -2437,15 +2461,15 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
        /*
         * Check to see if that block is currently migrating.
         */
-       cell = alloc_prison_cell(cache);
-       if (!cell) {
+       *cell = alloc_prison_cell(cache);
+       if (!*cell) {
                defer_bio(cache, bio);
                return DM_MAPIO_SUBMITTED;
        }
 
-       r = bio_detain(cache, block, bio, cell,
+       r = bio_detain(cache, block, bio, *cell,
                       (cell_free_fn) free_prison_cell,
-                      cache, &cell);
+                      cache, cell);
        if (r) {
                if (r < 0)
                        defer_bio(cache, bio);
@@ -2458,11 +2482,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
        r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
                       bio, &lookup_result);
        if (r == -EWOULDBLOCK) {
-               cell_defer(cache, cell, true);
+               cell_defer(cache, *cell, true);
                return DM_MAPIO_SUBMITTED;
 
        } else if (r) {
                DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
+               cell_defer(cache, *cell, false);
                bio_io_error(bio);
                return DM_MAPIO_SUBMITTED;
        }
@@ -2476,52 +2501,44 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
                                 * We need to invalidate this block, so
                                 * defer for the worker thread.
                                 */
-                               cell_defer(cache, cell, true);
+                               cell_defer(cache, *cell, true);
                                r = DM_MAPIO_SUBMITTED;
 
                        } else {
-                               pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
                                inc_miss_counter(cache, bio);
                                remap_to_origin_clear_discard(cache, bio, block);
-
-                               cell_defer(cache, cell, false);
                        }
 
                } else {
                        inc_hit_counter(cache, bio);
-                       pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-
                        if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
                            !is_dirty(cache, lookup_result.cblock))
                                remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
                        else
                                remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
-
-                       cell_defer(cache, cell, false);
                }
                break;
 
        case POLICY_MISS:
                inc_miss_counter(cache, bio);
-               pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-
                if (pb->req_nr != 0) {
                        /*
                         * This is a duplicate writethrough io that is no
                         * longer needed because the block has been demoted.
                         */
                        bio_endio(bio, 0);
-                       cell_defer(cache, cell, false);
-                       return DM_MAPIO_SUBMITTED;
-               } else {
+                       cell_defer(cache, *cell, false);
+                       r = DM_MAPIO_SUBMITTED;
+
+               } else
                        remap_to_origin_clear_discard(cache, bio, block);
-                       cell_defer(cache, cell, false);
-               }
+
                break;
 
        default:
                DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
                            (unsigned) lookup_result.op);
+               cell_defer(cache, *cell, false);
                bio_io_error(bio);
                r = DM_MAPIO_SUBMITTED;
        }
@@ -2529,6 +2546,21 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
        return r;
 }
 
+static int cache_map(struct dm_target *ti, struct bio *bio)
+{
+       int r;
+       struct dm_bio_prison_cell *cell; /* filled in by __cache_map() via the out-parameter */
+       struct cache *cache = ti->private;
+       /* __cache_map() does the policy lookup and remapping; this wrapper only finishes the remapped case. */
+       r = __cache_map(cache, bio, &cell);
+       if (r == DM_MAPIO_REMAPPED) { /* other return values are passed straight through */
+               inc_ds(cache, bio, cell); /* must happen before cell_defer(), see inc_ds() */
+               cell_defer(cache, cell, false);
+       }
+
+       return r;
+}
+
 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
 {
        struct cache *cache = ti->private;
@@ -2808,7 +2840,7 @@ static void cache_status(struct dm_target *ti, status_type_t type,
                residency = policy_residency(cache->policy);
 
                DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
-                      (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
+                      (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
                       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
                       (unsigned long long)nr_blocks_metadata,
                       cache->sectors_per_block,
@@ -3062,7 +3094,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
         */
        if (io_opt_sectors < cache->sectors_per_block ||
            do_div(io_opt_sectors, cache->sectors_per_block)) {
-               blk_limits_io_min(limits, 0);
+               blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
                blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
        }
        set_discard_limits(cache, limits);
@@ -3072,7 +3104,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type cache_target = {
        .name = "cache",
-       .version = {1, 4, 0},
+       .version = {1, 5, 0},
        .module = THIS_MODULE,
        .ctr = cache_ctr,
        .dtr = cache_dtr,
index 4cba2d808afb451109cfbf4602790f4011eae191..2785007e0e462597d3b8e74839a8bfe25aa90b9c 100644 (file)
@@ -59,7 +59,7 @@ struct dm_crypt_io {
        int error;
        sector_t sector;
        struct dm_crypt_io *base_io;
-};
+} CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
        struct convert_context *ctx;
@@ -162,6 +162,8 @@ struct crypt_config {
         */
        unsigned int dmreq_start;
 
+       unsigned int per_bio_data_size;
+
        unsigned long flags;
        unsigned int key_size;
        unsigned int key_parts;      /* independent parts in key buffer */
@@ -895,6 +897,15 @@ static void crypt_alloc_req(struct crypt_config *cc,
            kcryptd_async_done, dmreq_of_req(cc, ctx->req));
 }
 
+static void crypt_free_req(struct crypt_config *cc,
+                          struct ablkcipher_request *req, struct bio *base_bio)
+{
+       struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
+       /* Return 'req' to req_pool only if it is not the request stored right after the per-bio dm_crypt_io. */
+       if ((struct ablkcipher_request *)(io + 1) != req)
+               mempool_free(req, cc->req_pool);
+}
+
 /*
  * Encrypt / decrypt data from one bio to another one (can be the same one)
  */
@@ -1008,12 +1019,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
        }
 }
 
-static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
-                                         struct bio *bio, sector_t sector)
+static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
+                         struct bio *bio, sector_t sector)
 {
-       struct dm_crypt_io *io;
-
-       io = mempool_alloc(cc->io_pool, GFP_NOIO);
        io->cc = cc;
        io->base_bio = bio;
        io->sector = sector;
@@ -1021,8 +1029,6 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
        io->base_io = NULL;
        io->ctx.req = NULL;
        atomic_set(&io->io_pending, 0);
-
-       return io;
 }
 
 static void crypt_inc_pending(struct dm_crypt_io *io)
@@ -1046,8 +1052,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
                return;
 
        if (io->ctx.req)
-               mempool_free(io->ctx.req, cc->req_pool);
-       mempool_free(io, cc->io_pool);
+               crypt_free_req(cc, io->ctx.req, base_bio);
+       if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
+               mempool_free(io, cc->io_pool);
 
        if (likely(!base_io))
                bio_endio(base_bio, error);
@@ -1255,8 +1262,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
                 * between fragments, so switch to a new dm_crypt_io structure.
                 */
                if (unlikely(!crypt_finished && remaining)) {
-                       new_io = crypt_io_alloc(io->cc, io->base_bio,
-                                               sector);
+                       new_io = mempool_alloc(cc->io_pool, GFP_NOIO);
+                       crypt_io_init(new_io, io->cc, io->base_bio, sector);
                        crypt_inc_pending(new_io);
                        crypt_convert_init(cc, &new_io->ctx, NULL,
                                           io->base_bio, sector);
@@ -1325,7 +1332,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
        if (error < 0)
                io->error = -EIO;
 
-       mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
+       crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
 
        if (!atomic_dec_and_test(&ctx->cc_pending))
                return;
@@ -1728,6 +1735,10 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                goto bad;
        }
 
+       cc->per_bio_data_size = ti->per_bio_data_size =
+                               sizeof(struct dm_crypt_io) + cc->dmreq_start +
+                               sizeof(struct dm_crypt_request) + cc->iv_size;
+
        cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
        if (!cc->page_pool) {
                ti->error = "Cannot allocate page mempool";
@@ -1824,7 +1835,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
                return DM_MAPIO_REMAPPED;
        }
 
-       io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
+       io = dm_per_bio_data(bio, cc->per_bio_data_size);
+       crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
+       io->ctx.req = (struct ablkcipher_request *)(io + 1);
 
        if (bio_data_dir(io->base_bio) == READ) {
                if (kcryptd_io_read(io, GFP_NOWAIT))
index db404a0f7e2c83ead70bbf32e2346ecd60aa2edf..c09359db3a90730dbd32b3bd733709f3c6444192 100644 (file)
@@ -33,7 +33,6 @@ struct dm_io_client {
 struct io {
        unsigned long error_bits;
        atomic_t count;
-       struct completion *wait;
        struct dm_io_client *client;
        io_notify_fn callback;
        void *context;
@@ -112,28 +111,27 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
  * We need an io object to keep track of the number of bios that
  * have been dispatched for a particular io.
  *---------------------------------------------------------------*/
-static void dec_count(struct io *io, unsigned int region, int error)
+static void complete_io(struct io *io)
 {
-       if (error)
-               set_bit(region, &io->error_bits);
+       unsigned long error_bits = io->error_bits;
+       io_notify_fn fn = io->callback;
+       void *context = io->context;
 
-       if (atomic_dec_and_test(&io->count)) {
-               if (io->vma_invalidate_size)
-                       invalidate_kernel_vmap_range(io->vma_invalidate_address,
-                                                    io->vma_invalidate_size);
+       if (io->vma_invalidate_size)
+               invalidate_kernel_vmap_range(io->vma_invalidate_address,
+                                            io->vma_invalidate_size);
 
-               if (io->wait)
-                       complete(io->wait);
+       mempool_free(io, io->client->pool);
+       fn(error_bits, context);
+}
 
-               else {
-                       unsigned long r = io->error_bits;
-                       io_notify_fn fn = io->callback;
-                       void *context = io->context;
+static void dec_count(struct io *io, unsigned int region, int error)
+{
+       if (error)
+               set_bit(region, &io->error_bits);
 
-                       mempool_free(io, io->client->pool);
-                       fn(r, context);
-               }
-       }
+       if (atomic_dec_and_test(&io->count))
+               complete_io(io);
 }
 
 static void endio(struct bio *bio, int error)
@@ -376,41 +374,51 @@ static void dispatch_io(int rw, unsigned int num_regions,
        dec_count(io, 0, 0);
 }
 
+struct sync_io { /* context handed to sync_io_complete() by sync_io() */
+       unsigned long error_bits; /* error value stored by sync_io_complete() */
+       struct completion wait; /* completed by sync_io_complete(); sync_io() waits on it */
+};
+
+static void sync_io_complete(unsigned long error, void *context)
+{
+       struct sync_io *sio = context; /* sync_io() installs its struct sync_io as io->context */
+       /* Record the error bits for the waiter, then signal the completion it is waiting on. */
+       sio->error_bits = error;
+       complete(&sio->wait);
+}
+
 static int sync_io(struct dm_io_client *client, unsigned int num_regions,
                   struct dm_io_region *where, int rw, struct dpages *dp,
                   unsigned long *error_bits)
 {
-       /*
-        * gcc <= 4.3 can't do the alignment for stack variables, so we must
-        * align it on our own.
-        * volatile prevents the optimizer from removing or reusing
-        * "io_" field from the stack frame (allowed in ANSI C).
-        */
-       volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
-       struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
-       DECLARE_COMPLETION_ONSTACK(wait);
+       struct io *io;
+       struct sync_io sio;
 
        if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
                WARN_ON(1);
                return -EIO;
        }
 
+       init_completion(&sio.wait);
+
+       io = mempool_alloc(client->pool, GFP_NOIO);
        io->error_bits = 0;
        atomic_set(&io->count, 1); /* see dispatch_io() */
-       io->wait = &wait;
        io->client = client;
+       io->callback = sync_io_complete;
+       io->context = &sio;
 
        io->vma_invalidate_address = dp->vma_invalidate_address;
        io->vma_invalidate_size = dp->vma_invalidate_size;
 
        dispatch_io(rw, num_regions, where, dp, io, 1);
 
-       wait_for_completion_io(&wait);
+       wait_for_completion_io(&sio.wait);
 
        if (error_bits)
-               *error_bits = io->error_bits;
+               *error_bits = sio.error_bits;
 
-       return io->error_bits ? -EIO : 0;
+       return sio.error_bits ? -EIO : 0;
 }
 
 static int async_io(struct dm_io_client *client, unsigned int num_regions,
@@ -428,7 +436,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
        io = mempool_alloc(client->pool, GFP_NOIO);
        io->error_bits = 0;
        atomic_set(&io->count, 1); /* see dispatch_io() */
-       io->wait = NULL;
        io->client = client;
        io->callback = fn;
        io->context = context;
@@ -481,9 +488,9 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
  * New collapsed (a)synchronous interface.
  *
  * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
- * the queue with blk_unplug() some time later or set REQ_SYNC in
-io_req->bi_rw. If you fail to do one of these, the IO will be submitted to
- * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c.
+ * the queue with blk_unplug() some time later or set REQ_SYNC in io_req->bi_rw.
+ * If you fail to do one of these, the IO will be submitted to the disk after
+ * q->unplug_delay, which defaults to 3ms in blk-settings.c.
  */
 int dm_io(struct dm_io_request *io_req, unsigned num_regions,
          struct dm_io_region *where, unsigned long *sync_error_bits)
index f4167b013d990c3fc25f185ee467952824f0ae5d..833d7e752f0633a1586fcefb613e501d777efb30 100644 (file)
@@ -373,8 +373,6 @@ static int __must_push_back(struct multipath *m)
                 dm_noflush_suspending(m->ti)));
 }
 
-#define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required)
-
 /*
  * Map cloned requests
  */
@@ -402,11 +400,11 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
                if (!__must_push_back(m))
                        r = -EIO;       /* Failed */
                goto out_unlock;
-       }
-       if (!pg_ready(m)) {
+       } else if (m->queue_io || m->pg_init_required) {
                __pg_init_all_paths(m);
                goto out_unlock;
        }
+
        if (set_mapinfo(m, map_context) < 0)
                /* ENOMEM, requeue */
                goto out_unlock;
index 09a688b3d48ca1445e136544321a54b112b280e1..50fca469cafd92b3dac8c4455391cf24419de7ac 100644 (file)
@@ -137,13 +137,23 @@ static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr
        *bit *= sctx->region_table_entry_bits;
 }
 
+static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
+{
+       unsigned long region_index;
+       unsigned bit;
+       /* switch_get_position() yields the region_table index and bit offset for region_nr. */
+       switch_get_position(sctx, region_nr, &region_index, &bit);
+       /* Read that table word once, shift the entry down and mask it to region_table_entry_bits bits. */
+       return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
+               ((1 << sctx->region_table_entry_bits) - 1);
+}
+
 /*
  * Find which path to use at given offset.
  */
 static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
 {
-       unsigned long region_index;
-       unsigned bit, path_nr;
+       unsigned path_nr;
        sector_t p;
 
        p = offset;
@@ -152,9 +162,7 @@ static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
        else
                sector_div(p, sctx->region_size);
 
-       switch_get_position(sctx, p, &region_index, &bit);
-       path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
-              ((1 << sctx->region_table_entry_bits) - 1);
+       path_nr = switch_region_table_read(sctx, p);
 
        /* This can only happen if the processor uses non-atomic stores. */
        if (unlikely(path_nr >= sctx->nr_paths))
@@ -363,7 +371,7 @@ static __always_inline unsigned long parse_hex(const char **string)
 }
 
 static int process_set_region_mappings(struct switch_ctx *sctx,
-                            unsigned argc, char **argv)
+                                      unsigned argc, char **argv)
 {
        unsigned i;
        unsigned long region_index = 0;
@@ -372,6 +380,51 @@ static int process_set_region_mappings(struct switch_ctx *sctx,
                unsigned long path_nr;
                const char *string = argv[i];
 
+               if ((*string & 0xdf) == 'R') {
+                       unsigned long cycle_length, num_write;
+
+                       string++;
+                       if (unlikely(*string == ',')) {
+                               DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+                               return -EINVAL;
+                       }
+                       cycle_length = parse_hex(&string);
+                       if (unlikely(*string != ',')) {
+                               DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+                               return -EINVAL;
+                       }
+                       string++;
+                       if (unlikely(!*string)) {
+                               DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+                               return -EINVAL;
+                       }
+                       num_write = parse_hex(&string);
+                       if (unlikely(*string)) {
+                               DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+                               return -EINVAL;
+                       }
+
+                       if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
+                               DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
+                                      cycle_length - 1, region_index);
+                               return -EINVAL;
+                       }
+                       if (unlikely(region_index + num_write < region_index) ||
+                           unlikely(region_index + num_write >= sctx->nr_regions)) {
+                               DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
+                                      region_index, num_write, sctx->nr_regions);
+                               return -EINVAL;
+                       }
+
+                       while (num_write--) {
+                               region_index++;
+                               path_nr = switch_region_table_read(sctx, region_index - cycle_length);
+                               switch_region_table_write(sctx, region_index, path_nr);
+                       }
+
+                       continue;
+               }
+
                if (*string == ':')
                        region_index++;
                else {
@@ -500,7 +553,7 @@ static int switch_iterate_devices(struct dm_target *ti,
 
 static struct target_type switch_target = {
        .name = "switch",
-       .version = {1, 0, 0},
+       .version = {1, 1, 0},
        .module = THIS_MODULE,
        .ctr = switch_ctr,
        .dtr = switch_dtr,
index 5f59f1e3e5b11de3156eef9e5bcd68b6a46c3b82..f9c6cb8dbcf8c493723f5ddc46af8d4e6fed892b 100644 (file)
@@ -1386,6 +1386,14 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
        return q && !blk_queue_add_random(q);
 }
 
+/*
+ * iterate_devices callout: check one underlying device for
+ * QUEUE_FLAG_NO_SG_MERGE.
+ *
+ * Returns 1 when @dev has a request queue and that queue does not have
+ * QUEUE_FLAG_NO_SG_MERGE set; returns 0 otherwise, including when the
+ * device has no queue.  @ti, @start, @len and @data are not used.
+ */
+static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
+                                  sector_t start, sector_t len, void *data)
+{
+       struct request_queue *q = bdev_get_queue(dev->bdev);
+
+       /* A device without a queue cannot vouch for sg merging. */
+       if (!q)
+               return 0;
+
+       return !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
+}
+
 static bool dm_table_all_devices_attribute(struct dm_table *t,
                                           iterate_devices_callout_fn func)
 {
@@ -1430,6 +1438,43 @@ static bool dm_table_supports_write_same(struct dm_table *t)
        return true;
 }
 
+/*
+ * iterate_devices callout: check whether one underlying device accepts
+ * discards.
+ *
+ * Returns 1 when @dev has a request queue for which blk_queue_discard()
+ * is true, 0 otherwise.  @ti, @start, @len and @data are not used.
+ */
+static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
+                                 sector_t start, sector_t len, void *data)
+{
+       struct request_queue *q = bdev_get_queue(dev->bdev);
+
+       if (!q)
+               return 0;
+
+       return blk_queue_discard(q) ? 1 : 0;
+}
+
+/*
+ * Decide whether table @t can support discards.
+ *
+ * Targets with num_discard_bios == 0 are ignored.  The table supports
+ * discards as soon as one remaining target either sets
+ * discards_supported or reports, through its iterate_devices method,
+ * an underlying device for which device_discard_capable() is true.
+ *
+ * Returns true if such a target exists, false otherwise.
+ */
+static bool dm_table_supports_discards(struct dm_table *t)
+{
+       struct dm_target *ti;
+       unsigned i;
+
+       /*
+        * Unless any target used by the table set discards_supported,
+        * require at least one underlying device to support discards.
+        * t->devices includes internal dm devices such as mirror logs
+        * so we need to use iterate_devices here, which targets
+        * supporting discard selectively must provide.
+        */
+       for (i = 0; i < dm_table_get_num_targets(t); i++) {
+               ti = dm_table_get_target(t, i);
+
+               if (!ti->num_discard_bios)
+                       continue;
+
+               if (ti->discards_supported)
+                       return true;
+
+               if (ti->type->iterate_devices &&
+                   ti->type->iterate_devices(ti, device_discard_capable, NULL))
+                       return true;
+       }
+
+       return false;
+}
+
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
                               struct queue_limits *limits)
 {
@@ -1464,6 +1509,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
        if (!dm_table_supports_write_same(t))
                q->limits.max_write_same_sectors = 0;
 
+       if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
+               queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+       else
+               queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+
        dm_table_set_integrity(t);
 
        /*
@@ -1636,39 +1686,3 @@ void dm_table_run_md_queue_async(struct dm_table *t)
 }
 EXPORT_SYMBOL(dm_table_run_md_queue_async);
 
-static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
-                                 sector_t start, sector_t len, void *data)
-{
-       struct request_queue *q = bdev_get_queue(dev->bdev);
-
-       return q && blk_queue_discard(q);
-}
-
-bool dm_table_supports_discards(struct dm_table *t)
-{
-       struct dm_target *ti;
-       unsigned i = 0;
-
-       /*
-        * Unless any target used by the table set discards_supported,
-        * require at least one underlying device to support discards.
-        * t->devices includes internal dm devices such as mirror logs
-        * so we need to use iterate_devices here, which targets
-        * supporting discard selectively must provide.
-        */
-       while (i < dm_table_get_num_targets(t)) {
-               ti = dm_table_get_target(t, i++);
-
-               if (!ti->num_discard_bios)
-                       continue;
-
-               if (ti->discards_supported)
-                       return 1;
-
-               if (ti->type->iterate_devices &&
-                   ti->type->iterate_devices(ti, device_discard_capable, NULL))
-                       return 1;
-       }
-
-       return 0;
-}
index fc9c848a60c9267a44296b54656bbdda60f40fd0..4843801173fe11a99519b59dd808e46e02425ee4 100644 (file)
@@ -227,6 +227,7 @@ struct thin_c {
        struct list_head list;
        struct dm_dev *pool_dev;
        struct dm_dev *origin_dev;
+       sector_t origin_size;
        dm_thin_id dev_id;
 
        struct pool *pool;
@@ -554,11 +555,16 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
 struct dm_thin_new_mapping {
        struct list_head list;
 
-       bool quiesced:1;
-       bool prepared:1;
        bool pass_discard:1;
        bool definitely_not_shared:1;
 
+       /*
+        * Track quiescing, copying and zeroing preparation actions.  When this
+        * counter hits zero the block is prepared and can be inserted into the
+        * btree.
+        */
+       atomic_t prepare_actions;
+
        int err;
        struct thin_c *tc;
        dm_block_t virt_block;
@@ -575,43 +581,41 @@ struct dm_thin_new_mapping {
        bio_end_io_t *saved_bi_end_io;
 };
 
-static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
+static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
 {
        struct pool *pool = m->tc->pool;
 
-       if (m->quiesced && m->prepared) {
+       if (atomic_dec_and_test(&m->prepare_actions)) {
                list_add_tail(&m->list, &pool->prepared_mappings);
                wake_worker(pool);
        }
 }
 
-static void copy_complete(int read_err, unsigned long write_err, void *context)
+static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
 {
        unsigned long flags;
-       struct dm_thin_new_mapping *m = context;
        struct pool *pool = m->tc->pool;
 
-       m->err = read_err || write_err ? -EIO : 0;
-
        spin_lock_irqsave(&pool->lock, flags);
-       m->prepared = true;
-       __maybe_add_mapping(m);
+       __complete_mapping_preparation(m);
        spin_unlock_irqrestore(&pool->lock, flags);
 }
 
+static void copy_complete(int read_err, unsigned long write_err, void *context)
+{
+       struct dm_thin_new_mapping *m = context;
+
+       m->err = read_err || write_err ? -EIO : 0;
+       complete_mapping_preparation(m);
+}
+
 static void overwrite_endio(struct bio *bio, int err)
 {
-       unsigned long flags;
        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
        struct dm_thin_new_mapping *m = h->overwrite_mapping;
-       struct pool *pool = m->tc->pool;
 
        m->err = err;
-
-       spin_lock_irqsave(&pool->lock, flags);
-       m->prepared = true;
-       __maybe_add_mapping(m);
-       spin_unlock_irqrestore(&pool->lock, flags);
+       complete_mapping_preparation(m);
 }
 
 /*----------------------------------------------------------------*/
@@ -821,10 +825,31 @@ static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
        return m;
 }
 
+/*
+ * Ask kcopyd to zero the sectors [@begin, @end) on the pool device.
+ *
+ * copy_complete() is registered as the completion callback with @m as
+ * its context.  If dm_kcopyd_zero() itself fails, copy_complete() is
+ * called directly with both error arguments set, so @m still sees the
+ * completion of this action (with an error recorded).
+ */
+static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
+                   sector_t begin, sector_t end)
+{
+       struct dm_io_region to = {
+               .bdev = tc->pool_dev->bdev,
+               .sector = begin,
+               .count = end - begin,
+       };
+       int r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
+
+       if (r < 0) {
+               DMERR_LIMIT("dm_kcopyd_zero() failed");
+               copy_complete(1, 1, m);
+       }
+}
+
+/*
+ * A partial copy also needs to zero the uncopied region.
+ */
 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
                          struct dm_dev *origin, dm_block_t data_origin,
                          dm_block_t data_dest,
-                         struct dm_bio_prison_cell *cell, struct bio *bio)
+                         struct dm_bio_prison_cell *cell, struct bio *bio,
+                         sector_t len)
 {
        int r;
        struct pool *pool = tc->pool;
@@ -835,8 +860,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
        m->data_block = data_dest;
        m->cell = cell;
 
+       /*
+        * quiesce action + copy action + an extra reference held for the
+        * duration of this function (we may need to inc later for a
+        * partial zero).
+        */
+       atomic_set(&m->prepare_actions, 3);
+
        if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
-               m->quiesced = true;
+               complete_mapping_preparation(m); /* already quiesced */
 
        /*
         * IO to pool_dev remaps to the pool target's data_dev.
@@ -857,20 +889,38 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 
                from.bdev = origin->bdev;
                from.sector = data_origin * pool->sectors_per_block;
-               from.count = pool->sectors_per_block;
+               from.count = len;
 
                to.bdev = tc->pool_dev->bdev;
                to.sector = data_dest * pool->sectors_per_block;
-               to.count = pool->sectors_per_block;
+               to.count = len;
 
                r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
                                   0, copy_complete, m);
                if (r < 0) {
-                       mempool_free(m, pool->mapping_pool);
                        DMERR_LIMIT("dm_kcopyd_copy() failed");
-                       cell_error(pool, cell);
+                       copy_complete(1, 1, m);
+
+                       /*
+                        * We allow the zero to be issued, to simplify the
+                        * error path.  Otherwise we'd need to start
+                        * worrying about decrementing the prepare_actions
+                        * counter.
+                        */
+               }
+
+               /*
+                * Do we need to zero a tail region?
+                */
+               if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
+                       atomic_inc(&m->prepare_actions);
+                       ll_zero(tc, m,
+                               data_dest * pool->sectors_per_block + len,
+                               (data_dest + 1) * pool->sectors_per_block);
                }
        }
+
+       complete_mapping_preparation(m); /* drop our ref */
 }
 
 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -878,15 +928,8 @@ static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
                                   struct dm_bio_prison_cell *cell, struct bio *bio)
 {
        schedule_copy(tc, virt_block, tc->pool_dev,
-                     data_origin, data_dest, cell, bio);
-}
-
-static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
-                                  dm_block_t data_dest,
-                                  struct dm_bio_prison_cell *cell, struct bio *bio)
-{
-       schedule_copy(tc, virt_block, tc->origin_dev,
-                     virt_block, data_dest, cell, bio);
+                     data_origin, data_dest, cell, bio,
+                     tc->pool->sectors_per_block);
 }
 
 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
@@ -896,8 +939,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
        struct pool *pool = tc->pool;
        struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
-       m->quiesced = true;
-       m->prepared = false;
+       atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
        m->tc = tc;
        m->virt_block = virt_block;
        m->data_block = data_block;
@@ -919,21 +961,33 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
                save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
                inc_all_io_entry(pool, bio);
                remap_and_issue(tc, bio, data_block);
-       } else {
-               int r;
-               struct dm_io_region to;
 
-               to.bdev = tc->pool_dev->bdev;
-               to.sector = data_block * pool->sectors_per_block;
-               to.count = pool->sectors_per_block;
+       } else
+               ll_zero(tc, m,
+                       data_block * pool->sectors_per_block,
+                       (data_block + 1) * pool->sectors_per_block);
+}
 
-               r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
-               if (r < 0) {
-                       mempool_free(m, pool->mapping_pool);
-                       DMERR_LIMIT("dm_kcopyd_zero() failed");
-                       cell_error(pool, cell);
-               }
-       }
+/*
+ * Fill the new data block @data_dest for @virt_block from tc->origin_dev,
+ * taking the origin's size (tc->origin_size, in sectors) into account:
+ *
+ *  - block ends at or before the end of the origin: copy a whole block;
+ *  - block starts inside the origin but runs past its end: copy only
+ *    the sectors that exist on the origin (schedule_copy() is given the
+ *    shortened length);
+ *  - block starts at or beyond the end of the origin: nothing to copy,
+ *    hand over to schedule_zero() instead.
+ */
+static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
+                                  dm_block_t data_dest,
+                                  struct dm_bio_prison_cell *cell, struct bio *bio)
+{
+       struct pool *pool = tc->pool;
+       sector_t virt_block_begin = virt_block * pool->sectors_per_block;
+       sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
+       sector_t len;
+
+       if (virt_block_end <= tc->origin_size) {
+               /* Entire block is backed by the origin. */
+               len = pool->sectors_per_block;
+       } else if (virt_block_begin < tc->origin_size) {
+               /* Only the head of the block is backed by the origin. */
+               len = tc->origin_size - virt_block_begin;
+       } else {
+               /* Block lies wholly past the end of the origin. */
+               schedule_zero(tc, virt_block, data_dest, cell, bio);
+               return;
+       }
+
+       schedule_copy(tc, virt_block, tc->origin_dev,
+                     virt_block, data_dest, cell, bio, len);
+}
 
 /*
@@ -1315,7 +1369,18 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
                        inc_all_io_entry(pool, bio);
                        cell_defer_no_holder(tc, cell);
 
-                       remap_to_origin_and_issue(tc, bio);
+                       if (bio_end_sector(bio) <= tc->origin_size)
+                               remap_to_origin_and_issue(tc, bio);
+
+                       else if (bio->bi_iter.bi_sector < tc->origin_size) {
+                               zero_fill_bio(bio);
+                               bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
+                               remap_to_origin_and_issue(tc, bio);
+
+                       } else {
+                               zero_fill_bio(bio);
+                               bio_endio(bio, 0);
+                       }
                } else
                        provision_block(tc, bio, block, cell);
                break;
@@ -3112,7 +3177,7 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
         */
        if (io_opt_sectors < pool->sectors_per_block ||
            do_div(io_opt_sectors, pool->sectors_per_block)) {
-               blk_limits_io_min(limits, 0);
+               blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
                blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
        }
 
@@ -3141,7 +3206,7 @@ static struct target_type pool_target = {
        .name = "thin-pool",
        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                    DM_TARGET_IMMUTABLE,
-       .version = {1, 12, 0},
+       .version = {1, 13, 0},
        .module = THIS_MODULE,
        .ctr = pool_ctr,
        .dtr = pool_dtr,
@@ -3361,8 +3426,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
                spin_lock_irqsave(&pool->lock, flags);
                list_for_each_entry_safe(m, tmp, &work, list) {
                        list_del(&m->list);
-                       m->quiesced = true;
-                       __maybe_add_mapping(m);
+                       __complete_mapping_preparation(m);
                }
                spin_unlock_irqrestore(&pool->lock, flags);
        }
@@ -3401,6 +3465,16 @@ static void thin_postsuspend(struct dm_target *ti)
        noflush_work(tc, do_noflush_stop);
 }
 
+/*
+ * preresume hook for the thin target.
+ *
+ * When the thin device has an origin device, re-read that device's
+ * size into tc->origin_size.  Always returns 0.
+ */
+static int thin_preresume(struct dm_target *ti)
+{
+       struct thin_c *tc = ti->private;
+       struct dm_dev *origin = tc->origin_dev;
+
+       if (origin)
+               tc->origin_size = get_dev_size(origin->bdev);
+
+       return 0;
+}
+
 /*
  * <nr mapped sectors> <highest mapped sector>
  */
@@ -3483,12 +3557,13 @@ static int thin_iterate_devices(struct dm_target *ti,
 
 static struct target_type thin_target = {
        .name = "thin",
-       .version = {1, 12, 0},
+       .version = {1, 13, 0},
        .module = THIS_MODULE,
        .ctr = thin_ctr,
        .dtr = thin_dtr,
        .map = thin_map,
        .end_io = thin_endio,
+       .preresume = thin_preresume,
        .presuspend = thin_presuspend,
        .postsuspend = thin_postsuspend,
        .status = thin_status,
index ed76126aac542e57d092a1d50a00135d8119c2ec..e81d2152fa684198899b1998efe3e5b19554b778 100644 (file)
@@ -72,7 +72,6 @@ int dm_table_any_busy_target(struct dm_table *t);
 unsigned dm_table_get_type(struct dm_table *t);
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
-bool dm_table_supports_discards(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 
index 6352bec8419abcc252399520c184cf10866dd86d..71f387ce8cbd7ec6c8f269f9bb99135c19408306 100644 (file)
@@ -744,6 +744,7 @@ static struct usb_device_id rtsx_usb_usb_ids[] = {
        { USB_DEVICE(0x0BDA, 0x0140) },
        { }
 };
+MODULE_DEVICE_TABLE(usb, rtsx_usb_usb_ids);
 
 static struct usb_driver rtsx_usb_driver = {
        .name                   = "rtsx_usb",
index 452782bffebcfd5977a2935155cbe2b1e143f302..ede41f05c392d499542dac45f9228f37364ecf30 100644 (file)
@@ -2028,8 +2028,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
                /* complete ongoing async transfer before issuing discard */
                if (card->host->areq)
                        mmc_blk_issue_rw_rq(mq, NULL);
-               if (req->cmd_flags & REQ_SECURE &&
-                       !(card->quirks & MMC_QUIRK_SEC_ERASE_TRIM_BROKEN))
+               if (req->cmd_flags & REQ_SECURE)
                        ret = mmc_blk_issue_secdiscard_rq(mq, req);
                else
                        ret = mmc_blk_issue_discard_rq(mq, req);
@@ -2432,6 +2431,8 @@ static int mmc_blk_probe(struct mmc_card *card)
        if (!(card->csd.cmdclass & CCC_BLOCK_READ))
                return -ENODEV;
 
+       mmc_fixup_device(card, blk_fixups);
+
        md = mmc_blk_alloc(card);
        if (IS_ERR(md))
                return PTR_ERR(md);
@@ -2446,7 +2447,6 @@ static int mmc_blk_probe(struct mmc_card *card)
                goto out;
 
        mmc_set_drvdata(card, md);
-       mmc_fixup_device(card, blk_fixups);
 
        if (mmc_add_disk(md))
                goto out;
index d2dbf02022bd05803cda6ad5b0e9b4d201721b32..8a1f1240e05802f326986afdf093cd75f303496c 100644 (file)
@@ -180,7 +180,6 @@ static int mmc_bus_resume(struct device *dev)
 #endif
 
 #ifdef CONFIG_PM_RUNTIME
-
 static int mmc_runtime_suspend(struct device *dev)
 {
        struct mmc_card *card = mmc_dev_to_card(dev);
@@ -196,17 +195,10 @@ static int mmc_runtime_resume(struct device *dev)
 
        return host->bus_ops->runtime_resume(host);
 }
-
-static int mmc_runtime_idle(struct device *dev)
-{
-       return 0;
-}
-
 #endif /* !CONFIG_PM_RUNTIME */
 
 static const struct dev_pm_ops mmc_bus_pm_ops = {
-       SET_RUNTIME_PM_OPS(mmc_runtime_suspend, mmc_runtime_resume,
-                       mmc_runtime_idle)
+       SET_RUNTIME_PM_OPS(mmc_runtime_suspend, mmc_runtime_resume, NULL)
        SET_SYSTEM_SLEEP_PM_OPS(mmc_bus_suspend, mmc_bus_resume)
 };
 
index 7dc0c85fdb6067b980b7cfd636e4c23a98d1803c..d03a080fb9cd35ff0b29bf50380f08fdd8818c23 100644 (file)
@@ -2102,7 +2102,8 @@ EXPORT_SYMBOL(mmc_can_sanitize);
 
 int mmc_can_secure_erase_trim(struct mmc_card *card)
 {
-       if (card->ext_csd.sec_feature_support & EXT_CSD_SEC_ER_EN)
+       if ((card->ext_csd.sec_feature_support & EXT_CSD_SEC_ER_EN) &&
+           !(card->quirks & MMC_QUIRK_SEC_ERASE_TRIM_BROKEN))
                return 1;
        return 0;
 }
index 793c6f7ddb049a735916eae87585b5cfb0f3b452..1eda8dd8c867228b5643e40f7655b513643eb26f 100644 (file)
@@ -324,13 +324,12 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
                }
        }
 
+       /*
+        * The EXT_CSD format is meant to be forward compatible. As long
+        * as CSD_STRUCTURE does not change, all values for EXT_CSD_REV
+        * are authorized, see JEDEC JESD84-B50 section B.8.
+        */
        card->ext_csd.rev = ext_csd[EXT_CSD_REV];
-       if (card->ext_csd.rev > 7) {
-               pr_err("%s: unrecognised EXT_CSD revision %d\n",
-                       mmc_hostname(card->host), card->ext_csd.rev);
-               err = -EINVAL;
-               goto out;
-       }
 
        card->ext_csd.raw_sectors[0] = ext_csd[EXT_CSD_SEC_CNT + 0];
        card->ext_csd.raw_sectors[1] = ext_csd[EXT_CSD_SEC_CNT + 1];
index 6c36fccaa1ec70d6e7f8dc5ac6396a77b90cde19..dd1d1e0fe32227edb4a184e3f8947ecbe27991b2 100644 (file)
@@ -91,7 +91,7 @@ void mmc_fixup_device(struct mmc_card *card, const struct mmc_fixup *table)
                    (f->cis_device == card->cis.device ||
                     f->cis_device == (u16) SDIO_ANY_ID) &&
                    rev >= f->rev_start && rev <= f->rev_end) {
-                       dev_dbg(&card->dev, "calling %pF\n", f->vendor_fixup);
+                       dev_dbg(&card->dev, "calling %pf\n", f->vendor_fixup);
                        f->vendor_fixup(card, f->data);
                }
        }
index 274ef00b44639149c3d43c24effffd400fa5ba69..48d0c93ba25a36590ac6e2c786c4ca923d35cde1 100644 (file)
@@ -184,6 +184,9 @@ int mmc_send_app_op_cond(struct mmc_host *host, u32 ocr, u32 *rocr)
                mmc_delay(10);
        }
 
+       if (!i)
+               pr_err("%s: card never left busy state\n", mmc_hostname(host));
+
        if (rocr && !mmc_host_is_spi(host))
                *rocr = cmd.resp[0];
 
index a5652548230a9457812a8badc69d7001a9df1f9c..45113582246427eae3f9c6d893cdd6e305a1631b 100644 (file)
@@ -290,6 +290,18 @@ config MMC_MOXART
          be found on some embedded hardware such as UC-7112-LX.
          If you have a controller with this interface, say Y here.
 
+config MMC_SDHCI_ST
+       tristate "SDHCI support on STMicroelectronics SoC"
+       depends on ARCH_STI
+       depends on MMC_SDHCI_PLTFM
+       select MMC_SDHCI_IO_ACCESSORS
+       help
+         This selects the Secure Digital Host Controller Interface in
+         STMicroelectronics SoCs.
+
+         If you have a controller with this interface, say Y or M here.
+         If unsure, say N.
+
 config MMC_OMAP
        tristate "TI OMAP Multimedia Card Interface support"
        depends on ARCH_OMAP
@@ -303,6 +315,7 @@ config MMC_OMAP
 
 config MMC_OMAP_HS
        tristate "TI OMAP High Speed Multimedia Card Interface support"
+       depends on HAS_DMA
        depends on ARCH_OMAP2PLUS || COMPILE_TEST
        help
          This selects the TI OMAP High Speed Multimedia card Interface.
@@ -343,7 +356,7 @@ config MMC_ATMELMCI
 
 config MMC_SDHCI_MSM
        tristate "Qualcomm SDHCI Controller Support"
-       depends on ARCH_QCOM
+       depends on ARCH_QCOM || (ARM && COMPILE_TEST)
        depends on MMC_SDHCI_PLTFM
        help
          This selects the Secure Digital Host Controller Interface (SDHCI)
@@ -440,6 +453,7 @@ config MMC_SPI
 config MMC_S3C
        tristate "Samsung S3C SD/MMC Card Interface support"
        depends on ARCH_S3C24XX
+       depends on S3C24XX_DMAC
        help
          This selects a driver for the MCI interface found in
           Samsung's S3C2410, S3C2412, S3C2440, S3C2442 CPUs.
@@ -477,15 +491,6 @@ config MMC_S3C_DMA
          working properly and needs to be debugged before this
          option is useful.
 
-config MMC_S3C_PIODMA
-       bool "Support for both PIO and DMA"
-       help
-         Compile both the PIO and DMA transfer routines into the
-         driver and let the platform select at run-time which one
-         is best.
-
-         See notes for the DMA option.
-
 endchoice
 
 config MMC_SDRICOH_CS
@@ -623,7 +628,7 @@ config MMC_DW_PCI
 
 config MMC_SH_MMCIF
        tristate "SuperH Internal MMCIF support"
-       depends on MMC_BLOCK
+       depends on MMC_BLOCK && HAS_DMA
        depends on SUPERH || ARCH_SHMOBILE || COMPILE_TEST
        help
          This selects the MMC Host Interface controller (MMCIF).
@@ -697,6 +702,7 @@ config MMC_WMT
 
 config MMC_USDHI6ROL0
        tristate "Renesas USDHI6ROL0 SD/SDIO Host Controller support"
+       depends on HAS_DMA
        help
          This selects support for the Renesas USDHI6ROL0 SD/SDIO
          Host Controller
index 7f81ddf1dd2c9f87f64beb3c54a91d1a8faa7169..f211eede8db58d48887d5619d96c4cca6bdbdbab 100644 (file)
@@ -68,6 +68,7 @@ obj-$(CONFIG_MMC_SDHCI_OF_HLWD)               += sdhci-of-hlwd.o
 obj-$(CONFIG_MMC_SDHCI_BCM_KONA)       += sdhci-bcm-kona.o
 obj-$(CONFIG_MMC_SDHCI_BCM2835)                += sdhci-bcm2835.o
 obj-$(CONFIG_MMC_SDHCI_MSM)            += sdhci-msm.o
+obj-$(CONFIG_MMC_SDHCI_ST)             += sdhci-st.o
 
 ifeq ($(CONFIG_CB710_DEBUG),y)
        CFLAGS-cb710-mmc        += -DDEBUG
index 1ac227c603b7e13a687b631eff298ce16222707d..8f216edbdf080d0c51e3eedcf83a75a1886fc9e3 100644 (file)
@@ -111,8 +111,7 @@ static const u8 tuning_blk_pattern_8bit[] = {
        0xff, 0x77, 0x77, 0xff, 0x77, 0xbb, 0xdd, 0xee,
 };
 
-static inline bool dw_mci_fifo_reset(struct dw_mci *host);
-static inline bool dw_mci_ctrl_all_reset(struct dw_mci *host);
+static bool dw_mci_reset(struct dw_mci *host);
 
 #if defined(CONFIG_DEBUG_FS)
 static int dw_mci_req_show(struct seq_file *s, void *v)
@@ -997,7 +996,8 @@ static int dw_mci_get_ro(struct mmc_host *mmc)
        int gpio_ro = mmc_gpio_get_ro(mmc);
 
        /* Use platform get_ro function, else try on board write protect */
-       if (slot->quirks & DW_MCI_SLOT_QUIRK_NO_WRITE_PROTECT)
+       if ((slot->quirks & DW_MCI_SLOT_QUIRK_NO_WRITE_PROTECT) ||
+                       (slot->host->quirks & DW_MCI_QUIRK_NO_WRITE_PROTECT))
                read_only = 0;
        else if (!IS_ERR_VALUE(gpio_ro))
                read_only = gpio_ro;
@@ -1235,7 +1235,7 @@ static int dw_mci_data_complete(struct dw_mci *host, struct mmc_data *data)
                 * After an error, there may be data lingering
                 * in the FIFO
                 */
-               dw_mci_fifo_reset(host);
+               dw_mci_reset(host);
        } else {
                data->bytes_xfered = data->blocks * data->blksz;
                data->error = 0;
@@ -1352,7 +1352,7 @@ static void dw_mci_tasklet_func(unsigned long priv)
 
                        /* CMD error in data command */
                        if (mrq->cmd->error && mrq->data)
-                               dw_mci_fifo_reset(host);
+                               dw_mci_reset(host);
 
                        host->cmd = NULL;
                        host->data = NULL;
@@ -1963,14 +1963,8 @@ static void dw_mci_work_routine_card(struct work_struct *work)
                        }
 
                        /* Power down slot */
-                       if (present == 0) {
-                               /* Clear down the FIFO */
-                               dw_mci_fifo_reset(host);
-#ifdef CONFIG_MMC_DW_IDMAC
-                               dw_mci_idmac_reset(host);
-#endif
-
-                       }
+                       if (present == 0)
+                               dw_mci_reset(host);
 
                        spin_unlock_bh(&host->lock);
 
@@ -2021,8 +2015,11 @@ static int dw_mci_of_get_slot_quirks(struct device *dev, u8 slot)
 
        /* get quirks */
        for (idx = 0; idx < ARRAY_SIZE(of_slot_quirks); idx++)
-               if (of_get_property(np, of_slot_quirks[idx].quirk, NULL))
+               if (of_get_property(np, of_slot_quirks[idx].quirk, NULL)) {
+                       dev_warn(dev, "Slot quirk %s is deprecated\n",
+                                       of_slot_quirks[idx].quirk);
                        quirks |= of_slot_quirks[idx].id;
+               }
 
        return quirks;
 }
@@ -2208,8 +2205,11 @@ static bool dw_mci_ctrl_reset(struct dw_mci *host, u32 reset)
        return false;
 }
 
-static inline bool dw_mci_fifo_reset(struct dw_mci *host)
+static bool dw_mci_reset(struct dw_mci *host)
 {
+       u32 flags = SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET;
+       bool ret = false;
+
        /*
         * Resetting generates a block interrupt, hence setting
         * the scatter-gather pointer to NULL.
@@ -2219,15 +2219,60 @@ static inline bool dw_mci_fifo_reset(struct dw_mci *host)
                host->sg = NULL;
        }
 
-       return dw_mci_ctrl_reset(host, SDMMC_CTRL_FIFO_RESET);
-}
+       if (host->use_dma)
+               flags |= SDMMC_CTRL_DMA_RESET;
 
-static inline bool dw_mci_ctrl_all_reset(struct dw_mci *host)
-{
-       return dw_mci_ctrl_reset(host,
-                                SDMMC_CTRL_FIFO_RESET |
-                                SDMMC_CTRL_RESET |
-                                SDMMC_CTRL_DMA_RESET);
+       if (dw_mci_ctrl_reset(host, flags)) {
+               /*
+                * In all cases we clear the RAWINTS register to clear any
+                * interrupts.
+                */
+               mci_writel(host, RINTSTS, 0xFFFFFFFF);
+
+               /* if using dma we wait for dma_req to clear */
+               if (host->use_dma) {
+                       unsigned long timeout = jiffies + msecs_to_jiffies(500);
+                       u32 status;
+                       do {
+                               status = mci_readl(host, STATUS);
+                               if (!(status & SDMMC_STATUS_DMA_REQ))
+                                       break;
+                               cpu_relax();
+                       } while (time_before(jiffies, timeout));
+
+                       if (status & SDMMC_STATUS_DMA_REQ) {
+                               dev_err(host->dev,
+                                       "%s: Timeout waiting for dma_req to "
+                                       "clear during reset\n", __func__);
+                               goto ciu_out;
+                       }
+
+                       /* when using DMA next we reset the fifo again */
+                       if (!dw_mci_ctrl_reset(host, SDMMC_CTRL_FIFO_RESET))
+                               goto ciu_out;
+               }
+       } else {
+               /* if the controller reset bit did clear, then set clock regs */
+               if (!(mci_readl(host, CTRL) & SDMMC_CTRL_RESET)) {
+                       dev_err(host->dev, "%s: fifo/dma reset bits didn't "
+                               "clear but ciu was reset, doing clock update\n",
+                               __func__);
+                       goto ciu_out;
+               }
+       }
+
+#if IS_ENABLED(CONFIG_MMC_DW_IDMAC)
+       /* It is also recommended that we reset and reprogram idmac */
+       dw_mci_idmac_reset(host);
+#endif
+
+       ret = true;
+
+ciu_out:
+       /* After a CTRL reset we need to have CIU set clock registers  */
+       mci_send_cmd(host->cur_slot, SDMMC_CMD_UPD_CLK, 0);
+
+       return ret;
 }
 
 #ifdef CONFIG_OF
@@ -2238,6 +2283,9 @@ static struct dw_mci_of_quirks {
        {
                .quirk  = "broken-cd",
                .id     = DW_MCI_QUIRK_BROKEN_CARD_DETECTION,
+       }, {
+               .quirk  = "disable-wp",
+               .id     = DW_MCI_QUIRK_NO_WRITE_PROTECT,
        },
 };
 
@@ -2425,7 +2473,7 @@ int dw_mci_probe(struct dw_mci *host)
        }
 
        /* Reset all blocks */
-       if (!dw_mci_ctrl_all_reset(host))
+       if (!dw_mci_ctrl_reset(host, SDMMC_CTRL_ALL_RESET_FLAGS))
                return -ENODEV;
 
        host->dma_ops = host->pdata->dma_ops;
@@ -2612,7 +2660,7 @@ int dw_mci_resume(struct dw_mci *host)
                }
        }
 
-       if (!dw_mci_ctrl_all_reset(host)) {
+       if (!dw_mci_ctrl_reset(host, SDMMC_CTRL_ALL_RESET_FLAGS)) {
                ret = -ENODEV;
                return ret;
        }
index 738fa241d05882258958107348a75b511c76fc08..08fd956d81f3bc687162e333cb612c37f06ad025 100644 (file)
 #define SDMMC_CMD_INDX(n)              ((n) & 0x1F)
 /* Status register defines */
 #define SDMMC_GET_FCNT(x)              (((x)>>17) & 0x1FFF)
+#define SDMMC_STATUS_DMA_REQ           BIT(31)
 /* FIFOTH register defines */
 #define SDMMC_SET_FIFOTH(m, r, t)      (((m) & 0x7) << 28 | \
                                         ((r) & 0xFFF) << 16 | \
 /* Card read threshold */
 #define SDMMC_SET_RD_THLD(v, x)                (((v) & 0x1FFF) << 16 | (x))
 
+/* All ctrl reset bits */
+#define SDMMC_CTRL_ALL_RESET_FLAGS \
+       (SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET | SDMMC_CTRL_DMA_RESET)
+
 /* Register access macros */
 #define mci_readl(dev, reg)                    \
        __raw_readl((dev)->regs + SDMMC_##reg)
index 7ad463e9741c0e4359b0c2c10a43f83ea3683995..e4d470704150c257bdf0e535cefa05f4bc41a059 100644 (file)
@@ -52,34 +52,53 @@ static unsigned int fmax = 515633;
  * struct variant_data - MMCI variant-specific quirks
  * @clkreg: default value for MCICLOCK register
  * @clkreg_enable: enable value for MMCICLOCK register
+ * @clkreg_8bit_bus_enable: enable value for 8 bit bus
+ * @clkreg_neg_edge_enable: enable value for inverted data/cmd output
  * @datalength_bits: number of bits in the MMCIDATALENGTH register
  * @fifosize: number of bytes that can be written when MMCI_TXFIFOEMPTY
  *           is asserted (likewise for RX)
  * @fifohalfsize: number of bytes that can be written when MCI_TXFIFOHALFEMPTY
  *               is asserted (likewise for RX)
+ * @data_cmd_enable: enable value for data commands.
  * @sdio: variant supports SDIO
  * @st_clkdiv: true if using a ST-specific clock divider algorithm
+ * @datactrl_mask_ddrmode: ddr mode mask in datactrl register.
  * @blksz_datactrl16: true if Block size is at b16..b30 position in datactrl register
+ * @blksz_datactrl4: true if Block size is at b4..b16 position in datactrl
+ *                  register
  * @pwrreg_powerup: power up value for MMCIPOWER register
+ * @f_max: maximum clk frequency supported by the controller.
  * @signal_direction: input/out direction of bus signals can be indicated
  * @pwrreg_clkgate: MMCIPOWER register must be used to gate the clock
  * @busy_detect: true if busy detection on dat0 is supported
  * @pwrreg_nopower: bits in MMCIPOWER don't control ext. power supply
+ * @explicit_mclk_control: enable explicit mclk control in driver.
+ * @qcom_fifo: enables qcom specific fifo pio read logic.
+ * @reversed_irq_handling: handle data irq before cmd irq.
  */
 struct variant_data {
        unsigned int            clkreg;
        unsigned int            clkreg_enable;
+       unsigned int            clkreg_8bit_bus_enable;
+       unsigned int            clkreg_neg_edge_enable;
        unsigned int            datalength_bits;
        unsigned int            fifosize;
        unsigned int            fifohalfsize;
+       unsigned int            data_cmd_enable;
+       unsigned int            datactrl_mask_ddrmode;
        bool                    sdio;
        bool                    st_clkdiv;
        bool                    blksz_datactrl16;
+       bool                    blksz_datactrl4;
        u32                     pwrreg_powerup;
+       u32                     f_max;
        bool                    signal_direction;
        bool                    pwrreg_clkgate;
        bool                    busy_detect;
        bool                    pwrreg_nopower;
+       bool                    explicit_mclk_control;
+       bool                    qcom_fifo;
+       bool                    reversed_irq_handling;
 };
 
 static struct variant_data variant_arm = {
@@ -87,6 +106,8 @@ static struct variant_data variant_arm = {
        .fifohalfsize           = 8 * 4,
        .datalength_bits        = 16,
        .pwrreg_powerup         = MCI_PWR_UP,
+       .f_max                  = 100000000,
+       .reversed_irq_handling  = true,
 };
 
 static struct variant_data variant_arm_extended_fifo = {
@@ -94,6 +115,7 @@ static struct variant_data variant_arm_extended_fifo = {
        .fifohalfsize           = 64 * 4,
        .datalength_bits        = 16,
        .pwrreg_powerup         = MCI_PWR_UP,
+       .f_max                  = 100000000,
 };
 
 static struct variant_data variant_arm_extended_fifo_hwfc = {
@@ -102,15 +124,18 @@ static struct variant_data variant_arm_extended_fifo_hwfc = {
        .clkreg_enable          = MCI_ARM_HWFCEN,
        .datalength_bits        = 16,
        .pwrreg_powerup         = MCI_PWR_UP,
+       .f_max                  = 100000000,
 };
 
 static struct variant_data variant_u300 = {
        .fifosize               = 16 * 4,
        .fifohalfsize           = 8 * 4,
        .clkreg_enable          = MCI_ST_U300_HWFCEN,
+       .clkreg_8bit_bus_enable = MCI_ST_8BIT_BUS,
        .datalength_bits        = 16,
        .sdio                   = true,
        .pwrreg_powerup         = MCI_PWR_ON,
+       .f_max                  = 100000000,
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
        .pwrreg_nopower         = true,
@@ -124,6 +149,7 @@ static struct variant_data variant_nomadik = {
        .sdio                   = true,
        .st_clkdiv              = true,
        .pwrreg_powerup         = MCI_PWR_ON,
+       .f_max                  = 100000000,
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
        .pwrreg_nopower         = true,
@@ -134,10 +160,13 @@ static struct variant_data variant_ux500 = {
        .fifohalfsize           = 8 * 4,
        .clkreg                 = MCI_CLK_ENABLE,
        .clkreg_enable          = MCI_ST_UX500_HWFCEN,
+       .clkreg_8bit_bus_enable = MCI_ST_8BIT_BUS,
+       .clkreg_neg_edge_enable = MCI_ST_UX500_NEG_EDGE,
        .datalength_bits        = 24,
        .sdio                   = true,
        .st_clkdiv              = true,
        .pwrreg_powerup         = MCI_PWR_ON,
+       .f_max                  = 100000000,
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
        .busy_detect            = true,
@@ -149,17 +178,38 @@ static struct variant_data variant_ux500v2 = {
        .fifohalfsize           = 8 * 4,
        .clkreg                 = MCI_CLK_ENABLE,
        .clkreg_enable          = MCI_ST_UX500_HWFCEN,
+       .clkreg_8bit_bus_enable = MCI_ST_8BIT_BUS,
+       .clkreg_neg_edge_enable = MCI_ST_UX500_NEG_EDGE,
+       .datactrl_mask_ddrmode  = MCI_ST_DPSM_DDRMODE,
        .datalength_bits        = 24,
        .sdio                   = true,
        .st_clkdiv              = true,
        .blksz_datactrl16       = true,
        .pwrreg_powerup         = MCI_PWR_ON,
+       .f_max                  = 100000000,
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
        .busy_detect            = true,
        .pwrreg_nopower         = true,
 };
 
+/*
+ * Variant description referenced by the "Qualcomm variants" entry in mmci_ids.
+ * It is the only variant here that sets explicit_mclk_control, qcom_fifo,
+ * blksz_datactrl4 and data_cmd_enable, and it raises f_max to 208 MHz.
+ */
+static struct variant_data variant_qcom = {
+       .fifosize               = 16 * 4,
+       .fifohalfsize           = 8 * 4,
+       .clkreg                 = MCI_CLK_ENABLE,
+       .clkreg_enable          = MCI_QCOM_CLK_FLOWENA |
+                                 MCI_QCOM_CLK_SELECT_IN_FBCLK,
+       .clkreg_8bit_bus_enable = MCI_QCOM_CLK_WIDEBUS_8,
+       .datactrl_mask_ddrmode  = MCI_QCOM_CLK_SELECT_IN_DDR_MODE,
+       .data_cmd_enable        = MCI_QCOM_CSPM_DATCMD,
+       .blksz_datactrl4        = true,
+       .datalength_bits        = 24,
+       .pwrreg_powerup         = MCI_PWR_UP,
+       .f_max                  = 208000000,
+       .explicit_mclk_control  = true,
+       .qcom_fifo              = true,
+};
+
 static int mmci_card_busy(struct mmc_host *mmc)
 {
        struct mmci_host *host = mmc_priv(mmc);
@@ -260,7 +310,9 @@ static void mmci_set_clkreg(struct mmci_host *host, unsigned int desired)
        host->cclk = 0;
 
        if (desired) {
-               if (desired >= host->mclk) {
+               if (variant->explicit_mclk_control) {
+                       host->cclk = host->mclk;
+               } else if (desired >= host->mclk) {
                        clk = MCI_CLK_BYPASS;
                        if (variant->st_clkdiv)
                                clk |= MCI_ST_UX500_NEG_EDGE;
@@ -299,11 +351,11 @@ static void mmci_set_clkreg(struct mmci_host *host, unsigned int desired)
        if (host->mmc->ios.bus_width == MMC_BUS_WIDTH_4)
                clk |= MCI_4BIT_BUS;
        if (host->mmc->ios.bus_width == MMC_BUS_WIDTH_8)
-               clk |= MCI_ST_8BIT_BUS;
+               clk |= variant->clkreg_8bit_bus_enable;
 
        if (host->mmc->ios.timing == MMC_TIMING_UHS_DDR50 ||
            host->mmc->ios.timing == MMC_TIMING_MMC_DDR52)
-               clk |= MCI_ST_UX500_NEG_EDGE;
+               clk |= variant->clkreg_neg_edge_enable;
 
        mmci_write_clkreg(host, clk);
 }
@@ -719,7 +771,7 @@ static void mmci_start_data(struct mmci_host *host, struct mmc_data *data)
        data->bytes_xfered = 0;
 
        clks = (unsigned long long)data->timeout_ns * host->cclk;
-       do_div(clks, 1000000000UL);
+       do_div(clks, NSEC_PER_SEC);
 
        timeout = data->timeout_clks + (unsigned int)clks;
 
@@ -732,6 +784,8 @@ static void mmci_start_data(struct mmci_host *host, struct mmc_data *data)
 
        if (variant->blksz_datactrl16)
                datactrl = MCI_DPSM_ENABLE | (data->blksz << 16);
+       else if (variant->blksz_datactrl4)
+               datactrl = MCI_DPSM_ENABLE | (data->blksz << 4);
        else
                datactrl = MCI_DPSM_ENABLE | blksz_bits << 4;
 
@@ -767,7 +821,7 @@ static void mmci_start_data(struct mmci_host *host, struct mmc_data *data)
 
        if (host->mmc->ios.timing == MMC_TIMING_UHS_DDR50 ||
            host->mmc->ios.timing == MMC_TIMING_MMC_DDR52)
-               datactrl |= MCI_ST_DPSM_DDRMODE;
+               datactrl |= variant->datactrl_mask_ddrmode;
 
        /*
         * Attempt to use DMA operation mode, if this
@@ -812,7 +866,7 @@ mmci_start_command(struct mmci_host *host, struct mmc_command *cmd, u32 c)
 
        if (readl(base + MMCICOMMAND) & MCI_CPSM_ENABLE) {
                writel(0, base + MMCICOMMAND);
-               udelay(1);
+               mmci_reg_delay(host);
        }
 
        c |= cmd->opcode | MCI_CPSM_ENABLE;
@@ -824,6 +878,9 @@ mmci_start_command(struct mmci_host *host, struct mmc_command *cmd, u32 c)
        if (/*interrupt*/0)
                c |= MCI_CPSM_INTERRUPT;
 
+       if (mmc_cmd_type(cmd) == MMC_CMD_ADTC)
+               c |= host->variant->data_cmd_enable;
+
        host->cmd = cmd;
 
        writel(cmd->arg, base + MMCIARGUMENT);
@@ -834,6 +891,10 @@ static void
 mmci_data_irq(struct mmci_host *host, struct mmc_data *data,
              unsigned int status)
 {
+       /* Make sure we have data to handle */
+       if (!data)
+               return;
+
        /* First check for errors */
        if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_STARTBITERR|
                      MCI_TXUNDERRUN|MCI_RXOVERRUN)) {
@@ -902,9 +963,17 @@ mmci_cmd_irq(struct mmci_host *host, struct mmc_command *cmd,
             unsigned int status)
 {
        void __iomem *base = host->base;
-       bool sbc = (cmd == host->mrq->sbc);
-       bool busy_resp = host->variant->busy_detect &&
-                       (cmd->flags & MMC_RSP_BUSY);
+       bool sbc, busy_resp;
+
+       if (!cmd)
+               return;
+
+       sbc = (cmd == host->mrq->sbc);
+       busy_resp = host->variant->busy_detect && (cmd->flags & MMC_RSP_BUSY);
+
+       if (!((status|host->busy_status) & (MCI_CMDCRCFAIL|MCI_CMDTIMEOUT|
+               MCI_CMDSENT|MCI_CMDRESPEND)))
+               return;
 
        /* Check if we need to wait for busy completion. */
        if (host->busy_status && (status & MCI_ST_CARDBUSY))
@@ -957,15 +1026,34 @@ mmci_cmd_irq(struct mmci_host *host, struct mmc_command *cmd,
        }
 }
 
+/*
+ * Default get_rx_fifocnt callback: returns @remain minus four times the value
+ * read from the MMCIFIFOCNT register. @status is accepted only to match the
+ * callback signature and is not used here.
+ */
+static int mmci_get_rx_fifocnt(struct mmci_host *host, u32 status, int remain)
+{
+       u32 fifocnt = readl(host->base + MMCIFIFOCNT);
+
+       return remain - (fifocnt << 2);
+}
+
+/*
+ * get_rx_fifocnt callback for the Qualcomm variant. The byte count is derived
+ * from the @status flags alone; the remaining-bytes argument @r is unused.
+ *
+ * Returns the variant's fifohalfsize when MCI_RXFIFOHALFFULL is set, 4 when
+ * only MCI_RXDATAAVLBL is set, and 0 otherwise.
+ */
+static int mmci_qcom_get_rx_fifocnt(struct mmci_host *host, u32 status, int r)
+{
+       int avail = 0;
+
+       /*
+        * on qcom SDCC4 only 8 words are used in each burst so only 8 addresses
+        * from the fifo range should be used
+        */
+       if (status & MCI_RXFIFOHALFFULL)
+               avail = host->variant->fifohalfsize;
+       else if (status & MCI_RXDATAAVLBL)
+               avail = 4;
+
+       return avail;
+}
+
 static int mmci_pio_read(struct mmci_host *host, char *buffer, unsigned int remain)
 {
        void __iomem *base = host->base;
        char *ptr = buffer;
-       u32 status;
+       u32 status = readl(host->base + MMCISTATUS);
        int host_remain = host->size;
 
        do {
-               int count = host_remain - (readl(base + MMCIFIFOCNT) << 2);
+               int count = host->get_rx_fifocnt(host, status, host_remain);
 
                if (count > remain)
                        count = remain;
@@ -1132,9 +1220,6 @@ static irqreturn_t mmci_irq(int irq, void *dev_id)
        spin_lock(&host->lock);
 
        do {
-               struct mmc_command *cmd;
-               struct mmc_data *data;
-
                status = readl(host->base + MMCISTATUS);
 
                if (host->singleirq) {
@@ -1154,16 +1239,13 @@ static irqreturn_t mmci_irq(int irq, void *dev_id)
 
                dev_dbg(mmc_dev(host->mmc), "irq0 (data+cmd) %08x\n", status);
 
-               cmd = host->cmd;
-               if ((status|host->busy_status) & (MCI_CMDCRCFAIL|MCI_CMDTIMEOUT|
-                       MCI_CMDSENT|MCI_CMDRESPEND) && cmd)
-                       mmci_cmd_irq(host, cmd, status);
-
-               data = host->data;
-               if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_STARTBITERR|
-                             MCI_TXUNDERRUN|MCI_RXOVERRUN|MCI_DATAEND|
-                             MCI_DATABLOCKEND) && data)
-                       mmci_data_irq(host, data, status);
+               if (host->variant->reversed_irq_handling) {
+                       mmci_data_irq(host, host->data, status);
+                       mmci_cmd_irq(host, host->cmd, status);
+               } else {
+                       mmci_cmd_irq(host, host->cmd, status);
+                       mmci_data_irq(host, host->data, status);
+               }
 
                /* Don't poll for busy completion in irq context. */
                if (host->busy_status)
@@ -1296,6 +1378,17 @@ static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
        if (!ios->clock && variant->pwrreg_clkgate)
                pwr &= ~MCI_PWR_ON;
 
+       if (host->variant->explicit_mclk_control &&
+           ios->clock != host->clock_cache) {
+               ret = clk_set_rate(host->clk, ios->clock);
+               if (ret < 0)
+                       dev_err(mmc_dev(host->mmc),
+                               "Error setting clock rate (%d)\n", ret);
+               else
+                       host->mclk = clk_get_rate(host->clk);
+       }
+       host->clock_cache = ios->clock;
+
        spin_lock_irqsave(&host->lock, flags);
 
        mmci_set_clkreg(host, ios->clock);
@@ -1443,6 +1536,11 @@ static int mmci_probe(struct amba_device *dev,
        if (ret)
                goto host_free;
 
+       if (variant->qcom_fifo)
+               host->get_rx_fifocnt = mmci_qcom_get_rx_fifocnt;
+       else
+               host->get_rx_fifocnt = mmci_get_rx_fifocnt;
+
        host->plat = plat;
        host->variant = variant;
        host->mclk = clk_get_rate(host->clk);
@@ -1451,8 +1549,8 @@ static int mmci_probe(struct amba_device *dev,
         * so we try to adjust the clock down to this,
         * (if possible).
         */
-       if (host->mclk > 100000000) {
-               ret = clk_set_rate(host->clk, 100000000);
+       if (host->mclk > variant->f_max) {
+               ret = clk_set_rate(host->clk, variant->f_max);
                if (ret < 0)
                        goto clk_disable;
                host->mclk = clk_get_rate(host->clk);
@@ -1471,9 +1569,12 @@ static int mmci_probe(struct amba_device *dev,
         * The ARM and ST versions of the block have slightly different
         * clock divider equations which means that the minimum divider
         * differs too.
+        * On Qualcomm-like controllers, get the nearest minimum clock to 100 kHz.
         */
        if (variant->st_clkdiv)
                mmc->f_min = DIV_ROUND_UP(host->mclk, 257);
+       else if (variant->explicit_mclk_control)
+               mmc->f_min = clk_round_rate(host->clk, 100000);
        else
                mmc->f_min = DIV_ROUND_UP(host->mclk, 512);
        /*
@@ -1483,9 +1584,14 @@ static int mmci_probe(struct amba_device *dev,
         * the block, of course.
         */
        if (mmc->f_max)
-               mmc->f_max = min(host->mclk, mmc->f_max);
+               mmc->f_max = variant->explicit_mclk_control ?
+                               min(variant->f_max, mmc->f_max) :
+                               min(host->mclk, mmc->f_max);
        else
-               mmc->f_max = min(host->mclk, fmax);
+               mmc->f_max = variant->explicit_mclk_control ?
+                               fmax : min(host->mclk, fmax);
+
+
        dev_dbg(mmc_dev(mmc), "clocking block at %u Hz\n", mmc->f_max);
 
        /* Get regulators and the supported OCR mask */
@@ -1752,6 +1858,12 @@ static struct amba_id mmci_ids[] = {
                .mask   = 0xf0ffffff,
                .data   = &variant_ux500v2,
        },
+       /* Qualcomm variants */
+       {
+               .id     = 0x00051180,
+               .mask   = 0x000fffff,
+               .data   = &variant_qcom,
+       },
        { 0, 0 },
 };
 
index 347d942d740bfd91d0e4aa9feef9ef1f1fd8aab1..a1f5e4f49e2a3367038268f5bfc7ed43961482e4 100644 (file)
 /* Modified PL180 on Versatile Express platform */
 #define MCI_ARM_HWFCEN         (1 << 12)
 
+/* Modified on Qualcomm Integrations */
+#define MCI_QCOM_CLK_WIDEBUS_8 (BIT(10) | BIT(11))
+#define MCI_QCOM_CLK_FLOWENA   BIT(12)
+#define MCI_QCOM_CLK_INVERTOUT BIT(13)
+
+/* select in latch data and command in */
+#define MCI_QCOM_CLK_SELECT_IN_FBCLK   BIT(15)
+#define MCI_QCOM_CLK_SELECT_IN_DDR_MODE        (BIT(14) | BIT(15))
+
 #define MMCIARGUMENT           0x008
 #define MMCICOMMAND            0x00c
 #define MCI_CPSM_RESPONSE      (1 << 6)
 #define MCI_ST_NIEN            (1 << 13)
 #define MCI_ST_CE_ATACMD       (1 << 14)
 
+/* Modified on Qualcomm Integrations */
+#define MCI_QCOM_CSPM_DATCMD           BIT(12)
+#define MCI_QCOM_CSPM_MCIABORT         BIT(13)
+#define MCI_QCOM_CSPM_CCSENABLE                BIT(14)
+#define MCI_QCOM_CSPM_CCSDISABLE       BIT(15)
+#define MCI_QCOM_CSPM_AUTO_CMD19       BIT(16)
+#define MCI_QCOM_CSPM_AUTO_CMD21       BIT(21)
+
 #define MMCIRESPCMD            0x010
 #define MMCIRESPONSE0          0x014
 #define MMCIRESPONSE1          0x018
@@ -191,6 +208,8 @@ struct mmci_host {
        spinlock_t              lock;
 
        unsigned int            mclk;
+       /* cached value of requested clk in set_ios */
+       unsigned int            clock_cache;
        unsigned int            cclk;
        u32                     pwr_reg;
        u32                     pwr_reg_add;
@@ -210,6 +229,7 @@ struct mmci_host {
        /* pio stuff */
        struct sg_mapping_iter  sg_miter;
        unsigned int            size;
+       int (*get_rx_fifocnt)(struct mmci_host *h, u32 status, int remain);
 
 #ifdef CONFIG_DMA_ENGINE
        /* DMA stuff */
index 74924a04026ea58d53e72855ae94dbd181ea7dab..b4b1efbf6c165c21aa1d8f1fe13e6bc9efb57f09 100644 (file)
@@ -13,7 +13,6 @@
  * warranty of any kind, whether express or implied.
  */
 
-#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
index babfea03ba8a0ce50b64fae4e0bcbcc96ef0e60e..140885a5a4e734bbcb3806e39e7601503e1cf599 100644 (file)
@@ -86,7 +86,8 @@ static int mxs_mmc_get_cd(struct mmc_host *mmc)
        if (ret >= 0)
                return ret;
 
-       present = !(readl(ssp->base + HW_SSP_STATUS(ssp)) &
+       present = mmc->caps & MMC_CAP_NEEDS_POLL ||
+               !(readl(ssp->base + HW_SSP_STATUS(ssp)) &
                        BM_SSP_STATUS_CARD_DETECT);
 
        if (mmc->caps2 & MMC_CAP2_CD_ACTIVE_HIGH)
index 6b7b75585926c44d2fc39fdda4f0d58e73ec392a..965672663ef066a3b2e58c5f1a2478c5becdfdda 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/timer.h>
 #include <linux/clk.h>
 #include <linux/of.h>
+#include <linux/of_irq.h>
 #include <linux/of_gpio.h>
 #include <linux/of_device.h>
 #include <linux/omap-dmaengine.h>
@@ -36,6 +37,7 @@
 #include <linux/mmc/core.h>
 #include <linux/mmc/mmc.h>
 #include <linux/io.h>
+#include <linux/irq.h>
 #include <linux/gpio.h>
 #include <linux/regulator/consumer.h>
 #include <linux/pinctrl/consumer.h>
@@ -54,6 +56,7 @@
 #define OMAP_HSMMC_RSP54       0x0118
 #define OMAP_HSMMC_RSP76       0x011C
 #define OMAP_HSMMC_DATA                0x0120
+#define OMAP_HSMMC_PSTATE      0x0124
 #define OMAP_HSMMC_HCTL                0x0128
 #define OMAP_HSMMC_SYSCTL      0x012C
 #define OMAP_HSMMC_STAT                0x0130
 #define BCE                    (1 << 1)
 #define FOUR_BIT               (1 << 1)
 #define HSPE                   (1 << 2)
+#define IWE                    (1 << 24)
 #define DDR                    (1 << 19)
+#define CLKEXTFREE             (1 << 16)
+#define CTPL                   (1 << 11)
 #define DW8                    (1 << 5)
 #define OD                     0x1
 #define STAT_CLEAR             0xFFFFFFFF
 #define SRD                    (1 << 26)
 #define SOFTRESET              (1 << 1)
 
+/* PSTATE */
+#define DLEV_DAT(x)            (1 << (20 + (x)))
+
 /* Interrupt masks for IE and ISE register */
 #define CC_EN                  (1 << 0)
 #define TC_EN                  (1 << 1)
 #define BWR_EN                 (1 << 4)
 #define BRR_EN                 (1 << 5)
+#define CIRQ_EN                        (1 << 8)
 #define ERR_EN                 (1 << 15)
 #define CTO_EN                 (1 << 16)
 #define CCRC_EN                        (1 << 17)
 #define VDD_3V0                        3000000         /* 3000000 uV */
 #define VDD_165_195            (ffs(MMC_VDD_165_195) - 1)
 
-#define AUTO_CMD23             (1 << 1)        /* Auto CMD23 support */
 /*
  * One controller can have multiple slots, like on some omap boards using
  * omap.c controller driver. Luckily this is not currently done on any known
@@ -194,6 +203,7 @@ struct omap_hsmmc_host {
        u32                     sysctl;
        u32                     capa;
        int                     irq;
+       int                     wake_irq;
        int                     use_dma, dma_ch;
        struct dma_chan         *tx_chan;
        struct dma_chan         *rx_chan;
@@ -206,6 +216,9 @@ struct omap_hsmmc_host {
        int                     req_in_progress;
        unsigned long           clk_rate;
        unsigned int            flags;
+#define AUTO_CMD23             (1 << 0)        /* Auto CMD23 support */
+#define HSMMC_SDIO_IRQ_ENABLED (1 << 1)        /* SDIO irq enabled */
+#define HSMMC_WAKE_IRQ_ENABLED (1 << 2)
        struct omap_hsmmc_next  next_data;
        struct  omap_mmc_platform_data  *pdata;
 };
@@ -510,27 +523,40 @@ static void omap_hsmmc_stop_clock(struct omap_hsmmc_host *host)
 static void omap_hsmmc_enable_irq(struct omap_hsmmc_host *host,
                                  struct mmc_command *cmd)
 {
-       unsigned int irq_mask;
+       u32 irq_mask = INT_EN_MASK;
+       unsigned long flags;
 
        if (host->use_dma)
-               irq_mask = INT_EN_MASK & ~(BRR_EN | BWR_EN);
-       else
-               irq_mask = INT_EN_MASK;
+               irq_mask &= ~(BRR_EN | BWR_EN);
 
        /* Disable timeout for erases */
        if (cmd->opcode == MMC_ERASE)
                irq_mask &= ~DTO_EN;
 
+       spin_lock_irqsave(&host->irq_lock, flags);
        OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
        OMAP_HSMMC_WRITE(host->base, ISE, irq_mask);
+
+       /* latch pending CIRQ, but don't signal MMC core */
+       if (host->flags & HSMMC_SDIO_IRQ_ENABLED)
+               irq_mask |= CIRQ_EN;
        OMAP_HSMMC_WRITE(host->base, IE, irq_mask);
+       spin_unlock_irqrestore(&host->irq_lock, flags);
 }
 
 static void omap_hsmmc_disable_irq(struct omap_hsmmc_host *host)
 {
-       OMAP_HSMMC_WRITE(host->base, ISE, 0);
-       OMAP_HSMMC_WRITE(host->base, IE, 0);
+       u32 irq_mask = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&host->irq_lock, flags);
+       /* no transfer running but need to keep cirq if enabled */
+       if (host->flags & HSMMC_SDIO_IRQ_ENABLED)
+               irq_mask |= CIRQ_EN;
+       OMAP_HSMMC_WRITE(host->base, ISE, irq_mask);
+       OMAP_HSMMC_WRITE(host->base, IE, irq_mask);
        OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
+       spin_unlock_irqrestore(&host->irq_lock, flags);
 }
 
 /* Calculate divisor for the given clock frequency */
@@ -667,6 +693,9 @@ static int omap_hsmmc_context_restore(struct omap_hsmmc_host *host)
                capa = VS18;
        }
 
+       if (host->mmc->caps & MMC_CAP_SDIO_IRQ)
+               hctl |= IWE;
+
        OMAP_HSMMC_WRITE(host->base, HCTL,
                        OMAP_HSMMC_READ(host->base, HCTL) | hctl);
 
@@ -681,7 +710,9 @@ static int omap_hsmmc_context_restore(struct omap_hsmmc_host *host)
                && time_before(jiffies, timeout))
                ;
 
-       omap_hsmmc_disable_irq(host);
+       OMAP_HSMMC_WRITE(host->base, ISE, 0);
+       OMAP_HSMMC_WRITE(host->base, IE, 0);
+       OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
 
        /* Do not initialize card-specific things if the power is off */
        if (host->power_mode == MMC_POWER_OFF)
@@ -1118,8 +1149,12 @@ static irqreturn_t omap_hsmmc_irq(int irq, void *dev_id)
        int status;
 
        status = OMAP_HSMMC_READ(host->base, STAT);
-       while (status & INT_EN_MASK && host->req_in_progress) {
-               omap_hsmmc_do_irq(host, status);
+       while (status & (INT_EN_MASK | CIRQ_EN)) {
+               if (host->req_in_progress)
+                       omap_hsmmc_do_irq(host, status);
+
+               if (status & CIRQ_EN)
+                       mmc_signal_sdio_irq(host->mmc);
 
                /* Flush posted write */
                status = OMAP_HSMMC_READ(host->base, STAT);
@@ -1128,6 +1163,22 @@ static irqreturn_t omap_hsmmc_irq(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
+/*
+ * Handler for the separate wake-up interrupt (host->wake_irq).
+ *
+ * Under host->irq_lock it disables the wake IRQ and clears
+ * HSMMC_WAKE_IRQ_ENABLED if that flag was set, then requests a runtime
+ * resume of the device. Always returns IRQ_HANDLED.
+ */
+static irqreturn_t omap_hsmmc_wake_irq(int irq, void *dev_id)
+{
+       struct omap_hsmmc_host *host = dev_id;
+
+       /* cirq is level triggered, disable to avoid infinite loop */
+       spin_lock(&host->irq_lock);
+       if (host->flags & HSMMC_WAKE_IRQ_ENABLED) {
+               /* the flag keeps the disable from being issued twice */
+               disable_irq_nosync(host->wake_irq);
+               host->flags &= ~HSMMC_WAKE_IRQ_ENABLED;
+       }
+       spin_unlock(&host->irq_lock);
+       pm_request_resume(host->dev); /* no use counter */
+
+       return IRQ_HANDLED;
+}
+
 static void set_sd_bus_power(struct omap_hsmmc_host *host)
 {
        unsigned long i;
@@ -1639,6 +1690,103 @@ static void omap_hsmmc_init_card(struct mmc_host *mmc, struct mmc_card *card)
                mmc_slot(host).init_card(card);
 }
 
+/*
+ * mmc_host_ops.enable_sdio_irq callback: turn the SDIO card interrupt (CIRQ)
+ * on or off.
+ *
+ * @mmc:    host whose private data is the omap_hsmmc_host
+ * @enable: non-zero to enable the card interrupt, zero to disable it
+ *
+ * Runs under host->irq_lock. Updates HSMMC_SDIO_IRQ_ENABLED in host->flags,
+ * the CTPL and CLKEXTFREE bits in CON, and CIRQ_EN in the interrupt mask.
+ */
+static void omap_hsmmc_enable_sdio_irq(struct mmc_host *mmc, int enable)
+{
+       struct omap_hsmmc_host *host = mmc_priv(mmc);
+       u32 irq_mask, con;
+       unsigned long flags;
+
+       spin_lock_irqsave(&host->irq_lock, flags);
+
+       con = OMAP_HSMMC_READ(host->base, CON);
+       /* start from the current ISE contents so other enabled bits are kept */
+       irq_mask = OMAP_HSMMC_READ(host->base, ISE);
+       if (enable) {
+               host->flags |= HSMMC_SDIO_IRQ_ENABLED;
+               irq_mask |= CIRQ_EN;
+               con |= CTPL | CLKEXTFREE;
+       } else {
+               host->flags &= ~HSMMC_SDIO_IRQ_ENABLED;
+               irq_mask &= ~CIRQ_EN;
+               con &= ~(CTPL | CLKEXTFREE);
+       }
+       OMAP_HSMMC_WRITE(host->base, CON, con);
+       /* IE is written unconditionally; ISE only in the cases below */
+       OMAP_HSMMC_WRITE(host->base, IE, irq_mask);
+
+       /*
+        * if enable, piggy back detection on current request
+        * but always disable immediately
+        */
+       if (!host->req_in_progress || !enable)
+               OMAP_HSMMC_WRITE(host->base, ISE, irq_mask);
+
+       /* flush posted write */
+       OMAP_HSMMC_READ(host->base, IE);
+
+       spin_unlock_irqrestore(&host->irq_lock, flags);
+}
+
+/*
+ * Request the wake-up interrupt in host->wake_irq and set IWE in HCTL.
+ *
+ * @host: controller instance; host->wake_irq holds the IRQ number to request
+ *
+ * Returns 0 on success. Returns -ENODEV without touching anything when the
+ * device has no of_node or no wake IRQ. On any later failure a negative errno
+ * is returned, a warning about falling back to polling is printed and
+ * host->wake_irq is reset to 0.
+ */
+static int omap_hsmmc_configure_wake_irq(struct omap_hsmmc_host *host)
+{
+       struct mmc_host *mmc = host->mmc;
+       int ret;
+
+       /*
+        * For omaps with wake-up path, wakeirq will be irq from pinctrl and
+        * for other omaps, wakeirq will be from GPIO (dat line remuxed to
+        * gpio). wakeirq is needed to detect sdio irq in runtime suspend state
+        * with functional clock disabled.
+        */
+       if (!host->dev->of_node || !host->wake_irq)
+               return -ENODEV;
+
+       /* Prevent auto-enabling of IRQ */
+       irq_set_status_flags(host->wake_irq, IRQ_NOAUTOEN);
+       ret = devm_request_irq(host->dev, host->wake_irq, omap_hsmmc_wake_irq,
+                              IRQF_TRIGGER_LOW | IRQF_ONESHOT,
+                              mmc_hostname(mmc), host);
+       if (ret) {
+               dev_err(mmc_dev(host->mmc), "Unable to request wake IRQ\n");
+               goto err;
+       }
+
+       /*
+        * Some omaps don't have wake-up path from deeper idle states
+        * and need to remux SDIO DAT1 to GPIO for wake-up from idle.
+        */
+       if (host->pdata->controller_flags & OMAP_HSMMC_SWAKEUP_MISSING) {
+               struct pinctrl *p = devm_pinctrl_get(host->dev);
+
+               /*
+                * devm_pinctrl_get() reports failure with an ERR_PTR value,
+                * which a plain NULL test does not catch; check for it before
+                * the handle is passed to the lookup and put calls below.
+                */
+               if (IS_ERR(p)) {
+                       ret = PTR_ERR(p);
+                       goto err_free_irq;
+               }
+               if (!p) {
+                       ret = -ENODEV;
+                       goto err_free_irq;
+               }
+
+               /* both states must exist; the handle itself is not kept */
+               if (IS_ERR(pinctrl_lookup_state(p, PINCTRL_STATE_DEFAULT))) {
+                       dev_info(host->dev, "missing default pinctrl state\n");
+                       devm_pinctrl_put(p);
+                       ret = -EINVAL;
+                       goto err_free_irq;
+               }
+
+               if (IS_ERR(pinctrl_lookup_state(p, PINCTRL_STATE_IDLE))) {
+                       dev_info(host->dev, "missing idle pinctrl state\n");
+                       devm_pinctrl_put(p);
+                       ret = -EINVAL;
+                       goto err_free_irq;
+               }
+               devm_pinctrl_put(p);
+       }
+
+       OMAP_HSMMC_WRITE(host->base, HCTL,
+                        OMAP_HSMMC_READ(host->base, HCTL) | IWE);
+       return 0;
+
+err_free_irq:
+       devm_free_irq(host->dev, host->wake_irq, host);
+err:
+       dev_warn(host->dev, "no SDIO IRQ support, falling back to polling\n");
+       host->wake_irq = 0;
+       return ret;
+}
+
 static void omap_hsmmc_conf_bus_power(struct omap_hsmmc_host *host)
 {
        u32 hctl, capa, value;
@@ -1691,7 +1839,7 @@ static const struct mmc_host_ops omap_hsmmc_ops = {
        .get_cd = omap_hsmmc_get_cd,
        .get_ro = omap_hsmmc_get_ro,
        .init_card = omap_hsmmc_init_card,
-       /* NYET -- enable_sdio_irq */
+       .enable_sdio_irq = omap_hsmmc_enable_sdio_irq,
 };
 
 #ifdef CONFIG_DEBUG_FS
@@ -1701,13 +1849,23 @@ static int omap_hsmmc_regs_show(struct seq_file *s, void *data)
        struct mmc_host *mmc = s->private;
        struct omap_hsmmc_host *host = mmc_priv(mmc);
 
-       seq_printf(s, "mmc%d:\n ctx_loss:\t%d\n\nregs:\n",
-                       mmc->index, host->context_loss);
+       seq_printf(s, "mmc%d:\n", mmc->index);
+       seq_printf(s, "sdio irq mode\t%s\n",
+                  (mmc->caps & MMC_CAP_SDIO_IRQ) ? "interrupt" : "polling");
 
-       pm_runtime_get_sync(host->dev);
+       if (mmc->caps & MMC_CAP_SDIO_IRQ) {
+               seq_printf(s, "sdio irq \t%s\n",
+                          (host->flags & HSMMC_SDIO_IRQ_ENABLED) ?  "enabled"
+                          : "disabled");
+       }
+       seq_printf(s, "ctx_loss:\t%d\n", host->context_loss);
 
+       pm_runtime_get_sync(host->dev);
+       seq_puts(s, "\nregs:\n");
        seq_printf(s, "CON:\t\t0x%08x\n",
                        OMAP_HSMMC_READ(host->base, CON));
+       seq_printf(s, "PSTATE:\t\t0x%08x\n",
+                  OMAP_HSMMC_READ(host->base, PSTATE));
        seq_printf(s, "HCTL:\t\t0x%08x\n",
                        OMAP_HSMMC_READ(host->base, HCTL));
        seq_printf(s, "SYSCTL:\t\t0x%08x\n",
@@ -1761,6 +1919,10 @@ static const struct omap_mmc_of_data omap3_pre_es3_mmc_of_data = {
 static const struct omap_mmc_of_data omap4_mmc_of_data = {
        .reg_offset = 0x100,
 };
+static const struct omap_mmc_of_data am33xx_mmc_of_data = {
+       .reg_offset = 0x100,
+       .controller_flags = OMAP_HSMMC_SWAKEUP_MISSING,
+};
 
 static const struct of_device_id omap_mmc_of_match[] = {
        {
@@ -1777,6 +1939,10 @@ static const struct of_device_id omap_mmc_of_match[] = {
                .compatible = "ti,omap4-hsmmc",
                .data = &omap4_mmc_of_data,
        },
+       {
+               .compatible = "ti,am33xx-hsmmc",
+               .data = &am33xx_mmc_of_data,
+       },
        {},
 };
 MODULE_DEVICE_TABLE(of, omap_mmc_of_match);
@@ -1850,7 +2016,6 @@ static int omap_hsmmc_probe(struct platform_device *pdev)
        const struct of_device_id *match;
        dma_cap_mask_t mask;
        unsigned tx_req, rx_req;
-       struct pinctrl *pinctrl;
        const struct omap_mmc_of_data *data;
        void __iomem *base;
 
@@ -1913,6 +2078,9 @@ static int omap_hsmmc_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, host);
 
+       if (pdev->dev.of_node)
+               host->wake_irq = irq_of_parse_and_map(pdev->dev.of_node, 1);
+
        mmc->ops        = &omap_hsmmc_ops;
 
        mmc->f_min = OMAP_MMC_MIN_CLOCK;
@@ -2061,10 +2229,17 @@ static int omap_hsmmc_probe(struct platform_device *pdev)
 
        omap_hsmmc_disable_irq(host);
 
-       pinctrl = devm_pinctrl_get_select_default(&pdev->dev);
-       if (IS_ERR(pinctrl))
-               dev_warn(&pdev->dev,
-                       "pins are not configured from the driver\n");
+       /*
+        * For now, only support SDIO interrupt if we have a separate
+        * wake-up interrupt configured from device tree. This is because
+        * the wake-up interrupt is needed for idle state and some
+        * platforms need special quirks. And we don't want to add new
+        * legacy mux platform init code callbacks any longer as we
+        * are moving to DT based booting anyways.
+        */
+       ret = omap_hsmmc_configure_wake_irq(host);
+       if (!ret)
+               mmc->caps |= MMC_CAP_SDIO_IRQ;
 
        omap_hsmmc_protect_card(host);
 
@@ -2170,11 +2345,18 @@ static int omap_hsmmc_suspend(struct device *dev)
        pm_runtime_get_sync(host->dev);
 
        if (!(host->mmc->pm_flags & MMC_PM_KEEP_POWER)) {
-               omap_hsmmc_disable_irq(host);
+               OMAP_HSMMC_WRITE(host->base, ISE, 0);
+               OMAP_HSMMC_WRITE(host->base, IE, 0);
+               OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
                OMAP_HSMMC_WRITE(host->base, HCTL,
                                OMAP_HSMMC_READ(host->base, HCTL) & ~SDBP);
        }
 
+       /* do not wake up due to sdio irq */
+       if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) &&
+           !(host->mmc->pm_flags & MMC_PM_WAKE_SDIO_IRQ))
+               disable_irq(host->wake_irq);
+
        if (host->dbclk)
                clk_disable_unprepare(host->dbclk);
 
@@ -2200,6 +2382,10 @@ static int omap_hsmmc_resume(struct device *dev)
 
        omap_hsmmc_protect_card(host);
 
+       if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) &&
+           !(host->mmc->pm_flags & MMC_PM_WAKE_SDIO_IRQ))
+               enable_irq(host->wake_irq);
+
        pm_runtime_mark_last_busy(host->dev);
        pm_runtime_put_autosuspend(host->dev);
        return 0;
@@ -2215,22 +2401,77 @@ static int omap_hsmmc_resume(struct device *dev)
 static int omap_hsmmc_runtime_suspend(struct device *dev)
 {
        struct omap_hsmmc_host *host;
+       unsigned long flags;
+       int ret = 0;
 
        host = platform_get_drvdata(to_platform_device(dev));
        omap_hsmmc_context_save(host);
        dev_dbg(dev, "disabled\n");
 
-       return 0;
+       spin_lock_irqsave(&host->irq_lock, flags);
+       if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) &&
+           (host->flags & HSMMC_SDIO_IRQ_ENABLED)) {
+               /* disable sdio irq handling to prevent race */
+               OMAP_HSMMC_WRITE(host->base, ISE, 0);
+               OMAP_HSMMC_WRITE(host->base, IE, 0);
+
+               if (!(OMAP_HSMMC_READ(host->base, PSTATE) & DLEV_DAT(1))) {
+                       /*
+                        * dat1 line low, pending sdio irq
+                        * race condition: possible irq handler running on
+                        * multi-core, abort
+                        */
+                       dev_dbg(dev, "pending sdio irq, abort suspend\n");
+                       OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
+                       OMAP_HSMMC_WRITE(host->base, ISE, CIRQ_EN);
+                       OMAP_HSMMC_WRITE(host->base, IE, CIRQ_EN);
+                       pm_runtime_mark_last_busy(dev);
+                       ret = -EBUSY;
+                       goto abort;
+               }
+
+               pinctrl_pm_select_idle_state(dev);
+
+               WARN_ON(host->flags & HSMMC_WAKE_IRQ_ENABLED);
+               enable_irq(host->wake_irq);
+               host->flags |= HSMMC_WAKE_IRQ_ENABLED;
+       } else {
+               pinctrl_pm_select_idle_state(dev);
+       }
+
+abort:
+       spin_unlock_irqrestore(&host->irq_lock, flags);
+       return ret;
 }
 
 static int omap_hsmmc_runtime_resume(struct device *dev)
 {
        struct omap_hsmmc_host *host;
+       unsigned long flags;
 
        host = platform_get_drvdata(to_platform_device(dev));
        omap_hsmmc_context_restore(host);
        dev_dbg(dev, "enabled\n");
 
+       spin_lock_irqsave(&host->irq_lock, flags);
+       if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) &&
+           (host->flags & HSMMC_SDIO_IRQ_ENABLED)) {
+               /* sdio irq flag can't change while in runtime suspend */
+               if (host->flags & HSMMC_WAKE_IRQ_ENABLED) {
+                       disable_irq_nosync(host->wake_irq);
+                       host->flags &= ~HSMMC_WAKE_IRQ_ENABLED;
+               }
+
+               pinctrl_pm_select_default_state(host->dev);
+
+               /* irq lost, if pinmux incorrect */
+               OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
+               OMAP_HSMMC_WRITE(host->base, ISE, CIRQ_EN);
+               OMAP_HSMMC_WRITE(host->base, IE, CIRQ_EN);
+       } else {
+               pinctrl_pm_select_default_state(host->dev);
+       }
+       spin_unlock_irqrestore(&host->irq_lock, flags);
        return 0;
 }
 
index f23782683a7c223dfa22db288cf308ef8c21ed1b..e5516a226362dc0c4e98c39215bfe2c55f87b992 100644 (file)
@@ -12,6 +12,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
 #include <linux/clk.h>
 #include <linux/mmc/host.h>
@@ -27,6 +28,7 @@
 #include <mach/dma.h>
 #include <mach/gpio-samsung.h>
 
+#include <linux/platform_data/dma-s3c24xx.h>
 #include <linux/platform_data/mmc-s3cmci.h>
 
 #include "s3cmci.h"
@@ -140,10 +142,6 @@ static const int dbgmap_debug = dbg_err | dbg_debug;
                dev_dbg(&host->pdev->dev, args);  \
        } while (0)
 
-static struct s3c2410_dma_client s3cmci_dma_client = {
-       .name           = "s3c-mci",
-};
-
 static void finalize_request(struct s3cmci_host *host);
 static void s3cmci_send_request(struct mmc_host *mmc);
 static void s3cmci_reset(struct s3cmci_host *host);
@@ -256,25 +254,8 @@ static inline bool s3cmci_host_usedma(struct s3cmci_host *host)
 {
 #ifdef CONFIG_MMC_S3C_PIO
        return false;
-#elif defined(CONFIG_MMC_S3C_DMA)
+#else /* CONFIG_MMC_S3C_DMA */
        return true;
-#else
-       return host->dodma;
-#endif
-}
-
-/**
- * s3cmci_host_canpio - return true if host has pio code available
- *
- * Return true if the driver has been compiled with the PIO support code
- * available.
- */
-static inline bool s3cmci_host_canpio(void)
-{
-#ifdef CONFIG_MMC_S3C_PIO
-       return true;
-#else
-       return false;
 #endif
 }
 
@@ -841,60 +822,24 @@ static irqreturn_t s3cmci_irq_cd(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
-static void s3cmci_dma_done_callback(struct s3c2410_dma_chan *dma_ch,
-                                    void *buf_id, int size,
-                                    enum s3c2410_dma_buffresult result)
+static void s3cmci_dma_done_callback(void *arg)
 {
-       struct s3cmci_host *host = buf_id;
+       struct s3cmci_host *host = arg;
        unsigned long iflags;
-       u32 mci_csta, mci_dsta, mci_fsta, mci_dcnt;
-
-       mci_csta = readl(host->base + S3C2410_SDICMDSTAT);
-       mci_dsta = readl(host->base + S3C2410_SDIDSTA);
-       mci_fsta = readl(host->base + S3C2410_SDIFSTA);
-       mci_dcnt = readl(host->base + S3C2410_SDIDCNT);
 
        BUG_ON(!host->mrq);
        BUG_ON(!host->mrq->data);
-       BUG_ON(!host->dmatogo);
 
        spin_lock_irqsave(&host->complete_lock, iflags);
 
-       if (result != S3C2410_RES_OK) {
-               dbg(host, dbg_fail, "DMA FAILED: csta=0x%08x dsta=0x%08x "
-                       "fsta=0x%08x dcnt:0x%08x result:0x%08x toGo:%u\n",
-                       mci_csta, mci_dsta, mci_fsta,
-                       mci_dcnt, result, host->dmatogo);
-
-               goto fail_request;
-       }
-
-       host->dmatogo--;
-       if (host->dmatogo) {
-               dbg(host, dbg_dma, "DMA DONE  Size:%i DSTA:[%08x] "
-                       "DCNT:[%08x] toGo:%u\n",
-                       size, mci_dsta, mci_dcnt, host->dmatogo);
-
-               goto out;
-       }
-
-       dbg(host, dbg_dma, "DMA FINISHED Size:%i DSTA:%08x DCNT:%08x\n",
-               size, mci_dsta, mci_dcnt);
+       dbg(host, dbg_dma, "DMA FINISHED\n");
 
        host->dma_complete = 1;
        host->complete_what = COMPLETION_FINALIZE;
 
-out:
        tasklet_schedule(&host->pio_tasklet);
        spin_unlock_irqrestore(&host->complete_lock, iflags);
-       return;
 
-fail_request:
-       host->mrq->data->error = -EINVAL;
-       host->complete_what = COMPLETION_FINALIZE;
-       clear_imask(host);
-
-       goto out;
 }
 
 static void finalize_request(struct s3cmci_host *host)
@@ -966,7 +911,7 @@ static void finalize_request(struct s3cmci_host *host)
         * DMA channel and the fifo to clear out any garbage. */
        if (mrq->data->error != 0) {
                if (s3cmci_host_usedma(host))
-                       s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_FLUSH);
+                       dmaengine_terminate_all(host->dma);
 
                if (host->is2440) {
                        /* Clear failure register and reset fifo. */
@@ -992,29 +937,6 @@ request_done:
        mmc_request_done(host->mmc, mrq);
 }
 
-static void s3cmci_dma_setup(struct s3cmci_host *host,
-                            enum dma_data_direction source)
-{
-       static enum dma_data_direction last_source = -1;
-       static int setup_ok;
-
-       if (last_source == source)
-               return;
-
-       last_source = source;
-
-       s3c2410_dma_devconfig(host->dma, source,
-                             host->mem->start + host->sdidata);
-
-       if (!setup_ok) {
-               s3c2410_dma_config(host->dma, 4);
-               s3c2410_dma_set_buffdone_fn(host->dma,
-                                           s3cmci_dma_done_callback);
-               s3c2410_dma_setflags(host->dma, S3C2410_DMAF_AUTOSTART);
-               setup_ok = 1;
-       }
-}
-
 static void s3cmci_send_command(struct s3cmci_host *host,
                                        struct mmc_command *cmd)
 {
@@ -1162,43 +1084,45 @@ static int s3cmci_prepare_pio(struct s3cmci_host *host, struct mmc_data *data)
 
 static int s3cmci_prepare_dma(struct s3cmci_host *host, struct mmc_data *data)
 {
-       int dma_len, i;
        int rw = data->flags & MMC_DATA_WRITE;
+       struct dma_async_tx_descriptor *desc;
+       struct dma_slave_config conf = {
+               .src_addr = host->mem->start + host->sdidata,
+               .dst_addr = host->mem->start + host->sdidata,
+               .src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES,
+               .dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES,
+       };
 
        BUG_ON((data->flags & BOTH_DIR) == BOTH_DIR);
 
-       s3cmci_dma_setup(host, rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-       s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_FLUSH);
-
-       dma_len = dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
-                            rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-
-       if (dma_len == 0)
-               return -ENOMEM;
-
-       host->dma_complete = 0;
-       host->dmatogo = dma_len;
-
-       for (i = 0; i < dma_len; i++) {
-               int res;
-
-               dbg(host, dbg_dma, "enqueue %i: %08x@%u\n", i,
-                   sg_dma_address(&data->sg[i]),
-                   sg_dma_len(&data->sg[i]));
+       /* Restore prescaler value */
+       writel(host->prescaler, host->base + S3C2410_SDIPRE);
 
-               res = s3c2410_dma_enqueue(host->dma, host,
-                                         sg_dma_address(&data->sg[i]),
-                                         sg_dma_len(&data->sg[i]));
+       if (!rw)
+               conf.direction = DMA_DEV_TO_MEM;
+       else
+               conf.direction = DMA_MEM_TO_DEV;
 
-               if (res) {
-                       s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_FLUSH);
-                       return -EBUSY;
-               }
-       }
+       dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
+                            rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 
-       s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_START);
+       dmaengine_slave_config(host->dma, &conf);
+       desc = dmaengine_prep_slave_sg(host->dma, data->sg, data->sg_len,
+               conf.direction,
+               DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
+       if (!desc)
+               goto unmap_exit;
+       desc->callback = s3cmci_dma_done_callback;
+       desc->callback_param = host;
+       dmaengine_submit(desc);
+       dma_async_issue_pending(host->dma);
 
        return 0;
+
+unmap_exit:
+       dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
+                            rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+       return -ENOMEM;
 }
 
 static void s3cmci_send_request(struct mmc_host *mmc)
@@ -1676,10 +1600,6 @@ static int s3cmci_probe(struct platform_device *pdev)
        host->complete_what     = COMPLETION_NONE;
        host->pio_active        = XFER_NONE;
 
-#ifdef CONFIG_MMC_S3C_PIODMA
-       host->dodma             = host->pdata->use_dma;
-#endif
-
        host->mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        if (!host->mem) {
                dev_err(&pdev->dev,
@@ -1765,17 +1685,17 @@ static int s3cmci_probe(struct platform_device *pdev)
        /* depending on the dma state, get a dma channel to use. */
 
        if (s3cmci_host_usedma(host)) {
-               host->dma = s3c2410_dma_request(DMACH_SDI, &s3cmci_dma_client,
-                                               host);
-               if (host->dma < 0) {
+               dma_cap_mask_t mask;
+
+               dma_cap_zero(mask);
+               dma_cap_set(DMA_SLAVE, mask);
+
+               host->dma = dma_request_slave_channel_compat(mask,
+                       s3c24xx_dma_filter, (void *)DMACH_SDI, &pdev->dev, "rx-tx");
+               if (!host->dma) {
                        dev_err(&pdev->dev, "cannot get DMA channel.\n");
-                       if (!s3cmci_host_canpio()) {
-                               ret = -EBUSY;
-                               goto probe_free_gpio_wp;
-                       } else {
-                               dev_warn(&pdev->dev, "falling back to PIO.\n");
-                               host->dodma = 0;
-                       }
+                       ret = -EBUSY;
+                       goto probe_free_gpio_wp;
                }
        }
 
@@ -1787,7 +1707,7 @@ static int s3cmci_probe(struct platform_device *pdev)
                goto probe_free_dma;
        }
 
-       ret = clk_enable(host->clk);
+       ret = clk_prepare_enable(host->clk);
        if (ret) {
                dev_err(&pdev->dev, "failed to enable clock source.\n");
                goto clk_free;
@@ -1816,7 +1736,7 @@ static int s3cmci_probe(struct platform_device *pdev)
        mmc->max_segs           = 128;
 
        dbg(host, dbg_debug,
-           "probe: mode:%s mapped mci_base:%p irq:%u irq_cd:%u dma:%u.\n",
+           "probe: mode:%s mapped mci_base:%p irq:%u irq_cd:%u dma:%p.\n",
            (host->is2440?"2440":""),
            host->base, host->irq, host->irq_cd, host->dma);
 
@@ -1845,14 +1765,14 @@ static int s3cmci_probe(struct platform_device *pdev)
        s3cmci_cpufreq_deregister(host);
 
  free_dmabuf:
-       clk_disable(host->clk);
+       clk_disable_unprepare(host->clk);
 
  clk_free:
        clk_put(host->clk);
 
  probe_free_dma:
        if (s3cmci_host_usedma(host))
-               s3c2410_dma_free(host->dma, &s3cmci_dma_client);
+               dma_release_channel(host->dma);
 
  probe_free_gpio_wp:
        if (!host->pdata->no_wprotect)
@@ -1897,7 +1817,7 @@ static void s3cmci_shutdown(struct platform_device *pdev)
        s3cmci_debugfs_remove(host);
        s3cmci_cpufreq_deregister(host);
        mmc_remove_host(mmc);
-       clk_disable(host->clk);
+       clk_disable_unprepare(host->clk);
 }
 
 static int s3cmci_remove(struct platform_device *pdev)
@@ -1914,7 +1834,7 @@ static int s3cmci_remove(struct platform_device *pdev)
        tasklet_disable(&host->pio_tasklet);
 
        if (s3cmci_host_usedma(host))
-               s3c2410_dma_free(host->dma, &s3cmci_dma_client);
+               dma_release_channel(host->dma);
 
        free_irq(host->irq, host);
 
index c76b53dbeb6179a3545bbb26b7aa87d50302477e..cc2e46cb5c643b07c791543852cd62f7eb29f1ad 100644 (file)
@@ -26,7 +26,7 @@ struct s3cmci_host {
        void __iomem            *base;
        int                     irq;
        int                     irq_cd;
-       int                     dma;
+       struct dma_chan         *dma;
 
        unsigned long           clk_rate;
        unsigned long           clk_div;
@@ -36,8 +36,6 @@ struct s3cmci_host {
        int                     is2440;
        unsigned                sdiimsk;
        unsigned                sdidata;
-       int                     dodma;
-       int                     dmatogo;
 
        bool                    irq_disabled;
        bool                    irq_enabled;
index 8ce3c28cb76ed503e9ea16605e059fa93780ea1a..8c5337002c5137ec5658cc79f929c25f378ab0f8 100644 (file)
@@ -124,9 +124,11 @@ static const struct sdhci_acpi_chip sdhci_acpi_chip_int = {
 
 static const struct sdhci_acpi_slot sdhci_acpi_slot_int_emmc = {
        .chip    = &sdhci_acpi_chip_int,
-       .caps    = MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE | MMC_CAP_HW_RESET,
+       .caps    = MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE |
+                  MMC_CAP_HW_RESET | MMC_CAP_1_8V_DDR,
        .caps2   = MMC_CAP2_HC_ERASE_SZ,
        .flags   = SDHCI_ACPI_RUNTIME_PM,
+       .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN,
 };
 
 static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sdio = {
index 40573a58486a165e659853b327b7c8e8f886863d..1a6661ed62050fda39cea0a57428abd2bb09e9e0 100644 (file)
@@ -16,7 +16,6 @@
 
 #include <linux/module.h>
 #include <linux/of_device.h>
-#include <linux/regulator/consumer.h>
 #include <linux/delay.h>
 #include <linux/mmc/mmc.h>
 #include <linux/slab.h>
index 52c42fcc284c4b3dca6cd889b562954d9bfc6352..c3a1debc9289860755a207cf1315ef544bdda21c 100644 (file)
@@ -103,6 +103,10 @@ static const struct sdhci_pci_fixes sdhci_cafe = {
                          SDHCI_QUIRK_BROKEN_TIMEOUT_VAL,
 };
 
+static const struct sdhci_pci_fixes sdhci_intel_qrk = {
+       .quirks         = SDHCI_QUIRK_NO_HISPD_BIT,
+};
+
 static int mrst_hc_probe_slot(struct sdhci_pci_slot *slot)
 {
        slot->host->mmc->caps |= MMC_CAP_8_BIT_DATA;
@@ -264,7 +268,7 @@ static void sdhci_pci_int_hw_reset(struct sdhci_host *host)
 static int byt_emmc_probe_slot(struct sdhci_pci_slot *slot)
 {
        slot->host->mmc->caps |= MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE |
-                                MMC_CAP_HW_RESET;
+                                MMC_CAP_HW_RESET | MMC_CAP_1_8V_DDR;
        slot->host->mmc->caps2 |= MMC_CAP2_HC_ERASE_SZ;
        slot->hw_reset = sdhci_pci_int_hw_reset;
        return 0;
@@ -279,6 +283,7 @@ static int byt_sdio_probe_slot(struct sdhci_pci_slot *slot)
 static const struct sdhci_pci_fixes sdhci_intel_byt_emmc = {
        .allow_runtime_pm = true,
        .probe_slot     = byt_emmc_probe_slot,
+       .quirks2        = SDHCI_QUIRK2_PRESET_VALUE_BROKEN,
 };
 
 static const struct sdhci_pci_fixes sdhci_intel_byt_sdio = {
@@ -751,6 +756,14 @@ static const struct pci_device_id pci_ids[] = {
                .driver_data    = (kernel_ulong_t)&sdhci_rtsx,
        },
 
+       {
+               .vendor         = PCI_VENDOR_ID_INTEL,
+               .device         = PCI_DEVICE_ID_INTEL_QRK_SD,
+               .subvendor      = PCI_ANY_ID,
+               .subdevice      = PCI_ANY_ID,
+               .driver_data    = (kernel_ulong_t)&sdhci_intel_qrk,
+       },
+
        {
                .vendor         = PCI_VENDOR_ID_INTEL,
                .device         = PCI_DEVICE_ID_INTEL_MRST_SD0,
@@ -1130,18 +1143,13 @@ static int sdhci_pci_suspend(struct device *dev)
                        goto err_pci_suspend;
        }
 
-       pci_save_state(pdev);
        if (pm_flags & MMC_PM_KEEP_POWER) {
-               if (pm_flags & MMC_PM_WAKE_SDIO_IRQ) {
-                       pci_pme_active(pdev, true);
-                       pci_enable_wake(pdev, PCI_D3hot, 1);
-               }
-               pci_set_power_state(pdev, PCI_D3hot);
-       } else {
-               pci_enable_wake(pdev, PCI_D3hot, 0);
-               pci_disable_device(pdev);
-               pci_set_power_state(pdev, PCI_D3hot);
-       }
+               if (pm_flags & MMC_PM_WAKE_SDIO_IRQ)
+                       device_init_wakeup(dev, true);
+               else
+                       device_init_wakeup(dev, false);
+       } else
+               device_init_wakeup(dev, false);
 
        return 0;
 
@@ -1162,12 +1170,6 @@ static int sdhci_pci_resume(struct device *dev)
        if (!chip)
                return 0;
 
-       pci_set_power_state(pdev, PCI_D0);
-       pci_restore_state(pdev);
-       ret = pci_enable_device(pdev);
-       if (ret)
-               return ret;
-
        if (chip->fixes && chip->fixes->resume) {
                ret = chip->fixes->resume(chip);
                if (ret)
index 6d718719659e48abbc63ff7376f26b0dd6696d3b..c101477ef3be28364b31b50c92c88dd72909e3d6 100644 (file)
@@ -17,6 +17,7 @@
 #define PCI_DEVICE_ID_INTEL_CLV_SDIO2  0x08fb
 #define PCI_DEVICE_ID_INTEL_CLV_EMMC0  0x08e5
 #define PCI_DEVICE_ID_INTEL_CLV_EMMC1  0x08e6
+#define PCI_DEVICE_ID_INTEL_QRK_SD     0x08A7
 
 /*
  * PCI registers
index f4f128947561266e63b47f74c3af458c22aca27b..6f842fb8e6b81834de13f0ffef3e19af2b9cb3a0 100644 (file)
@@ -288,15 +288,13 @@ static int sdhci_pxav3_probe(struct platform_device *pdev)
        int ret;
        struct clk *clk;
 
-       pxa = kzalloc(sizeof(struct sdhci_pxa), GFP_KERNEL);
+       pxa = devm_kzalloc(&pdev->dev, sizeof(struct sdhci_pxa), GFP_KERNEL);
        if (!pxa)
                return -ENOMEM;
 
        host = sdhci_pltfm_init(pdev, &sdhci_pxav3_pdata, 0);
-       if (IS_ERR(host)) {
-               kfree(pxa);
+       if (IS_ERR(host))
                return PTR_ERR(host);
-       }
 
        if (of_device_is_compatible(np, "marvell,armada-380-sdhci")) {
                ret = mv_conf_mbus_windows(pdev, mv_mbus_dram_info());
@@ -308,7 +306,7 @@ static int sdhci_pxav3_probe(struct platform_device *pdev)
        pltfm_host = sdhci_priv(host);
        pltfm_host->priv = pxa;
 
-       clk = clk_get(dev, NULL);
+       clk = devm_clk_get(dev, NULL);
        if (IS_ERR(clk)) {
                dev_err(dev, "failed to get io clock\n");
                ret = PTR_ERR(clk);
@@ -389,11 +387,9 @@ err_add_host:
        pm_runtime_put_sync(&pdev->dev);
        pm_runtime_disable(&pdev->dev);
        clk_disable_unprepare(clk);
-       clk_put(clk);
 err_clk_get:
 err_mbus_win:
        sdhci_pltfm_free(pdev);
-       kfree(pxa);
        return ret;
 }
 
@@ -401,17 +397,14 @@ static int sdhci_pxav3_remove(struct platform_device *pdev)
 {
        struct sdhci_host *host = platform_get_drvdata(pdev);
        struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
-       struct sdhci_pxa *pxa = pltfm_host->priv;
 
        pm_runtime_get_sync(&pdev->dev);
        sdhci_remove_host(host, 1);
        pm_runtime_disable(&pdev->dev);
 
        clk_disable_unprepare(pltfm_host->clk);
-       clk_put(pltfm_host->clk);
 
        sdhci_pltfm_free(pdev);
-       kfree(pxa);
 
        return 0;
 }
diff --git a/drivers/mmc/host/sdhci-st.c b/drivers/mmc/host/sdhci-st.c
new file mode 100644 (file)
index 0000000..328f348
--- /dev/null
@@ -0,0 +1,176 @@
+/*
+ * Support for SDHCI on STMicroelectronics SoCs
+ *
+ * Copyright (C) 2014 STMicroelectronics Ltd
+ * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
+ * Contributors: Peter Griffin <peter.griffin@linaro.org>
+ *
+ * Based on sdhci-cns3xxx.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/io.h>
+#include <linux/of.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/mmc/host.h>
+
+#include "sdhci-pltfm.h"
+
+/*
+ * 32-bit register read hook (installed as sdhci_st_ops.read_l).
+ *
+ * Every register is read with a relaxed MMIO access.  The value read from
+ * SDHCI_CAPABILITIES additionally has the SDHCI_CAN_VDD_300 bit cleared
+ * before it is returned; all other registers are passed through unchanged.
+ */
+static u32 sdhci_st_readl(struct sdhci_host *host, int reg)
+{
+       u32 val = readl_relaxed(host->ioaddr + reg);
+
+       /* Support 3.3V and 1.8V */
+       if (reg == SDHCI_CAPABILITIES)
+               val &= ~SDHCI_CAN_VDD_300;
+
+       return val;
+}
+
+/*
+ * Host callbacks.  Only read_l is implemented in this file
+ * (sdhci_st_readl); the remaining hooks point at helpers defined elsewhere.
+ */
+static const struct sdhci_ops sdhci_st_ops = {
+       .read_l         = sdhci_st_readl,
+       .reset          = sdhci_reset,
+       .set_clock      = sdhci_set_clock,
+       .set_bus_width  = sdhci_set_bus_width,
+       .get_max_clock  = sdhci_pltfm_clk_get_max_clock,
+};
+
+/* Platform data handed to sdhci_pltfm_init() by sdhci_st_probe(). */
+static const struct sdhci_pltfm_data sdhci_st_pdata = {
+       .quirks = SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN |
+                 SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC,
+       .ops    = &sdhci_st_ops,
+};
+
+
+/*
+ * Probe one controller instance.
+ *
+ * Looks up the "mmc" clock, creates the sdhci platform host, parses the
+ * MMC device-tree properties, enables the clock and registers the host.
+ *
+ * Returns 0 on success or a negative errno.  Once sdhci_pltfm_init() has
+ * succeeded, every failure path releases the host with sdhci_pltfm_free().
+ * The clock reference comes from devm_clk_get(), so only the enable (not
+ * the get) has to be undone by hand.
+ */
+static int sdhci_st_probe(struct platform_device *pdev)
+{
+       struct sdhci_host *host;
+       struct sdhci_pltfm_host *pltfm_host;
+       struct clk *clk;
+       int ret = 0;
+       u16 host_version;
+
+       clk = devm_clk_get(&pdev->dev, "mmc");
+       if (IS_ERR(clk)) {
+               dev_err(&pdev->dev, "Peripheral clk not found\n");
+               return PTR_ERR(clk);
+       }
+
+       host = sdhci_pltfm_init(pdev, &sdhci_st_pdata, 0);
+       if (IS_ERR(host)) {
+               dev_err(&pdev->dev, "Failed sdhci_pltfm_init\n");
+               return PTR_ERR(host);
+       }
+
+       ret = mmc_of_parse(host->mmc);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed mmc_of_parse\n");
+               /* The host is already allocated: free it, don't just return. */
+               goto err_free;
+       }
+
+       /* Enabling a clock can fail; do not register a host without it. */
+       ret = clk_prepare_enable(clk);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to enable clk\n");
+               goto err_free;
+       }
+
+       pltfm_host = sdhci_priv(host);
+       pltfm_host->clk = clk;
+
+       ret = sdhci_add_host(host);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed sdhci_add_host\n");
+               goto err_out;
+       }
+
+       platform_set_drvdata(pdev, host);
+
+       host_version = readw_relaxed((host->ioaddr + SDHCI_HOST_VERSION));
+
+       dev_info(&pdev->dev, "SDHCI ST Initialised: Host Version: 0x%x Vendor Version 0x%x\n",
+               ((host_version & SDHCI_SPEC_VER_MASK) >> SDHCI_SPEC_VER_SHIFT),
+               ((host_version & SDHCI_VENDOR_VER_MASK) >>
+               SDHCI_VENDOR_VER_SHIFT));
+
+       return 0;
+
+err_out:
+       clk_disable_unprepare(clk);
+err_free:
+       sdhci_pltfm_free(pdev);
+
+       return ret;
+}
+
+/*
+ * Remove one controller instance.
+ *
+ * The host is unregistered while its clock is still running and the clock
+ * is gated only afterwards, so the teardown never runs against an
+ * unclocked controller.  Returns whatever sdhci_pltfm_unregister() returns.
+ */
+static int sdhci_st_remove(struct platform_device *pdev)
+{
+       struct sdhci_host *host = platform_get_drvdata(pdev);
+       struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+       /*
+        * Fetch the clock up front: pltfm_host lives inside the host, which
+        * is presumably released by sdhci_pltfm_unregister() - TODO confirm.
+        */
+       struct clk *clk = pltfm_host->clk;
+       int ret;
+
+       ret = sdhci_pltfm_unregister(pdev);
+
+       clk_disable_unprepare(clk);
+
+       return ret;
+}
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * System-sleep suspend: suspend the SDHCI host and, only if that
+ * succeeded, gate the controller clock.  Returns 0 or the error from
+ * sdhci_suspend_host() (in which case the clock is left running).
+ */
+static int sdhci_st_suspend(struct device *dev)
+{
+       struct sdhci_host *host = dev_get_drvdata(dev);
+       struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+       int err;
+
+       err = sdhci_suspend_host(host);
+       if (err)
+               return err;
+
+       clk_disable_unprepare(pltfm_host->clk);
+
+       return 0;
+}
+
+/*
+ * System-sleep resume: ungate the controller clock, then resume the SDHCI
+ * host.  Returns 0 on success, the clk_prepare_enable() error if the clock
+ * could not be enabled (the host is then left suspended), or the error
+ * from sdhci_resume_host().
+ */
+static int sdhci_st_resume(struct device *dev)
+{
+       struct sdhci_host *host = dev_get_drvdata(dev);
+       struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+       int ret;
+
+       /* Do not resume the host without a running clock. */
+       ret = clk_prepare_enable(pltfm_host->clk);
+       if (ret)
+               return ret;
+
+       return sdhci_resume_host(host);
+}
+#endif
+
+/* System-sleep PM callbacks, referenced from sdhci_st_driver.driver.pm. */
+static SIMPLE_DEV_PM_OPS(sdhci_st_pmops, sdhci_st_suspend, sdhci_st_resume);
+
+/* Device-tree compatible strings this driver binds to. */
+static const struct of_device_id st_sdhci_match[] = {
+       { .compatible = "st,sdhci" },
+       { /* sentinel */ },
+};
+MODULE_DEVICE_TABLE(of, st_sdhci_match);
+
+/* Platform driver glue: named "sdhci-st", DT matching via st_sdhci_match. */
+static struct platform_driver sdhci_st_driver = {
+       .driver = {
+               .name           = "sdhci-st",
+               .of_match_table = of_match_ptr(st_sdhci_match),
+               .pm             = &sdhci_st_pmops,
+       },
+       .probe  = sdhci_st_probe,
+       .remove = sdhci_st_remove,
+};
+
+module_platform_driver(sdhci_st_driver);
+
+MODULE_DESCRIPTION("SDHCI driver for STMicroelectronics SoCs");
+MODULE_AUTHOR("Giuseppe Cavallaro <peppe.cavallaro@st.com>");
+MODULE_LICENSE("GPL v2");
+/* Must be "platform:" + sdhci_st_driver.driver.name to match the modalias. */
+MODULE_ALIAS("platform:sdhci-st");
index d93a063a36f37bc5d25c6f6c1b3beba5c8d544a9..33100d10d17685b732279dc5c2d42b3f8950ee97 100644 (file)
@@ -26,8 +26,6 @@
 #include <linux/mmc/host.h>
 #include <linux/mmc/slot-gpio.h>
 
-#include <asm/gpio.h>
-
 #include "sdhci-pltfm.h"
 
 /* Tegra SDHOST controller vendor register definitions */
index 47055f3f01b8580e01ff147232d106bc14db3667..37b2a9ae52eff16cd44649f42fb4822ff05c89db 100644 (file)
@@ -1223,8 +1223,16 @@ EXPORT_SYMBOL_GPL(sdhci_set_clock);
 static void sdhci_set_power(struct sdhci_host *host, unsigned char mode,
                            unsigned short vdd)
 {
+       struct mmc_host *mmc = host->mmc;
        u8 pwr = 0;
 
+       if (!IS_ERR(mmc->supply.vmmc)) {
+               spin_unlock_irq(&host->lock);
+               mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, vdd);
+               spin_lock_irq(&host->lock);
+               return;
+       }
+
        if (mode != MMC_POWER_OFF) {
                switch (1 << vdd) {
                case MMC_VDD_165_195:
@@ -1283,12 +1291,6 @@ static void sdhci_set_power(struct sdhci_host *host, unsigned char mode,
                if (host->quirks & SDHCI_QUIRK_DELAY_AFTER_POWER)
                        mdelay(10);
        }
-
-       if (host->vmmc) {
-               spin_unlock_irq(&host->lock);
-               mmc_regulator_set_ocr(host->mmc, host->vmmc, vdd);
-               spin_lock_irq(&host->lock);
-       }
 }
 
 /*****************************************************************************\
@@ -1440,13 +1442,15 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios)
 {
        unsigned long flags;
        u8 ctrl;
+       struct mmc_host *mmc = host->mmc;
 
        spin_lock_irqsave(&host->lock, flags);
 
        if (host->flags & SDHCI_DEVICE_DEAD) {
                spin_unlock_irqrestore(&host->lock, flags);
-               if (host->vmmc && ios->power_mode == MMC_POWER_OFF)
-                       mmc_regulator_set_ocr(host->mmc, host->vmmc, 0);
+               if (!IS_ERR(mmc->supply.vmmc) &&
+                   ios->power_mode == MMC_POWER_OFF)
+                       mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, 0);
                return;
        }
 
@@ -1530,7 +1534,6 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios)
                        host->ops->set_clock(host, host->clock);
                }
 
-
                /* Reset SD Clock Enable */
                clk = sdhci_readw(host, SDHCI_CLOCK_CONTROL);
                clk &= ~SDHCI_CLOCK_CARD_EN;
@@ -1707,6 +1710,7 @@ static void sdhci_enable_sdio_irq(struct mmc_host *mmc, int enable)
 static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host,
                                                struct mmc_ios *ios)
 {
+       struct mmc_host *mmc = host->mmc;
        u16 ctrl;
        int ret;
 
@@ -1725,11 +1729,12 @@ static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host,
                ctrl &= ~SDHCI_CTRL_VDD_180;
                sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2);
 
-               if (host->vqmmc) {
-                       ret = regulator_set_voltage(host->vqmmc, 2700000, 3600000);
+               if (!IS_ERR(mmc->supply.vqmmc)) {
+                       ret = regulator_set_voltage(mmc->supply.vqmmc, 2700000,
+                                                   3600000);
                        if (ret) {
                                pr_warning("%s: Switching to 3.3V signalling voltage "
-                                               " failed\n", mmc_hostname(host->mmc));
+                                               " failed\n", mmc_hostname(mmc));
                                return -EIO;
                        }
                }
@@ -1742,16 +1747,16 @@ static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host,
                        return 0;
 
                pr_warning("%s: 3.3V regulator output did not became stable\n",
-                               mmc_hostname(host->mmc));
+                               mmc_hostname(mmc));
 
                return -EAGAIN;
        case MMC_SIGNAL_VOLTAGE_180:
-               if (host->vqmmc) {
-                       ret = regulator_set_voltage(host->vqmmc,
+               if (!IS_ERR(mmc->supply.vqmmc)) {
+                       ret = regulator_set_voltage(mmc->supply.vqmmc,
                                        1700000, 1950000);
                        if (ret) {
                                pr_warning("%s: Switching to 1.8V signalling voltage "
-                                               " failed\n", mmc_hostname(host->mmc));
+                                               " failed\n", mmc_hostname(mmc));
                                return -EIO;
                        }
                }
@@ -1763,24 +1768,22 @@ static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host,
                ctrl |= SDHCI_CTRL_VDD_180;
                sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2);
 
-               /* Wait for 5ms */
-               usleep_range(5000, 5500);
-
                /* 1.8V regulator output should be stable within 5 ms */
                ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2);
                if (ctrl & SDHCI_CTRL_VDD_180)
                        return 0;
 
                pr_warning("%s: 1.8V regulator output did not became stable\n",
-                               mmc_hostname(host->mmc));
+                               mmc_hostname(mmc));
 
                return -EAGAIN;
        case MMC_SIGNAL_VOLTAGE_120:
-               if (host->vqmmc) {
-                       ret = regulator_set_voltage(host->vqmmc, 1100000, 1300000);
+               if (!IS_ERR(mmc->supply.vqmmc)) {
+                       ret = regulator_set_voltage(mmc->supply.vqmmc, 1100000,
+                                                   1300000);
                        if (ret) {
                                pr_warning("%s: Switching to 1.2V signalling voltage "
-                                               " failed\n", mmc_hostname(host->mmc));
+                                               " failed\n", mmc_hostname(mmc));
                                return -EIO;
                        }
                }
@@ -2643,7 +2646,6 @@ static void sdhci_runtime_pm_bus_off(struct sdhci_host *host)
 int sdhci_runtime_suspend_host(struct sdhci_host *host)
 {
        unsigned long flags;
-       int ret = 0;
 
        /* Disable tuning since we are suspending */
        if (host->flags & SDHCI_USING_RETUNING_TIMER) {
@@ -2663,14 +2665,14 @@ int sdhci_runtime_suspend_host(struct sdhci_host *host)
        host->runtime_suspended = true;
        spin_unlock_irqrestore(&host->lock, flags);
 
-       return ret;
+       return 0;
 }
 EXPORT_SYMBOL_GPL(sdhci_runtime_suspend_host);
 
 int sdhci_runtime_resume_host(struct sdhci_host *host)
 {
        unsigned long flags;
-       int ret = 0, host_flags = host->flags;
+       int host_flags = host->flags;
 
        if (host_flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA)) {
                if (host->ops->enable_dma)
@@ -2709,7 +2711,7 @@ int sdhci_runtime_resume_host(struct sdhci_host *host)
 
        spin_unlock_irqrestore(&host->lock, flags);
 
-       return ret;
+       return 0;
 }
 EXPORT_SYMBOL_GPL(sdhci_runtime_resume_host);
 
@@ -2820,12 +2822,12 @@ int sdhci_add_host(struct sdhci_host *host)
                 * (128) and potentially one alignment transfer for
                 * each of those entries.
                 */
-               host->adma_desc = dma_alloc_coherent(mmc_dev(host->mmc),
+               host->adma_desc = dma_alloc_coherent(mmc_dev(mmc),
                                                     ADMA_SIZE, &host->adma_addr,
                                                     GFP_KERNEL);
                host->align_buffer = kmalloc(128 * 4, GFP_KERNEL);
                if (!host->adma_desc || !host->align_buffer) {
-                       dma_free_coherent(mmc_dev(host->mmc), ADMA_SIZE,
+                       dma_free_coherent(mmc_dev(mmc), ADMA_SIZE,
                                          host->adma_desc, host->adma_addr);
                        kfree(host->align_buffer);
                        pr_warning("%s: Unable to allocate ADMA "
@@ -2838,7 +2840,7 @@ int sdhci_add_host(struct sdhci_host *host)
                        pr_warning("%s: unable to allocate aligned ADMA descriptor\n",
                                   mmc_hostname(mmc));
                        host->flags &= ~SDHCI_USE_ADMA;
-                       dma_free_coherent(mmc_dev(host->mmc), ADMA_SIZE,
+                       dma_free_coherent(mmc_dev(mmc), ADMA_SIZE,
                                          host->adma_desc, host->adma_addr);
                        kfree(host->align_buffer);
                        host->adma_desc = NULL;
@@ -2853,7 +2855,7 @@ int sdhci_add_host(struct sdhci_host *host)
         */
        if (!(host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA))) {
                host->dma_mask = DMA_BIT_MASK(64);
-               mmc_dev(host->mmc)->dma_mask = &host->dma_mask;
+               mmc_dev(mmc)->dma_mask = &host->dma_mask;
        }
 
        if (host->version >= SDHCI_SPEC_300)
@@ -2959,28 +2961,25 @@ int sdhci_add_host(struct sdhci_host *host)
                mmc->caps |= MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED;
 
        if ((host->quirks & SDHCI_QUIRK_BROKEN_CARD_DETECTION) &&
-           !(host->mmc->caps & MMC_CAP_NONREMOVABLE))
+           !(mmc->caps & MMC_CAP_NONREMOVABLE))
                mmc->caps |= MMC_CAP_NEEDS_POLL;
 
+       /* If there are external regulators, get them */
+       if (mmc_regulator_get_supply(mmc) == -EPROBE_DEFER)
+               return -EPROBE_DEFER;
+
        /* If vqmmc regulator and no 1.8V signalling, then there's no UHS */
-       host->vqmmc = regulator_get_optional(mmc_dev(mmc), "vqmmc");
-       if (IS_ERR_OR_NULL(host->vqmmc)) {
-               if (PTR_ERR(host->vqmmc) < 0) {
-                       pr_info("%s: no vqmmc regulator found\n",
-                               mmc_hostname(mmc));
-                       host->vqmmc = NULL;
-               }
-       } else {
-               ret = regulator_enable(host->vqmmc);
-               if (!regulator_is_supported_voltage(host->vqmmc, 1700000,
-                       1950000))
+       if (!IS_ERR(mmc->supply.vqmmc)) {
+               ret = regulator_enable(mmc->supply.vqmmc);
+               if (!regulator_is_supported_voltage(mmc->supply.vqmmc, 1700000,
+                                                   1950000))
                        caps[1] &= ~(SDHCI_SUPPORT_SDR104 |
                                        SDHCI_SUPPORT_SDR50 |
                                        SDHCI_SUPPORT_DDR50);
                if (ret) {
                        pr_warn("%s: Failed to enable vqmmc regulator: %d\n",
                                mmc_hostname(mmc), ret);
-                       host->vqmmc = NULL;
+                       mmc->supply.vqmmc = ERR_PTR(-EINVAL);
                }
        }
 
@@ -3041,34 +3040,6 @@ int sdhci_add_host(struct sdhci_host *host)
 
        ocr_avail = 0;
 
-       host->vmmc = regulator_get_optional(mmc_dev(mmc), "vmmc");
-       if (IS_ERR_OR_NULL(host->vmmc)) {
-               if (PTR_ERR(host->vmmc) < 0) {
-                       pr_info("%s: no vmmc regulator found\n",
-                               mmc_hostname(mmc));
-                       host->vmmc = NULL;
-               }
-       }
-
-#ifdef CONFIG_REGULATOR
-       /*
-        * Voltage range check makes sense only if regulator reports
-        * any voltage value.
-        */
-       if (host->vmmc && regulator_get_voltage(host->vmmc) > 0) {
-               ret = regulator_is_supported_voltage(host->vmmc, 2700000,
-                       3600000);
-               if ((ret <= 0) || (!(caps[0] & SDHCI_CAN_VDD_330)))
-                       caps[0] &= ~SDHCI_CAN_VDD_330;
-               if ((ret <= 0) || (!(caps[0] & SDHCI_CAN_VDD_300)))
-                       caps[0] &= ~SDHCI_CAN_VDD_300;
-               ret = regulator_is_supported_voltage(host->vmmc, 1700000,
-                       1950000);
-               if ((ret <= 0) || (!(caps[0] & SDHCI_CAN_VDD_180)))
-                       caps[0] &= ~SDHCI_CAN_VDD_180;
-       }
-#endif /* CONFIG_REGULATOR */
-
        /*
         * According to SD Host Controller spec v3.00, if the Host System
         * can afford more than 150mA, Host Driver should set XPC to 1. Also
@@ -3077,8 +3048,8 @@ int sdhci_add_host(struct sdhci_host *host)
         * value.
         */
        max_current_caps = sdhci_readl(host, SDHCI_MAX_CURRENT);
-       if (!max_current_caps && host->vmmc) {
-               u32 curr = regulator_get_current_limit(host->vmmc);
+       if (!max_current_caps && !IS_ERR(mmc->supply.vmmc)) {
+               u32 curr = regulator_get_current_limit(mmc->supply.vmmc);
                if (curr > 0) {
 
                        /* convert to SDHCI_MAX_CURRENT format */
@@ -3118,8 +3089,12 @@ int sdhci_add_host(struct sdhci_host *host)
                                   SDHCI_MAX_CURRENT_MULTIPLIER;
        }
 
+       /* If OCR set by external regulators, use it instead */
+       if (mmc->ocr_avail)
+               ocr_avail = mmc->ocr_avail;
+
        if (host->ocr_mask)
-               ocr_avail = host->ocr_mask;
+               ocr_avail &= host->ocr_mask;
 
        mmc->ocr_avail = ocr_avail;
        mmc->ocr_avail_sdio = ocr_avail;
@@ -3273,6 +3248,7 @@ EXPORT_SYMBOL_GPL(sdhci_add_host);
 
 void sdhci_remove_host(struct sdhci_host *host, int dead)
 {
+       struct mmc_host *mmc = host->mmc;
        unsigned long flags;
 
        if (dead) {
@@ -3282,7 +3258,7 @@ void sdhci_remove_host(struct sdhci_host *host, int dead)
 
                if (host->mrq) {
                        pr_err("%s: Controller removed during "
-                               " transfer!\n", mmc_hostname(host->mmc));
+                               " transfer!\n", mmc_hostname(mmc));
 
                        host->mrq->cmd->error = -ENOMEDIUM;
                        tasklet_schedule(&host->finish_tasklet);
@@ -3293,7 +3269,7 @@ void sdhci_remove_host(struct sdhci_host *host, int dead)
 
        sdhci_disable_card_detection(host);
 
-       mmc_remove_host(host->mmc);
+       mmc_remove_host(mmc);
 
 #ifdef SDHCI_USE_LEDS_CLASS
        led_classdev_unregister(&host->led);
@@ -3310,18 +3286,14 @@ void sdhci_remove_host(struct sdhci_host *host, int dead)
 
        tasklet_kill(&host->finish_tasklet);
 
-       if (host->vmmc) {
-               regulator_disable(host->vmmc);
-               regulator_put(host->vmmc);
-       }
+       if (!IS_ERR(mmc->supply.vmmc))
+               regulator_disable(mmc->supply.vmmc);
 
-       if (host->vqmmc) {
-               regulator_disable(host->vqmmc);
-               regulator_put(host->vqmmc);
-       }
+       if (!IS_ERR(mmc->supply.vqmmc))
+               regulator_disable(mmc->supply.vqmmc);
 
        if (host->adma_desc)
-               dma_free_coherent(mmc_dev(host->mmc), ADMA_SIZE,
+               dma_free_coherent(mmc_dev(mmc), ADMA_SIZE,
                                  host->adma_desc, host->adma_addr);
        kfree(host->align_buffer);
 
index 656fbba4c4223f275dffba85ebbc29ffb4574174..d11708c815d721ba7dd9626ff21a523331ac4392 100644 (file)
@@ -386,7 +386,7 @@ sh_mmcif_request_dma_one(struct sh_mmcif_host *host,
                         struct sh_mmcif_plat_data *pdata,
                         enum dma_transfer_direction direction)
 {
-       struct dma_slave_config cfg;
+       struct dma_slave_config cfg = { 0, };
        struct dma_chan *chan;
        unsigned int slave_id;
        struct resource *res;
@@ -417,8 +417,15 @@ sh_mmcif_request_dma_one(struct sh_mmcif_host *host,
        /* In the OF case the driver will get the slave ID from the DT */
        cfg.slave_id = slave_id;
        cfg.direction = direction;
-       cfg.dst_addr = res->start + MMCIF_CE_DATA;
-       cfg.src_addr = 0;
+
+       if (direction == DMA_DEV_TO_MEM) {
+               cfg.src_addr = res->start + MMCIF_CE_DATA;
+               cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
+       } else {
+               cfg.dst_addr = res->start + MMCIF_CE_DATA;
+               cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
+       }
+
        ret = dmaengine_slave_config(chan, &cfg);
        if (ret < 0) {
                dma_release_channel(chan);
@@ -1378,26 +1385,19 @@ static int sh_mmcif_probe(struct platform_device *pdev)
                dev_err(&pdev->dev, "Get irq error\n");
                return -ENXIO;
        }
+
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       if (!res) {
-               dev_err(&pdev->dev, "platform_get_resource error.\n");
-               return -ENXIO;
-       }
-       reg = ioremap(res->start, resource_size(res));
-       if (!reg) {
-               dev_err(&pdev->dev, "ioremap error.\n");
-               return -ENOMEM;
-       }
+       reg = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(reg))
+               return PTR_ERR(reg);
 
        mmc = mmc_alloc_host(sizeof(struct sh_mmcif_host), &pdev->dev);
-       if (!mmc) {
-               ret = -ENOMEM;
-               goto ealloch;
-       }
+       if (!mmc)
+               return -ENOMEM;
 
        ret = mmc_of_parse(mmc);
        if (ret < 0)
-               goto eofparse;
+               goto err_host;
 
        host            = mmc_priv(mmc);
        host->mmc       = mmc;
@@ -1427,19 +1427,19 @@ static int sh_mmcif_probe(struct platform_device *pdev)
        pm_runtime_enable(&pdev->dev);
        host->power = false;
 
-       host->hclk = clk_get(&pdev->dev, NULL);
+       host->hclk = devm_clk_get(&pdev->dev, NULL);
        if (IS_ERR(host->hclk)) {
                ret = PTR_ERR(host->hclk);
                dev_err(&pdev->dev, "cannot get clock: %d\n", ret);
-               goto eclkget;
+               goto err_pm;
        }
        ret = sh_mmcif_clk_update(host);
        if (ret < 0)
-               goto eclkupdate;
+               goto err_pm;
 
        ret = pm_runtime_resume(&pdev->dev);
        if (ret < 0)
-               goto eresume;
+               goto err_clk;
 
        INIT_DELAYED_WORK(&host->timeout_work, mmcif_timeout_work);
 
@@ -1447,65 +1447,55 @@ static int sh_mmcif_probe(struct platform_device *pdev)
        sh_mmcif_writel(host->addr, MMCIF_CE_INT_MASK, MASK_ALL);
 
        name = irq[1] < 0 ? dev_name(&pdev->dev) : "sh_mmc:error";
-       ret = request_threaded_irq(irq[0], sh_mmcif_intr, sh_mmcif_irqt, 0, name, host);
+       ret = devm_request_threaded_irq(&pdev->dev, irq[0], sh_mmcif_intr,
+                                       sh_mmcif_irqt, 0, name, host);
        if (ret) {
                dev_err(&pdev->dev, "request_irq error (%s)\n", name);
-               goto ereqirq0;
+               goto err_clk;
        }
        if (irq[1] >= 0) {
-               ret = request_threaded_irq(irq[1], sh_mmcif_intr, sh_mmcif_irqt,
-                                          0, "sh_mmc:int", host);
+               ret = devm_request_threaded_irq(&pdev->dev, irq[1],
+                                               sh_mmcif_intr, sh_mmcif_irqt,
+                                               0, "sh_mmc:int", host);
                if (ret) {
                        dev_err(&pdev->dev, "request_irq error (sh_mmc:int)\n");
-                       goto ereqirq1;
+                       goto err_clk;
                }
        }
 
        if (pd && pd->use_cd_gpio) {
                ret = mmc_gpio_request_cd(mmc, pd->cd_gpio, 0);
                if (ret < 0)
-                       goto erqcd;
+                       goto err_clk;
        }
 
        mutex_init(&host->thread_lock);
 
-       clk_disable_unprepare(host->hclk);
        ret = mmc_add_host(mmc);
        if (ret < 0)
-               goto emmcaddh;
+               goto err_clk;
 
        dev_pm_qos_expose_latency_limit(&pdev->dev, 100);
 
-       dev_info(&pdev->dev, "driver version %s\n", DRIVER_VERSION);
-       dev_dbg(&pdev->dev, "chip ver H'%04x\n",
-               sh_mmcif_readl(host->addr, MMCIF_CE_VERSION) & 0x0000ffff);
+       dev_info(&pdev->dev, "Chip version 0x%04x, clock rate %luMHz\n",
+                sh_mmcif_readl(host->addr, MMCIF_CE_VERSION) & 0xffff,
+                clk_get_rate(host->hclk) / 1000000UL);
+
+       clk_disable_unprepare(host->hclk);
        return ret;
 
-emmcaddh:
-erqcd:
-       if (irq[1] >= 0)
-               free_irq(irq[1], host);
-ereqirq1:
-       free_irq(irq[0], host);
-ereqirq0:
-       pm_runtime_suspend(&pdev->dev);
-eresume:
+err_clk:
        clk_disable_unprepare(host->hclk);
-eclkupdate:
-       clk_put(host->hclk);
-eclkget:
+err_pm:
        pm_runtime_disable(&pdev->dev);
-eofparse:
+err_host:
        mmc_free_host(mmc);
-ealloch:
-       iounmap(reg);
        return ret;
 }
 
 static int sh_mmcif_remove(struct platform_device *pdev)
 {
        struct sh_mmcif_host *host = platform_get_drvdata(pdev);
-       int irq[2];
 
        host->dying = true;
        clk_prepare_enable(host->hclk);
@@ -1523,16 +1513,6 @@ static int sh_mmcif_remove(struct platform_device *pdev)
         */
        cancel_delayed_work_sync(&host->timeout_work);
 
-       if (host->addr)
-               iounmap(host->addr);
-
-       irq[0] = platform_get_irq(pdev, 0);
-       irq[1] = platform_get_irq(pdev, 1);
-
-       free_irq(irq[0], host);
-       if (irq[1] >= 0)
-               free_irq(irq[1], host);
-
        clk_disable_unprepare(host->hclk);
        mmc_free_host(host->mmc);
        pm_runtime_put_sync(&pdev->dev);
index 03e7b280cb4c71addd6d10fe52989a3206dd6c18..eb8f1d5c34b157f51c01210a273bf2c81257ac2d 100644 (file)
@@ -294,6 +294,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat
                        cfg.slave_id = pdata->dma->slave_id_tx;
                cfg.direction = DMA_MEM_TO_DEV;
                cfg.dst_addr = res->start + (CTL_SD_DATA_PORT << host->pdata->bus_shift);
+               cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES;
                cfg.src_addr = 0;
                ret = dmaengine_slave_config(host->chan_tx, &cfg);
                if (ret < 0)
@@ -312,6 +313,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat
                        cfg.slave_id = pdata->dma->slave_id_rx;
                cfg.direction = DMA_DEV_TO_MEM;
                cfg.src_addr = cfg.dst_addr;
+               cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES;
                cfg.dst_addr = 0;
                ret = dmaengine_slave_config(host->chan_rx, &cfg);
                if (ret < 0)
index 282891a8e451e77040faaf26e5267e70595cc4ba..54181b4f6e9eab58cdb057e6d3d85b92ee2cadf0 100644 (file)
@@ -72,7 +72,6 @@
 #define BM_SPI_CS                      0x20
 #define BM_SD_POWER                    0x40
 #define BM_SOFT_RESET                  0x80
-#define BM_ONEBIT_MASK                 0xFD
 
 /* SDMMC_BLKLEN bit fields */
 #define BLKL_CRCERR_ABORT              0x0800
 #define STS2_DATARSP_BUSY              0x20
 #define STS2_DIS_FORCECLK              0x80
 
+/* SDMMC_EXTCTRL bit fields */
+#define EXT_EIGHTBIT                   0x04
 
 /* MMC/SD DMA Controller Registers */
 #define SDDMA_GCR                      0x100
@@ -672,7 +673,7 @@ static void wmt_mci_request(struct mmc_host *mmc, struct mmc_request *req)
 static void wmt_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 {
        struct wmt_mci_priv *priv;
-       u32 reg_tmp;
+       u32 busmode, extctrl;
 
        priv = mmc_priv(mmc);
 
@@ -687,28 +688,26 @@ static void wmt_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
        if (ios->clock != 0)
                clk_set_rate(priv->clk_sdmmc, ios->clock);
 
+       busmode = readb(priv->sdmmc_base + SDMMC_BUSMODE);
+       extctrl = readb(priv->sdmmc_base + SDMMC_EXTCTRL);
+
+       busmode &= ~(BM_EIGHTBIT_MODE | BM_FOURBIT_MODE);
+       extctrl &= ~EXT_EIGHTBIT;
+
        switch (ios->bus_width) {
        case MMC_BUS_WIDTH_8:
-               reg_tmp = readb(priv->sdmmc_base + SDMMC_EXTCTRL);
-               writeb(reg_tmp | 0x04, priv->sdmmc_base + SDMMC_EXTCTRL);
+               busmode |= BM_EIGHTBIT_MODE;
+               extctrl |= EXT_EIGHTBIT;
                break;
        case MMC_BUS_WIDTH_4:
-               reg_tmp = readb(priv->sdmmc_base + SDMMC_BUSMODE);
-               writeb(reg_tmp | BM_FOURBIT_MODE, priv->sdmmc_base +
-                      SDMMC_BUSMODE);
-
-               reg_tmp = readb(priv->sdmmc_base + SDMMC_EXTCTRL);
-               writeb(reg_tmp & 0xFB, priv->sdmmc_base + SDMMC_EXTCTRL);
+               busmode |= BM_FOURBIT_MODE;
                break;
        case MMC_BUS_WIDTH_1:
-               reg_tmp = readb(priv->sdmmc_base + SDMMC_BUSMODE);
-               writeb(reg_tmp & BM_ONEBIT_MASK, priv->sdmmc_base +
-                      SDMMC_BUSMODE);
-
-               reg_tmp = readb(priv->sdmmc_base + SDMMC_EXTCTRL);
-               writeb(reg_tmp & 0xFB, priv->sdmmc_base + SDMMC_EXTCTRL);
                break;
        }
+
+       writeb(busmode, priv->sdmmc_base + SDMMC_BUSMODE);
+       writeb(extctrl, priv->sdmmc_base + SDMMC_EXTCTRL);
 }
 
 static int wmt_mci_get_ro(struct mmc_host *mmc)
@@ -830,7 +829,7 @@ static int wmt_mci_probe(struct platform_device *pdev)
                goto fail3;
        }
 
-       ret = request_irq(dma_irq, wmt_mci_dma_isr, 32, "sdmmc", priv);
+       ret = request_irq(dma_irq, wmt_mci_dma_isr, 0, "sdmmc", priv);
        if (ret) {
                dev_err(&pdev->dev, "Register DMA IRQ fail\n");
                goto fail4;
index af7c40ac1455bb52a139a9399983aed0f5de9b17..e1a8f4e19983ffe4f87139af1849906348648333 100644 (file)
@@ -581,7 +581,11 @@ static struct xgene_enet_desc_ring *xgene_enet_create_desc_ring(
        struct xgene_enet_desc_ring *ring;
        struct xgene_enet_pdata *pdata = netdev_priv(ndev);
        struct device *dev = ndev_to_dev(ndev);
-       u32 size;
+       int size;
+
+       size = xgene_enet_get_ring_size(dev, cfgsize);
+       if (size < 0)
+               return NULL;
 
        ring = devm_kzalloc(dev, sizeof(struct xgene_enet_desc_ring),
                            GFP_KERNEL);
@@ -593,7 +597,6 @@ static struct xgene_enet_desc_ring *xgene_enet_create_desc_ring(
        ring->cfgsize = cfgsize;
        ring->id = ring_id;
 
-       size = xgene_enet_get_ring_size(dev, cfgsize);
        ring->desc_addr = dma_zalloc_coherent(dev, size, &ring->dma,
                                              GFP_KERNEL);
        if (!ring->desc_addr) {
index a3dd5dc64f4cfe1e60524d5f67eaa6b1bf83cea4..4296b3d26f02f63851f134e85516bbb9f7e7ddce 100644 (file)
@@ -14093,8 +14093,9 @@ static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev,
 
        spin_lock_bh(&tp->lock);
        if (!tp->hw_stats) {
+               *stats = tp->net_stats_prev;
                spin_unlock_bh(&tp->lock);
-               return &tp->net_stats_prev;
+               return stats;
        }
 
        tg3_get_nstats(tp, stats);
index 811f1351db7a6f4b1841478d776fdf0aa6431cc0..43e08d0bc3d316ff407555bf1ec170c44c3c2be2 100644 (file)
@@ -897,5 +897,6 @@ void be_roce_dev_remove(struct be_adapter *);
  */
 void be_roce_dev_open(struct be_adapter *);
 void be_roce_dev_close(struct be_adapter *);
+void be_roce_dev_shutdown(struct be_adapter *);
 
 #endif                         /* BE_H */
index db4ff14ff18f0adcffdcb2d8ca54b9e46f65f7c5..9cdeda54674aed9a2765454a327901d2733c6d2c 100644 (file)
@@ -5014,6 +5014,7 @@ static void be_shutdown(struct pci_dev *pdev)
        if (!adapter)
                return;
 
+       be_roce_dev_shutdown(adapter);
        cancel_delayed_work_sync(&adapter->work);
        cancel_delayed_work_sync(&adapter->func_recovery_work);
 
index 5bf16603a3e9d9854b39554a75065dde75c11eff..ef4672dc735750d24186c987fd1a34e478b21b60 100644 (file)
@@ -120,7 +120,8 @@ static void _be_roce_dev_open(struct be_adapter *adapter)
 {
        if (ocrdma_drv && adapter->ocrdma_dev &&
            ocrdma_drv->state_change_handler)
-               ocrdma_drv->state_change_handler(adapter->ocrdma_dev, 0);
+               ocrdma_drv->state_change_handler(adapter->ocrdma_dev,
+                                                BE_DEV_UP);
 }
 
 void be_roce_dev_open(struct be_adapter *adapter)
@@ -136,7 +137,8 @@ static void _be_roce_dev_close(struct be_adapter *adapter)
 {
        if (ocrdma_drv && adapter->ocrdma_dev &&
            ocrdma_drv->state_change_handler)
-               ocrdma_drv->state_change_handler(adapter->ocrdma_dev, 1);
+               ocrdma_drv->state_change_handler(adapter->ocrdma_dev,
+                                                BE_DEV_DOWN);
 }
 
 void be_roce_dev_close(struct be_adapter *adapter)
@@ -148,6 +150,18 @@ void be_roce_dev_close(struct be_adapter *adapter)
        }
 }
 
+void be_roce_dev_shutdown(struct be_adapter *adapter)
+{
+       if (!be_roce_supported(adapter))
+               return;
+       mutex_lock(&be_adapter_list_lock);
+       if (ocrdma_drv && adapter->ocrdma_dev &&
+           ocrdma_drv->state_change_handler)
+               ocrdma_drv->state_change_handler(adapter->ocrdma_dev,
+                                                BE_DEV_SHUTDOWN);
+       mutex_unlock(&be_adapter_list_lock);
+}
+
 int be_roce_register_driver(struct ocrdma_driver *drv)
 {
        struct be_adapter *dev;
index a3d9e96c18eb88ff720fd59af6d1a975bc5f71b1..e6f7eb1a7d879b23ba57e8e03f301f38b7c33d12 100644 (file)
@@ -62,7 +62,8 @@ struct ocrdma_driver {
 
 enum {
        BE_DEV_UP       = 0,
-       BE_DEV_DOWN     = 1
+       BE_DEV_DOWN     = 1,
+       BE_DEV_SHUTDOWN = 2
 };
 
 /* APIs for RoCE driver to register callback handlers,
index 775d9969b5c2bfbac9638602ee327cd1f4980502..cd473e29524268f7d9092fa4c51de79997e1e338 100644 (file)
@@ -1,6 +1,6 @@
 #
 # Makefile for the eHEA ethernet device driver for IBM eServer System p
 #
-ehea-y = ehea_main.o ehea_phyp.o ehea_qmr.o ehea_ethtool.o ehea_phyp.o
+ehea-y = ehea_main.o ehea_phyp.o ehea_qmr.o ehea_ethtool.o
 obj-$(CONFIG_EHEA) += ehea.o
 
index 58856032298db4847d1ad7fe546ccbf9e9f85884..06edfca1a35e74752af584a53f8857083f4273f3 100644 (file)
@@ -47,7 +47,7 @@ static u8 e1000_calculate_checksum(u8 *buffer, u32 length)
  *  e1000_mng_enable_host_if - Checks host interface is enabled
  *  @hw: pointer to the HW structure
  *
- *  Returns E1000_success upon success, else E1000_ERR_HOST_INTERFACE_COMMAND
+ *  Returns 0 upon success, else -E1000_ERR_HOST_INTERFACE_COMMAND
  *
  *  This function checks whether the HOST IF is enabled for command operation
  *  and also checks whether the previous command is completed.  It busy waits
@@ -78,7 +78,7 @@ static s32 e1000_mng_enable_host_if(struct e1000_hw *hw)
        }
 
        if (i == E1000_MNG_DHCP_COMMAND_TIMEOUT) {
-               e_dbg("Previous command timeout failed .\n");
+               e_dbg("Previous command timeout failed.\n");
                return -E1000_ERR_HOST_INTERFACE_COMMAND;
        }
 
index 6938fc1ad877bc08bf3b67ea3ae3e7f79ac0943e..5d01db1d789b0c64ddf4871a06fee3a34a19af8f 100644 (file)
@@ -33,6 +33,7 @@
 #include <scsi/fc/fc_fcoe.h>
 #include <scsi/libfc.h>
 #include <scsi/libfcoe.h>
+#include <uapi/linux/dcbnl.h>
 
 #include "i40e.h"
 #include "i40e_fcoe.h"
index 51bc03072ed3dee4e05ec62bc3180c14c44eb3e0..871474f6fe62bd1e9e09df8168e020d230653e7b 100644 (file)
@@ -4415,13 +4415,13 @@ static void i40e_print_link_message(struct i40e_vsi *vsi, bool isup)
 
        switch (vsi->back->hw.phy.link_info.link_speed) {
        case I40E_LINK_SPEED_40GB:
-               strncpy(speed, "40 Gbps", SPEED_SIZE);
+               strlcpy(speed, "40 Gbps", SPEED_SIZE);
                break;
        case I40E_LINK_SPEED_10GB:
-               strncpy(speed, "10 Gbps", SPEED_SIZE);
+               strlcpy(speed, "10 Gbps", SPEED_SIZE);
                break;
        case I40E_LINK_SPEED_1GB:
-               strncpy(speed, "1000 Mbps", SPEED_SIZE);
+               strlcpy(speed, "1000 Mbps", SPEED_SIZE);
                break;
        default:
                break;
@@ -4429,16 +4429,16 @@ static void i40e_print_link_message(struct i40e_vsi *vsi, bool isup)
 
        switch (vsi->back->hw.fc.current_mode) {
        case I40E_FC_FULL:
-               strncpy(fc, "RX/TX", FC_SIZE);
+               strlcpy(fc, "RX/TX", FC_SIZE);
                break;
        case I40E_FC_TX_PAUSE:
-               strncpy(fc, "TX", FC_SIZE);
+               strlcpy(fc, "TX", FC_SIZE);
                break;
        case I40E_FC_RX_PAUSE:
-               strncpy(fc, "RX", FC_SIZE);
+               strlcpy(fc, "RX", FC_SIZE);
                break;
        default:
-               strncpy(fc, "None", FC_SIZE);
+               strlcpy(fc, "None", FC_SIZE);
                break;
        }
 
@@ -5839,7 +5839,7 @@ static void i40e_send_version(struct i40e_pf *pf)
        dv.minor_version = DRV_VERSION_MINOR;
        dv.build_version = DRV_VERSION_BUILD;
        dv.subbuild_version = 0;
-       strncpy(dv.driver_string, DRV_VERSION, sizeof(dv.driver_string));
+       strlcpy(dv.driver_string, DRV_VERSION, sizeof(dv.driver_string));
        i40e_aq_send_driver_version(&pf->hw, &dv, NULL);
 }
 
@@ -6293,7 +6293,7 @@ static int i40e_vsi_alloc_arrays(struct i40e_vsi *vsi, bool alloc_qvectors)
 
        if (alloc_qvectors) {
                /* allocate memory for q_vector pointers */
-               size = sizeof(struct i40e_q_vectors *) * vsi->num_q_vectors;
+               size = sizeof(struct i40e_q_vector *) * vsi->num_q_vectors;
                vsi->q_vectors = kzalloc(size, GFP_KERNEL);
                if (!vsi->q_vectors) {
                        ret = -ENOMEM;
index 97bda3dffd4959fe1edec07570f4a95d0c28dc30..25c4f9a3011f59715f1d9489587d14c310a10097 100644 (file)
@@ -251,9 +251,9 @@ i40e_status i40e_read_nvm_buffer(struct i40e_hw *hw, u16 offset,
  *
  * Writes a 16 bit words buffer to the Shadow RAM using the admin command.
  **/
-i40e_status i40e_write_nvm_aq(struct i40e_hw *hw, u8 module_pointer,
-                             u32 offset, u16 words, void *data,
-                             bool last_command)
+static i40e_status i40e_write_nvm_aq(struct i40e_hw *hw, u8 module_pointer,
+                                    u32 offset, u16 words, void *data,
+                                    bool last_command)
 {
        i40e_status ret_code = I40E_ERR_NVM;
 
index 5d940a26055c64ab71250f1d998b9b38418fb2ec..65a4a0f88ea0d80090bc2e604fe0f7e2b0bb2fad 100644 (file)
@@ -1310,6 +1310,15 @@ static struct mlx4_cmd_info cmd_info[] = {
                .verify = NULL,
                .wrapper = mlx4_MAD_IFC_wrapper
        },
+       {
+               .opcode = MLX4_CMD_MAD_DEMUX,
+               .has_inbox = false,
+               .has_outbox = false,
+               .out_is_imm = false,
+               .encode_slave_id = false,
+               .verify = NULL,
+               .wrapper = mlx4_CMD_EPERM_wrapper
+       },
        {
                .opcode = MLX4_CMD_QUERY_IF_STAT,
                .has_inbox = false,
index 688e1eabab29d697137477ea4a05b1dbb46a6d1f..494753e44ae3a1324fac4a269f60868766926717 100644 (file)
@@ -136,7 +136,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
                [7] = "FSM (MAC anti-spoofing) support",
                [8] = "Dynamic QP updates support",
                [9] = "Device managed flow steering IPoIB support",
-               [10] = "TCP/IP offloads/flow-steering for VXLAN support"
+               [10] = "TCP/IP offloads/flow-steering for VXLAN support",
+               [11] = "MAD DEMUX (Secure-Host) support"
        };
        int i;
 
@@ -571,6 +572,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET                0xa0
 #define QUERY_DEV_CAP_FW_REASSIGN_MAC          0x9d
 #define QUERY_DEV_CAP_VXLAN                    0x9e
+#define QUERY_DEV_CAP_MAD_DEMUX_OFFSET         0xb0
 
        dev_cap->flags2 = 0;
        mailbox = mlx4_alloc_cmd_mailbox(dev);
@@ -748,6 +750,11 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
                MLX4_GET(dev_cap->max_counters, outbox,
                         QUERY_DEV_CAP_MAX_COUNTERS_OFFSET);
 
+       MLX4_GET(field32, outbox,
+                QUERY_DEV_CAP_MAD_DEMUX_OFFSET);
+       if (field32 & (1 << 0))
+               dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_MAD_DEMUX;
+
        MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
        if (field32 & (1 << 16))
                dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP;
@@ -2016,3 +2023,85 @@ void mlx4_opreq_action(struct work_struct *work)
 out:
        mlx4_free_cmd_mailbox(dev, mailbox);
 }
+
+static int mlx4_check_smp_firewall_active(struct mlx4_dev *dev,
+                                         struct mlx4_cmd_mailbox *mailbox)
+{
+#define MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET             0x10
+#define MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET         0x20
+#define MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET            0x40
+#define MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET    0x70
+
+       u32 set_attr_mask, getresp_attr_mask;
+       u32 trap_attr_mask, traprepress_attr_mask;
+
+       MLX4_GET(set_attr_mask, mailbox->buf,
+                MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET);
+       mlx4_dbg(dev, "SMP firewall set_attribute_mask = 0x%x\n",
+                set_attr_mask);
+
+       MLX4_GET(getresp_attr_mask, mailbox->buf,
+                MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET);
+       mlx4_dbg(dev, "SMP firewall getresp_attribute_mask = 0x%x\n",
+                getresp_attr_mask);
+
+       MLX4_GET(trap_attr_mask, mailbox->buf,
+                MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET);
+       mlx4_dbg(dev, "SMP firewall trap_attribute_mask = 0x%x\n",
+                trap_attr_mask);
+
+       MLX4_GET(traprepress_attr_mask, mailbox->buf,
+                MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET);
+       mlx4_dbg(dev, "SMP firewall traprepress_attribute_mask = 0x%x\n",
+                traprepress_attr_mask);
+
+       if (set_attr_mask && getresp_attr_mask && trap_attr_mask &&
+           traprepress_attr_mask)
+               return 1;
+
+       return 0;
+}
+
+int mlx4_config_mad_demux(struct mlx4_dev *dev)
+{
+       struct mlx4_cmd_mailbox *mailbox;
+       int secure_host_active;
+       int err;
+
+       /* Check if mad_demux is supported */
+       if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_MAD_DEMUX))
+               return 0;
+
+       mailbox = mlx4_alloc_cmd_mailbox(dev);
+       if (IS_ERR(mailbox)) {
+               mlx4_warn(dev, "Failed to allocate mailbox for cmd MAD_DEMUX");
+               return -ENOMEM;
+       }
+
+       /* Query mad_demux to find out which MADs are handled by internal sma */
+       err = mlx4_cmd_box(dev, 0, mailbox->dma, 0x01 /* subn mgmt class */,
+                          MLX4_CMD_MAD_DEMUX_QUERY_RESTR, MLX4_CMD_MAD_DEMUX,
+                          MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+       if (err) {
+               mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: query restrictions failed (%d)\n",
+                         err);
+               goto out;
+       }
+
+       secure_host_active = mlx4_check_smp_firewall_active(dev, mailbox);
+
+       /* Config mad_demux to handle all MADs returned by the query above */
+       err = mlx4_cmd(dev, mailbox->dma, 0x01 /* subn mgmt class */,
+                      MLX4_CMD_MAD_DEMUX_CONFIG, MLX4_CMD_MAD_DEMUX,
+                      MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+       if (err) {
+               mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: configure failed (%d)\n", err);
+               goto out;
+       }
+
+       if (secure_host_active)
+               mlx4_warn(dev, "HCA operating in secure-host mode. SMP firewall activated.\n");
+out:
+       mlx4_free_cmd_mailbox(dev, mailbox);
+       return err;
+}
index 80b8c5f30e4e709648282cc88cf29df7438c7bdd..0158689906fd69c9035c34f46b6a104279c14636 100644 (file)
@@ -1853,6 +1853,11 @@ static int mlx4_setup_hca(struct mlx4_dev *dev)
                        mlx4_err(dev, "Failed to initialize multicast group table, aborting\n");
                        goto err_mr_table_free;
                }
+               err = mlx4_config_mad_demux(dev);
+               if (err) {
+                       mlx4_err(dev, "Failed in config_mad_demux, aborting\n");
+                       goto err_mcg_table_free;
+               }
        }
 
        err = mlx4_init_eq_table(dev);
index 13fbcd03c3e414674ac37b4737d4fedc3998b264..b508c7887ef84999c92bc4e959134cf0bf6ffaf3 100644 (file)
@@ -274,6 +274,8 @@ struct mlx4_icm_table {
 #define MLX4_MPT_FLAG_PHYSICAL     (1 <<  9)
 #define MLX4_MPT_FLAG_REGION       (1 <<  8)
 
+#define MLX4_MPT_PD_MASK           (0x1FFFFUL)
+#define MLX4_MPT_PD_VF_MASK        (0xFE0000UL)
 #define MLX4_MPT_PD_FLAG_FAST_REG   (1 << 27)
 #define MLX4_MPT_PD_FLAG_RAE       (1 << 28)
 #define MLX4_MPT_PD_FLAG_EN_INV            (3 << 24)
@@ -1306,5 +1308,6 @@ void mlx4_init_quotas(struct mlx4_dev *dev);
 int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port);
 /* Returns the VF index of slave */
 int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave);
+int mlx4_config_mad_demux(struct mlx4_dev *dev);
 
 #endif /* MLX4_H */
index 2839abb878a6ae7d9cc0809de20b621e1865cbbe..7d717eccb7b042284cc861fa8da2af0d63addb11 100644 (file)
@@ -298,6 +298,131 @@ static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox
                            MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
 }
 
+int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
+                      struct mlx4_mpt_entry ***mpt_entry)
+{
+       int err;
+       int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1);
+       struct mlx4_cmd_mailbox *mailbox = NULL;
+
+       /* Make sure that at this point we have single-threaded access only */
+
+       if (mmr->enabled != MLX4_MPT_EN_HW)
+               return -EINVAL;
+
+       err = mlx4_HW2SW_MPT(dev, NULL, key);
+
+       if (err) {
+               mlx4_warn(dev, "HW2SW_MPT failed (%d).", err);
+               mlx4_warn(dev, "Most likely the MR has MWs bound to it.\n");
+               return err;
+       }
+
+       mmr->enabled = MLX4_MPT_EN_SW;
+
+       if (!mlx4_is_mfunc(dev)) {
+               **mpt_entry = mlx4_table_find(
+                               &mlx4_priv(dev)->mr_table.dmpt_table,
+                               key, NULL);
+       } else {
+               mailbox = mlx4_alloc_cmd_mailbox(dev);
+               if (IS_ERR_OR_NULL(mailbox))
+                       return PTR_ERR(mailbox);
+
+               err = mlx4_cmd_box(dev, 0, mailbox->dma, key,
+                                  0, MLX4_CMD_QUERY_MPT,
+                                  MLX4_CMD_TIME_CLASS_B,
+                                  MLX4_CMD_WRAPPED);
+
+               if (err)
+                       goto free_mailbox;
+
+               *mpt_entry = (struct mlx4_mpt_entry **)&mailbox->buf;
+       }
+
+       if (!(*mpt_entry) || !(**mpt_entry)) {
+               err = -ENOMEM;
+               goto free_mailbox;
+       }
+
+       return 0;
+
+free_mailbox:
+       mlx4_free_cmd_mailbox(dev, mailbox);
+       return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_get_mpt);
+
+int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
+                        struct mlx4_mpt_entry **mpt_entry)
+{
+       int err;
+
+       if (!mlx4_is_mfunc(dev)) {
+               /* Make sure any changes to this entry are flushed */
+               wmb();
+
+               *(u8 *)(*mpt_entry) = MLX4_MPT_STATUS_HW;
+
+               /* Make sure the new status is written */
+               wmb();
+
+               err = mlx4_SYNC_TPT(dev);
+       } else {
+               int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1);
+
+               struct mlx4_cmd_mailbox *mailbox =
+                       container_of((void *)mpt_entry, struct mlx4_cmd_mailbox,
+                                    buf);
+
+               err = mlx4_SW2HW_MPT(dev, mailbox, key);
+       }
+
+       mmr->pd = be32_to_cpu((*mpt_entry)->pd_flags) & MLX4_MPT_PD_MASK;
+       if (!err)
+               mmr->enabled = MLX4_MPT_EN_HW;
+       return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_write_mpt);
+
+void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev,
+                       struct mlx4_mpt_entry **mpt_entry)
+{
+       if (mlx4_is_mfunc(dev)) {
+               struct mlx4_cmd_mailbox *mailbox =
+                       container_of((void *)mpt_entry, struct mlx4_cmd_mailbox,
+                                    buf);
+               mlx4_free_cmd_mailbox(dev, mailbox);
+       }
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_put_mpt);
+
+int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry,
+                        u32 pdn)
+{
+       u32 pd_flags = be32_to_cpu(mpt_entry->pd_flags);
+       /* The wrapper function will put the slave's id here */
+       if (mlx4_is_mfunc(dev))
+               pd_flags &= ~MLX4_MPT_PD_VF_MASK;
+       mpt_entry->pd_flags = cpu_to_be32((pd_flags &  ~MLX4_MPT_PD_MASK) |
+                                         (pdn & MLX4_MPT_PD_MASK)
+                                         | MLX4_MPT_PD_FLAG_EN_INV);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_pd);
+
+int mlx4_mr_hw_change_access(struct mlx4_dev *dev,
+                            struct mlx4_mpt_entry *mpt_entry,
+                            u32 access)
+{
+       u32 flags = (be32_to_cpu(mpt_entry->flags) & ~MLX4_PERM_MASK) |
+                   (access & MLX4_PERM_MASK);
+
+       mpt_entry->flags = cpu_to_be32(flags);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_access);
+
 static int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd,
                           u64 iova, u64 size, u32 access, int npages,
                           int page_shift, struct mlx4_mr *mr)
@@ -463,6 +588,41 @@ int mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr)
 }
 EXPORT_SYMBOL_GPL(mlx4_mr_free);
 
+void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+       mlx4_mtt_cleanup(dev, &mr->mtt);
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_cleanup);
+
+int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr,
+                           u64 iova, u64 size, int npages,
+                           int page_shift, struct mlx4_mpt_entry *mpt_entry)
+{
+       int err;
+
+       mpt_entry->start       = cpu_to_be64(mr->iova);
+       mpt_entry->length      = cpu_to_be64(mr->size);
+       mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift);
+
+       err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt);
+       if (err)
+               return err;
+
+       if (mr->mtt.order < 0) {
+               mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL);
+               mpt_entry->mtt_addr = 0;
+       } else {
+               mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev,
+                                                 &mr->mtt));
+               if (mr->mtt.page_shift == 0)
+                       mpt_entry->mtt_sz    = cpu_to_be32(1 << mr->mtt.order);
+       }
+       mr->enabled = MLX4_MPT_EN_SW;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_write);
+
 int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr)
 {
        struct mlx4_cmd_mailbox *mailbox;
index 0efc1368e5a8c4d25b4c95edf049fcae1bec0efb..1089367fed22632e087b2da0bcd150cd1dce6f48 100644 (file)
@@ -2613,12 +2613,34 @@ int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave,
        if (err)
                return err;
 
-       if (mpt->com.from_state != RES_MPT_HW) {
+       if (mpt->com.from_state == RES_MPT_MAPPED) {
+               /* In order to allow rereg in SRIOV, we need to alter the MPT entry. To do
+                * that, the VF must read the MPT. But since the MPT entry memory is not
+                * in the VF's virtual memory space, it must use QUERY_MPT to obtain the
+                * entry contents. To guarantee that the MPT cannot be changed, the driver
+                * must perform HW2SW_MPT before this query and return the MPT entry to HW
+                * ownership following the change. The change here allows the VF to
+                * perform QUERY_MPT also when the entry is in SW ownership.
+                */
+               struct mlx4_mpt_entry *mpt_entry = mlx4_table_find(
+                                       &mlx4_priv(dev)->mr_table.dmpt_table,
+                                       mpt->key, NULL);
+
+               if (NULL == mpt_entry || NULL == outbox->buf) {
+                       err = -EINVAL;
+                       goto out;
+               }
+
+               memcpy(outbox->buf, mpt_entry, sizeof(*mpt_entry));
+
+               err = 0;
+       } else if (mpt->com.from_state == RES_MPT_HW) {
+               err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+       } else {
                err = -EBUSY;
                goto out;
        }
 
-       err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
 
 out:
        put_res(dev, slave, id, RES_MPT);
index 69c26f04d8ce06aa0b74b944e12ed282e618e7fb..679db026f4befcaed7896429b0076d1219abcd38 100644 (file)
@@ -873,6 +873,10 @@ static int myri10ge_dma_test(struct myri10ge_priv *mgp, int test_type)
                return -ENOMEM;
        dmatest_bus = pci_map_page(mgp->pdev, dmatest_page, 0, PAGE_SIZE,
                                   DMA_BIDIRECTIONAL);
+       if (unlikely(pci_dma_mapping_error(mgp->pdev, dmatest_bus))) {
+               __free_page(dmatest_page);
+               return -ENOMEM;
+       }
 
        /* Run a small DMA test.
         * The magic multipliers to the length tell the firmware
@@ -1294,6 +1298,7 @@ myri10ge_alloc_rx_pages(struct myri10ge_priv *mgp, struct myri10ge_rx_buf *rx,
                        int bytes, int watchdog)
 {
        struct page *page;
+       dma_addr_t bus;
        int idx;
 #if MYRI10GE_ALLOC_SIZE > 4096
        int end_offset;
@@ -1318,11 +1323,21 @@ myri10ge_alloc_rx_pages(struct myri10ge_priv *mgp, struct myri10ge_rx_buf *rx,
                                        rx->watchdog_needed = 1;
                                return;
                        }
+
+                       bus = pci_map_page(mgp->pdev, page, 0,
+                                          MYRI10GE_ALLOC_SIZE,
+                                          PCI_DMA_FROMDEVICE);
+                       if (unlikely(pci_dma_mapping_error(mgp->pdev, bus))) {
+                               __free_pages(page, MYRI10GE_ALLOC_ORDER);
+                               if (rx->fill_cnt - rx->cnt < 16)
+                                       rx->watchdog_needed = 1;
+                               return;
+                       }
+
                        rx->page = page;
                        rx->page_offset = 0;
-                       rx->bus = pci_map_page(mgp->pdev, page, 0,
-                                              MYRI10GE_ALLOC_SIZE,
-                                              PCI_DMA_FROMDEVICE);
+                       rx->bus = bus;
+
                }
                rx->info[idx].page = rx->page;
                rx->info[idx].page_offset = rx->page_offset;
@@ -2764,6 +2779,35 @@ myri10ge_submit_req(struct myri10ge_tx_buf *tx, struct mcp_kreq_ether_send *src,
        mb();
 }
 
+static void myri10ge_unmap_tx_dma(struct myri10ge_priv *mgp,
+                                 struct myri10ge_tx_buf *tx, int idx)
+{
+       unsigned int len;
+       int last_idx;
+
+       /* Free any DMA resources we've alloced and clear out the skb slot */
+       last_idx = (idx + 1) & tx->mask;
+       idx = tx->req & tx->mask;
+       do {
+               len = dma_unmap_len(&tx->info[idx], len);
+               if (len) {
+                       if (tx->info[idx].skb != NULL)
+                               pci_unmap_single(mgp->pdev,
+                                                dma_unmap_addr(&tx->info[idx],
+                                                               bus), len,
+                                                PCI_DMA_TODEVICE);
+                       else
+                               pci_unmap_page(mgp->pdev,
+                                              dma_unmap_addr(&tx->info[idx],
+                                                             bus), len,
+                                              PCI_DMA_TODEVICE);
+                       dma_unmap_len_set(&tx->info[idx], len, 0);
+                       tx->info[idx].skb = NULL;
+               }
+               idx = (idx + 1) & tx->mask;
+       } while (idx != last_idx);
+}
+
 /*
  * Transmit a packet.  We need to split the packet so that a single
  * segment does not cross myri10ge->tx_boundary, so this makes segment
@@ -2787,7 +2831,7 @@ static netdev_tx_t myri10ge_xmit(struct sk_buff *skb,
        u32 low;
        __be32 high_swapped;
        unsigned int len;
-       int idx, last_idx, avail, frag_cnt, frag_idx, count, mss, max_segments;
+       int idx, avail, frag_cnt, frag_idx, count, mss, max_segments;
        u16 pseudo_hdr_offset, cksum_offset, queue;
        int cum_len, seglen, boundary, rdma_count;
        u8 flags, odd_flag;
@@ -2884,9 +2928,12 @@ again:
 
        /* map the skb for DMA */
        len = skb_headlen(skb);
+       bus = pci_map_single(mgp->pdev, skb->data, len, PCI_DMA_TODEVICE);
+       if (unlikely(pci_dma_mapping_error(mgp->pdev, bus)))
+               goto drop;
+
        idx = tx->req & tx->mask;
        tx->info[idx].skb = skb;
-       bus = pci_map_single(mgp->pdev, skb->data, len, PCI_DMA_TODEVICE);
        dma_unmap_addr_set(&tx->info[idx], bus, bus);
        dma_unmap_len_set(&tx->info[idx], len, len);
 
@@ -2985,12 +3032,16 @@ again:
                        break;
 
                /* map next fragment for DMA */
-               idx = (count + tx->req) & tx->mask;
                frag = &skb_shinfo(skb)->frags[frag_idx];
                frag_idx++;
                len = skb_frag_size(frag);
                bus = skb_frag_dma_map(&mgp->pdev->dev, frag, 0, len,
                                       DMA_TO_DEVICE);
+               if (unlikely(pci_dma_mapping_error(mgp->pdev, bus))) {
+                       myri10ge_unmap_tx_dma(mgp, tx, idx);
+                       goto drop;
+               }
+               idx = (count + tx->req) & tx->mask;
                dma_unmap_addr_set(&tx->info[idx], bus, bus);
                dma_unmap_len_set(&tx->info[idx], len, len);
        }
@@ -3021,31 +3072,8 @@ again:
        return NETDEV_TX_OK;
 
 abort_linearize:
-       /* Free any DMA resources we've alloced and clear out the skb
-        * slot so as to not trip up assertions, and to avoid a
-        * double-free if linearizing fails */
+       myri10ge_unmap_tx_dma(mgp, tx, idx);
 
-       last_idx = (idx + 1) & tx->mask;
-       idx = tx->req & tx->mask;
-       tx->info[idx].skb = NULL;
-       do {
-               len = dma_unmap_len(&tx->info[idx], len);
-               if (len) {
-                       if (tx->info[idx].skb != NULL)
-                               pci_unmap_single(mgp->pdev,
-                                                dma_unmap_addr(&tx->info[idx],
-                                                               bus), len,
-                                                PCI_DMA_TODEVICE);
-                       else
-                               pci_unmap_page(mgp->pdev,
-                                              dma_unmap_addr(&tx->info[idx],
-                                                             bus), len,
-                                              PCI_DMA_TODEVICE);
-                       dma_unmap_len_set(&tx->info[idx], len, 0);
-                       tx->info[idx].skb = NULL;
-               }
-               idx = (idx + 1) & tx->mask;
-       } while (idx != last_idx);
        if (skb_is_gso(skb)) {
                netdev_err(mgp->dev, "TSO but wanted to linearize?!?!?\n");
                goto drop;
index d813bfb1a847b527d27db6709eac8baac96a2513..23c89ab5a6ada1ade365939dcd7bb28da44ca736 100644 (file)
@@ -32,6 +32,11 @@ MODULE_DESCRIPTION("Sun LDOM virtual network driver");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(DRV_MODULE_VERSION);
 
+/* Heuristic for the number of times to exponentially backoff and
+ * retry sending an LDC trigger when EAGAIN is encountered
+ */
+#define        VNET_MAX_RETRIES        10
+
 /* Ordered from largest major to lowest */
 static struct vio_version vnet_versions[] = {
        { .major = 1, .minor = 0 },
@@ -260,6 +265,7 @@ static int vnet_send_ack(struct vnet_port *port, struct vio_dring_state *dr,
                .state                  = vio_dring_state,
        };
        int err, delay;
+       int retries = 0;
 
        hdr.seq = dr->snd_nxt;
        delay = 1;
@@ -272,6 +278,13 @@ static int vnet_send_ack(struct vnet_port *port, struct vio_dring_state *dr,
                udelay(delay);
                if ((delay <<= 1) > 128)
                        delay = 128;
+               if (retries++ > VNET_MAX_RETRIES) {
+                       pr_info("ECONNRESET %x:%x:%x:%x:%x:%x\n",
+                               port->raddr[0], port->raddr[1],
+                               port->raddr[2], port->raddr[3],
+                               port->raddr[4], port->raddr[5]);
+                       err = -ECONNRESET;
+               }
        } while (err == -EAGAIN);
 
        return err;
@@ -475,8 +488,9 @@ static int handle_mcast(struct vnet_port *port, void *msgbuf)
        return 0;
 }
 
-static void maybe_tx_wakeup(struct vnet *vp)
+static void maybe_tx_wakeup(unsigned long param)
 {
+       struct vnet *vp = (struct vnet *)param;
        struct net_device *dev = vp->dev;
 
        netif_tx_lock(dev);
@@ -573,8 +587,13 @@ static void vnet_event(void *arg, int event)
                        break;
        }
        spin_unlock(&vio->lock);
+       /* Kick off a tasklet to wake the queue.  We cannot call
+        * maybe_tx_wakeup directly here because we could deadlock on
+        * netif_tx_lock() with dev_watchdog()
+        */
        if (unlikely(tx_wakeup && err != -ECONNRESET))
-               maybe_tx_wakeup(port->vp);
+               tasklet_schedule(&port->vp->vnet_tx_wakeup);
+
        local_irq_restore(flags);
 }
 
@@ -593,6 +612,7 @@ static int __vnet_tx_trigger(struct vnet_port *port)
                .end_idx                = (u32) -1,
        };
        int err, delay;
+       int retries = 0;
 
        hdr.seq = dr->snd_nxt;
        delay = 1;
@@ -605,6 +625,8 @@ static int __vnet_tx_trigger(struct vnet_port *port)
                udelay(delay);
                if ((delay <<= 1) > 128)
                        delay = 128;
+               if (retries++ > VNET_MAX_RETRIES)
+                       break;
        } while (err == -EAGAIN);
 
        return err;
@@ -691,7 +713,15 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
                memset(tx_buf+VNET_PACKET_SKIP+skb->len, 0, len - skb->len);
        }
 
-       d->hdr.ack = VIO_ACK_ENABLE;
+       /* We don't rely on the ACKs to free the skb in vnet_start_xmit(),
+        * thus it is safe to not set VIO_ACK_ENABLE for each transmission:
+        * the protocol itself does not require it as long as the peer
+        * sends a VIO_SUBTYPE_ACK for VIO_DRING_STOPPED.
+        *
+        * An ACK for every packet in the ring is expensive as the
+        * sending of LDC messages is slow and affects performance.
+        */
+       d->hdr.ack = VIO_ACK_DISABLE;
        d->size = len;
        d->ncookies = port->tx_bufs[dr->prod].ncookies;
        for (i = 0; i < d->ncookies; i++)
@@ -1046,6 +1076,7 @@ static struct vnet *vnet_new(const u64 *local_mac)
        vp = netdev_priv(dev);
 
        spin_lock_init(&vp->lock);
+       tasklet_init(&vp->vnet_tx_wakeup, maybe_tx_wakeup, (unsigned long)vp);
        vp->dev = dev;
 
        INIT_LIST_HEAD(&vp->port_list);
@@ -1105,6 +1136,7 @@ static void vnet_cleanup(void)
                vp = list_first_entry(&vnet_list, struct vnet, list);
                list_del(&vp->list);
                dev = vp->dev;
+               tasklet_kill(&vp->vnet_tx_wakeup);
                /* vio_unregister_driver() should have cleaned up port_list */
                BUG_ON(!list_empty(&vp->port_list));
                unregister_netdev(dev);
index d347a5bf24b00d1f69e50dc78106086ae3e99eac..de5c2c64996f34c5eb76e5376a71d93b72ebf490 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef _SUNVNET_H
 #define _SUNVNET_H
 
+#include <linux/interrupt.h>
+
 #define DESC_NCOOKIES(entry_size)      \
        ((entry_size) - sizeof(struct vio_net_desc))
 
@@ -78,6 +80,8 @@ struct vnet {
 
        struct list_head        list;
        u64                     local_mac;
+
+       struct tasklet_struct   vnet_tx_wakeup;
 };
 
 #endif /* _SUNVNET_H */
index 36f4459520c366a88a33c9cf2663bf9e713955c3..fda5891835d4c8ea3b2fad1725d542dc0dacff10 100644 (file)
@@ -1170,7 +1170,6 @@ static struct platform_driver temac_of_driver = {
        .probe = temac_of_probe,
        .remove = temac_of_remove,
        .driver = {
-               .owner = THIS_MODULE,
                .name = "xilinx_temac",
                .of_match_table = temac_of_match,
        },
index 30e8608ff05079c3083ba2a6783070e056e36571..c8fd94133ecd22380cf047bf44a3e21064f5154f 100644 (file)
@@ -1645,7 +1645,6 @@ static struct platform_driver axienet_of_driver = {
        .probe = axienet_of_probe,
        .remove = axienet_of_remove,
        .driver = {
-                .owner = THIS_MODULE,
                 .name = "xilinx_axienet",
                 .of_match_table = axienet_of_match,
        },
index 782bb9373cd817e366bc1ba914659285863bd4e4..28dbbdc393ebf251cb782ff2ee4b5d276fa99d87 100644 (file)
@@ -1245,7 +1245,6 @@ MODULE_DEVICE_TABLE(of, xemaclite_of_match);
 static struct platform_driver xemaclite_of_driver = {
        .driver = {
                .name = DRIVER_NAME,
-               .owner = THIS_MODULE,
                .of_match_table = xemaclite_of_match,
        },
        .probe          = xemaclite_of_probe,
index 768dfe9a93159e4964d4b9c0bc6986da2e6c09b6..6d3e2093bf7f5575ce4360af9e0bdc7763bd11b1 100644 (file)
@@ -1755,17 +1755,4 @@ static struct pci_driver donauboe_pci_driver = {
        .resume         = toshoboe_wakeup 
 };
 
-static int __init
-donauboe_init (void)
-{
-  return pci_register_driver(&donauboe_pci_driver);
-}
-
-static void __exit
-donauboe_cleanup (void)
-{
-  pci_unregister_driver(&donauboe_pci_driver);
-}
-
-module_init(donauboe_init);
-module_exit(donauboe_cleanup);
+module_pci_driver(donauboe_pci_driver);
index ef8a5c20236a007b976f98e4c787b5520f9affd8..60e4ca01ccbb7d5c2fc26bc8fdcc26a49f943c17 100644 (file)
@@ -45,10 +45,9 @@ struct macvlan_port {
        struct sk_buff_head     bc_queue;
        struct work_struct      bc_work;
        bool                    passthru;
+       int                     count;
 };
 
-#define MACVLAN_PORT_IS_EMPTY(port)    list_empty(&port->vlans)
-
 struct macvlan_skb_cb {
        const struct macvlan_dev *src;
 };
@@ -667,7 +666,8 @@ static void macvlan_uninit(struct net_device *dev)
 
        free_percpu(vlan->pcpu_stats);
 
-       if (MACVLAN_PORT_IS_EMPTY(port))
+       port->count -= 1;
+       if (!port->count)
                macvlan_port_destroy(port->dev);
 }
 
@@ -1020,12 +1020,13 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
                vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]);
 
        if (vlan->mode == MACVLAN_MODE_PASSTHRU) {
-               if (!MACVLAN_PORT_IS_EMPTY(port))
+               if (port->count)
                        return -EINVAL;
                port->passthru = true;
                eth_hw_addr_inherit(dev, lowerdev);
        }
 
+       port->count += 1;
        err = register_netdevice(dev);
        if (err < 0)
                goto destroy_port;
@@ -1043,7 +1044,8 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 unregister_netdev:
        unregister_netdevice(dev);
 destroy_port:
-       if (MACVLAN_PORT_IS_EMPTY(port))
+       port->count -= 1;
+       if (!port->count)
                macvlan_port_destroy(lowerdev);
 
        return err;
index 8596aba34f96854dfe1700da64b44ef832e5fc9a..237d0cda1bcb053b40f994a9cbfcee51db3347ab 100644 (file)
@@ -256,6 +256,7 @@ struct ar9170 {
        atomic_t rx_work_urbs;
        atomic_t rx_pool_urbs;
        kernel_ulong_t features;
+       bool usb_ep_cmd_is_bulk;
 
        /* firmware settings */
        struct completion fw_load_wait;
index f35c7f30f9a6f66f234a4816260d0ea473fc573e..c9f93310c0d6c0abaffc780fd4c30b970a420c54 100644 (file)
@@ -621,9 +621,16 @@ int __carl9170_exec_cmd(struct ar9170 *ar, struct carl9170_cmd *cmd,
                goto err_free;
        }
 
-       usb_fill_int_urb(urb, ar->udev, usb_sndintpipe(ar->udev,
-               AR9170_USB_EP_CMD), cmd, cmd->hdr.len + 4,
-               carl9170_usb_cmd_complete, ar, 1);
+       if (ar->usb_ep_cmd_is_bulk)
+               usb_fill_bulk_urb(urb, ar->udev,
+                                 usb_sndbulkpipe(ar->udev, AR9170_USB_EP_CMD),
+                                 cmd, cmd->hdr.len + 4,
+                                 carl9170_usb_cmd_complete, ar);
+       else
+               usb_fill_int_urb(urb, ar->udev,
+                                usb_sndintpipe(ar->udev, AR9170_USB_EP_CMD),
+                                cmd, cmd->hdr.len + 4,
+                                carl9170_usb_cmd_complete, ar, 1);
 
        if (free_buf)
                urb->transfer_flags |= URB_FREE_BUFFER;
@@ -1032,9 +1039,10 @@ static void carl9170_usb_firmware_step2(const struct firmware *fw,
 static int carl9170_usb_probe(struct usb_interface *intf,
                              const struct usb_device_id *id)
 {
+       struct usb_endpoint_descriptor *ep;
        struct ar9170 *ar;
        struct usb_device *udev;
-       int err;
+       int i, err;
 
        err = usb_reset_device(interface_to_usbdev(intf));
        if (err)
@@ -1050,6 +1058,21 @@ static int carl9170_usb_probe(struct usb_interface *intf,
        ar->intf = intf;
        ar->features = id->driver_info;
 
+       /* We need to remember the type of endpoint 4 because it differs
+        * between high- and full-speed configuration. The high-speed
+        * configuration specifies it as interrupt and the full-speed
+        * configuration as bulk endpoint. This information is required
+        * later when sending urbs to that endpoint.
+        */
+       for (i = 0; i < intf->cur_altsetting->desc.bNumEndpoints; ++i) {
+               ep = &intf->cur_altsetting->endpoint[i].desc;
+
+               if (usb_endpoint_num(ep) == AR9170_USB_EP_CMD &&
+                   usb_endpoint_dir_out(ep) &&
+                   usb_endpoint_type(ep) == USB_ENDPOINT_XFER_BULK)
+                       ar->usb_ep_cmd_is_bulk = true;
+       }
+
        usb_set_intfdata(intf, ar);
        SET_IEEE80211_DEV(ar->hw, &intf->dev);
 
index 535c7eb01b3a423deb264c8db55a2858306b67ee..8f8b9373de95b6b2de89c017c2983d3abb247e98 100644 (file)
@@ -1318,6 +1318,8 @@ int brcmf_proto_msgbuf_attach(struct brcmf_pub *drvr)
        msgbuf->nrof_flowrings = if_msgbuf->nrof_flowrings;
        msgbuf->flowring_dma_handle = kzalloc(msgbuf->nrof_flowrings *
                sizeof(*msgbuf->flowring_dma_handle), GFP_ATOMIC);
+       if (!msgbuf->flowring_dma_handle)
+               goto fail;
 
        msgbuf->rx_dataoffset = if_msgbuf->rx_dataoffset;
        msgbuf->max_rxbufpost = if_msgbuf->max_rxbufpost;
@@ -1362,6 +1364,7 @@ fail:
                kfree(msgbuf->flow_map);
                kfree(msgbuf->txstatus_done_map);
                brcmf_msgbuf_release_pktids(msgbuf);
+               kfree(msgbuf->flowring_dma_handle);
                if (msgbuf->ioctbuf)
                        dma_free_coherent(drvr->bus_if->dev,
                                          BRCMF_TX_IOCTL_MAX_MSG_SIZE,
@@ -1391,6 +1394,7 @@ void brcmf_proto_msgbuf_detach(struct brcmf_pub *drvr)
                                  BRCMF_TX_IOCTL_MAX_MSG_SIZE,
                                  msgbuf->ioctbuf, msgbuf->ioctbuf_handle);
                brcmf_msgbuf_release_pktids(msgbuf);
+               kfree(msgbuf->flowring_dma_handle);
                kfree(msgbuf);
                drvr->proto->pd = NULL;
        }
index bc972c0ba5f89a2e4a9f0ca44902bf40e4de16cf..e5101b287e4eebd944e516d6b9a83a09ba2fa7e5 100644 (file)
@@ -591,12 +591,13 @@ static void brcmf_pcie_handle_mb_data(struct brcmf_pciedev_info *devinfo)
        }
        if (dtoh_mb_data & BRCMF_D2H_DEV_DS_EXIT_NOTE)
                brcmf_dbg(PCIE, "D2H_MB_DATA: DEEP SLEEP EXIT\n");
-       if (dtoh_mb_data & BRCMF_D2H_DEV_D3_ACK)
+       if (dtoh_mb_data & BRCMF_D2H_DEV_D3_ACK) {
                brcmf_dbg(PCIE, "D2H_MB_DATA: D3 ACK\n");
                if (waitqueue_active(&devinfo->mbdata_resp_wait)) {
                        devinfo->mbdata_completed = true;
                        wake_up(&devinfo->mbdata_resp_wait);
                }
+       }
 }
 
 
index c5aa404069f3b14d4c9bba36987faeed088cba58..389656bd1a742c3e90e4ff5dda1bd022a0df54ac 100644 (file)
@@ -9853,6 +9853,7 @@ static int ipw_wx_get_wireless_mode(struct net_device *dev,
                strncpy(extra, "unknown", MAX_WX_STRING);
                break;
        }
+       extra[MAX_WX_STRING - 1] = '\0';
 
        IPW_DEBUG_WX("PRIV GET MODE: %s\n", extra);
 
index 0d6a8b768a686fb4630528682635c7ee865ce97c..7c8796584c253d322291ec02cd177539a3869a14 100644 (file)
@@ -396,7 +396,8 @@ int iwl_mvm_mac_setup_register(struct iwl_mvm *mvm)
        else
                hw->wiphy->flags &= ~WIPHY_FLAG_PS_ON_BY_DEFAULT;
 
-       hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_SCHED_SCAN;
+       /* TODO: enable that only for firmwares that don't crash */
+       /* hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_SCHED_SCAN; */
        hw->wiphy->max_sched_scan_ssids = PROBE_OPTION_MAX;
        hw->wiphy->max_match_sets = IWL_SCAN_MAX_PROFILES;
        /* we create the 802.11 header and zero length SSID IE. */
index ef3026f46a37834a907919e614719f081c40ff7f..d4eb8d2e9cb7f7f2bc0b3816dd788e6eb01c7927 100644 (file)
@@ -165,6 +165,7 @@ struct xenvif_queue { /* Per-queue data for xenvif */
        u16 dealloc_ring[MAX_PENDING_REQS];
        struct task_struct *dealloc_task;
        wait_queue_head_t dealloc_wq;
+       atomic_t inflight_packets;
 
        /* Use kthread for guest RX */
        struct task_struct *task;
@@ -329,4 +330,8 @@ extern unsigned int xenvif_max_queues;
 extern struct dentry *xen_netback_dbg_root;
 #endif
 
+void xenvif_skb_zerocopy_prepare(struct xenvif_queue *queue,
+                                struct sk_buff *skb);
+void xenvif_skb_zerocopy_complete(struct xenvif_queue *queue);
+
 #endif /* __XEN_NETBACK__COMMON_H__ */
index bfd10cb9c8def782e412d6317f4b5b2cb23d0ef6..e29e15dca86ee3b7d1efc6d84a2607bcd173b449 100644 (file)
 #define XENVIF_QUEUE_LENGTH 32
 #define XENVIF_NAPI_WEIGHT  64
 
+/* This function is used to set SKBTX_DEV_ZEROCOPY as well as
+ * increasing the inflight counter. We need to increase the inflight
+ * counter because core driver calls into xenvif_zerocopy_callback
+ * which calls xenvif_skb_zerocopy_complete.
+ */
+void xenvif_skb_zerocopy_prepare(struct xenvif_queue *queue,
+                                struct sk_buff *skb)
+{
+       skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+       atomic_inc(&queue->inflight_packets);
+}
+
+void xenvif_skb_zerocopy_complete(struct xenvif_queue *queue)
+{
+       atomic_dec(&queue->inflight_packets);
+}
+
 static inline void xenvif_stop_queue(struct xenvif_queue *queue)
 {
        struct net_device *dev = queue->vif->dev;
@@ -524,9 +541,6 @@ int xenvif_init_queue(struct xenvif_queue *queue)
 
        init_timer(&queue->rx_stalled);
 
-       netif_napi_add(queue->vif->dev, &queue->napi, xenvif_poll,
-                       XENVIF_NAPI_WEIGHT);
-
        return 0;
 }
 
@@ -560,6 +574,7 @@ int xenvif_connect(struct xenvif_queue *queue, unsigned long tx_ring_ref,
 
        init_waitqueue_head(&queue->wq);
        init_waitqueue_head(&queue->dealloc_wq);
+       atomic_set(&queue->inflight_packets, 0);
 
        if (tx_evtchn == rx_evtchn) {
                /* feature-split-event-channels == 0 */
@@ -614,6 +629,9 @@ int xenvif_connect(struct xenvif_queue *queue, unsigned long tx_ring_ref,
        wake_up_process(queue->task);
        wake_up_process(queue->dealloc_task);
 
+       netif_napi_add(queue->vif->dev, &queue->napi, xenvif_poll,
+                       XENVIF_NAPI_WEIGHT);
+
        return 0;
 
 err_rx_unbind:
@@ -642,25 +660,6 @@ void xenvif_carrier_off(struct xenvif *vif)
        rtnl_unlock();
 }
 
-static void xenvif_wait_unmap_timeout(struct xenvif_queue *queue,
-                                     unsigned int worst_case_skb_lifetime)
-{
-       int i, unmap_timeout = 0;
-
-       for (i = 0; i < MAX_PENDING_REQS; ++i) {
-               if (queue->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
-                       unmap_timeout++;
-                       schedule_timeout(msecs_to_jiffies(1000));
-                       if (unmap_timeout > worst_case_skb_lifetime &&
-                           net_ratelimit())
-                               netdev_err(queue->vif->dev,
-                                          "Page still granted! Index: %x\n",
-                                          i);
-                       i = -1;
-               }
-       }
-}
-
 void xenvif_disconnect(struct xenvif *vif)
 {
        struct xenvif_queue *queue = NULL;
@@ -672,6 +671,8 @@ void xenvif_disconnect(struct xenvif *vif)
        for (queue_index = 0; queue_index < num_queues; ++queue_index) {
                queue = &vif->queues[queue_index];
 
+               netif_napi_del(&queue->napi);
+
                if (queue->task) {
                        del_timer_sync(&queue->rx_stalled);
                        kthread_stop(queue->task);
@@ -704,7 +705,6 @@ void xenvif_disconnect(struct xenvif *vif)
 void xenvif_deinit_queue(struct xenvif_queue *queue)
 {
        free_xenballooned_pages(MAX_PENDING_REQS, queue->mmap_pages);
-       netif_napi_del(&queue->napi);
 }
 
 void xenvif_free(struct xenvif *vif)
@@ -712,21 +712,11 @@ void xenvif_free(struct xenvif *vif)
        struct xenvif_queue *queue = NULL;
        unsigned int num_queues = vif->num_queues;
        unsigned int queue_index;
-       /* Here we want to avoid timeout messages if an skb can be legitimately
-        * stuck somewhere else. Realistically this could be an another vif's
-        * internal or QDisc queue. That another vif also has this
-        * rx_drain_timeout_msecs timeout, so give it time to drain out.
-        * Although if that other guest wakes up just before its timeout happens
-        * and takes only one skb from QDisc, it can hold onto other skbs for a
-        * longer period.
-        */
-       unsigned int worst_case_skb_lifetime = (rx_drain_timeout_msecs/1000);
 
        unregister_netdev(vif->dev);
 
        for (queue_index = 0; queue_index < num_queues; ++queue_index) {
                queue = &vif->queues[queue_index];
-               xenvif_wait_unmap_timeout(queue, worst_case_skb_lifetime);
                xenvif_deinit_queue(queue);
        }
 
index 4734472aa6201a5dac4ac8c0f42e4faba30dcf88..08f65996534cbd1b861c0031b70ebd6d78d47720 100644 (file)
@@ -1525,10 +1525,12 @@ static int xenvif_handle_frag_list(struct xenvif_queue *queue, struct sk_buff *s
        /* remove traces of mapped pages and frag_list */
        skb_frag_list_init(skb);
        uarg = skb_shinfo(skb)->destructor_arg;
+       /* increase inflight counter to offset decrement in callback */
+       atomic_inc(&queue->inflight_packets);
        uarg->callback(uarg, true);
        skb_shinfo(skb)->destructor_arg = NULL;
 
-       skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+       xenvif_skb_zerocopy_prepare(queue, nskb);
        kfree_skb(nskb);
 
        return 0;
@@ -1589,7 +1591,7 @@ static int xenvif_tx_submit(struct xenvif_queue *queue)
                                if (net_ratelimit())
                                        netdev_err(queue->vif->dev,
                                                   "Not enough memory to consolidate frag_list!\n");
-                               skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+                               xenvif_skb_zerocopy_prepare(queue, skb);
                                kfree_skb(skb);
                                continue;
                        }
@@ -1609,7 +1611,7 @@ static int xenvif_tx_submit(struct xenvif_queue *queue)
                                   "Can't setup checksum in net_tx_action\n");
                        /* We have to set this flag to trigger the callback */
                        if (skb_shinfo(skb)->destructor_arg)
-                               skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+                               xenvif_skb_zerocopy_prepare(queue, skb);
                        kfree_skb(skb);
                        continue;
                }
@@ -1641,7 +1643,7 @@ static int xenvif_tx_submit(struct xenvif_queue *queue)
                 * skb. E.g. the __pskb_pull_tail earlier can do such thing.
                 */
                if (skb_shinfo(skb)->destructor_arg) {
-                       skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+                       xenvif_skb_zerocopy_prepare(queue, skb);
                        queue->stats.tx_zerocopy_sent++;
                }
 
@@ -1681,6 +1683,7 @@ void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
                queue->stats.tx_zerocopy_success++;
        else
                queue->stats.tx_zerocopy_fail++;
+       xenvif_skb_zerocopy_complete(queue);
 }
 
 static inline void xenvif_tx_dealloc_action(struct xenvif_queue *queue)
@@ -2058,15 +2061,24 @@ int xenvif_kthread_guest_rx(void *data)
        return 0;
 }
 
+static bool xenvif_dealloc_kthread_should_stop(struct xenvif_queue *queue)
+{
+       /* Dealloc thread must remain running until all inflight
+        * packets complete.
+        */
+       return kthread_should_stop() &&
+               !atomic_read(&queue->inflight_packets);
+}
+
 int xenvif_dealloc_kthread(void *data)
 {
        struct xenvif_queue *queue = data;
 
-       while (!kthread_should_stop()) {
+       for (;;) {
                wait_event_interruptible(queue->dealloc_wq,
                                         tx_dealloc_work_todo(queue) ||
-                                        kthread_should_stop());
-               if (kthread_should_stop())
+                                        xenvif_dealloc_kthread_should_stop(queue));
+               if (xenvif_dealloc_kthread_should_stop(queue))
                        break;
 
                xenvif_tx_dealloc_action(queue);
index 580517d857bf7c753e3a66d6f1da6177f81365c3..9c47b897b6d212c3de827ddde941acb325111546 100644 (file)
@@ -116,6 +116,7 @@ static int xenvif_read_io_ring(struct seq_file *m, void *v)
 }
 
 #define XENVIF_KICK_STR "kick"
+#define BUFFER_SIZE     32
 
 static ssize_t
 xenvif_write_io_ring(struct file *filp, const char __user *buf, size_t count,
@@ -124,22 +125,24 @@ xenvif_write_io_ring(struct file *filp, const char __user *buf, size_t count,
        struct xenvif_queue *queue =
                ((struct seq_file *)filp->private_data)->private;
        int len;
-       char write[sizeof(XENVIF_KICK_STR)];
+       char write[BUFFER_SIZE];
 
        /* don't allow partial writes and check the length */
        if (*ppos != 0)
                return 0;
-       if (count < sizeof(XENVIF_KICK_STR) - 1)
+       if (count >= sizeof(write))
                return -ENOSPC;
 
        len = simple_write_to_buffer(write,
-                                    sizeof(write),
+                                    sizeof(write) - 1,
                                     ppos,
                                     buf,
                                     count);
        if (len < 0)
                return len;
 
+       write[len] = '\0';
+
        if (!strncmp(write, XENVIF_KICK_STR, sizeof(XENVIF_KICK_STR) - 1))
                xenvif_interrupt(0, (void *)queue);
        else {
@@ -171,10 +174,9 @@ static const struct file_operations xenvif_dbg_io_ring_ops_fops = {
        .write = xenvif_write_io_ring,
 };
 
-static void xenvif_debugfs_addif(struct xenvif_queue *queue)
+static void xenvif_debugfs_addif(struct xenvif *vif)
 {
        struct dentry *pfile;
-       struct xenvif *vif = queue->vif;
        int i;
 
        if (IS_ERR_OR_NULL(xen_netback_dbg_root))
@@ -733,10 +735,11 @@ static void connect(struct backend_info *be)
                        be->vif->num_queues = queue_index;
                        goto err;
                }
+       }
+
 #ifdef CONFIG_DEBUG_FS
-               xenvif_debugfs_addif(queue);
+       xenvif_debugfs_addif(be->vif);
 #endif /* CONFIG_DEBUG_FS */
-       }
 
        /* Initialisation completed, tell core driver the number of
         * active queues.
index 2dcb0541012d0133a417cc18ff2cd7ec2acbee2f..5160c4eb73c2e3788cd310693be3969b88642ea9 100644 (file)
@@ -9,7 +9,8 @@ menu "Device Tree and Open Firmware support"
 
 config OF_SELFTEST
        bool "Device Tree Runtime self tests"
-       depends on OF_IRQ
+       depends on OF_IRQ && OF_EARLY_FLATTREE
+       select OF_DYNAMIC
        help
          This option builds in test cases for the device tree infrastructure
          that are executed once at boot time, and the results dumped to the
index 099b1fb00af4ac6fd09be4341919cfeb84e45634..2b6a7b129d10b2092cb9e891158908c86c82fcfa 100644 (file)
@@ -1,11 +1,13 @@
 obj-y = base.o device.o platform.o
+obj-$(CONFIG_OF_DYNAMIC) += dynamic.o
 obj-$(CONFIG_OF_FLATTREE) += fdt.o
 obj-$(CONFIG_OF_EARLY_FLATTREE) += fdt_address.o
 obj-$(CONFIG_OF_PROMTREE) += pdt.o
 obj-$(CONFIG_OF_ADDRESS)  += address.o
 obj-$(CONFIG_OF_IRQ)    += irq.o
 obj-$(CONFIG_OF_NET)   += of_net.o
-obj-$(CONFIG_OF_SELFTEST) += selftest.o
+obj-$(CONFIG_OF_SELFTEST) += of_selftest.o
+of_selftest-objs := selftest.o testcase-data/testcases.dtb.o
 obj-$(CONFIG_OF_MDIO)  += of_mdio.o
 obj-$(CONFIG_OF_PCI)   += of_pci.o
 obj-$(CONFIG_OF_PCI_IRQ)  += of_pci_irq.o
index b9864806e9b811a0c3cc3b0e16404a19fad271eb..d8574adf0d62d446c02011970096e74c921ec6f1 100644 (file)
@@ -17,6 +17,7 @@
  *      as published by the Free Software Foundation; either version
  *      2 of the License, or (at your option) any later version.
  */
+#include <linux/console.h>
 #include <linux/ctype.h>
 #include <linux/cpu.h>
 #include <linux/module.h>
@@ -35,15 +36,17 @@ struct device_node *of_allnodes;
 EXPORT_SYMBOL(of_allnodes);
 struct device_node *of_chosen;
 struct device_node *of_aliases;
-static struct device_node *of_stdout;
+struct device_node *of_stdout;
 
-static struct kset *of_kset;
+struct kset *of_kset;
 
 /*
- * Used to protect the of_aliases; but also overloaded to hold off addition of
- * nodes to sysfs
+ * Used to protect the of_aliases, to hold off addition of nodes to sysfs.
+ * This mutex must be held whenever modifications are being made to the
+ * device tree. The of_{attach,detach}_node() and
+ * of_{add,remove,update}_property() helpers make sure this happens.
  */
-DEFINE_MUTEX(of_aliases_mutex);
+DEFINE_MUTEX(of_mutex);
 
 /* use when traversing tree through the allnext, child, sibling,
  * or parent members of struct device_node.
@@ -89,79 +92,7 @@ int __weak of_node_to_nid(struct device_node *np)
 }
 #endif
 
-#if defined(CONFIG_OF_DYNAMIC)
-/**
- *     of_node_get - Increment refcount of a node
- *     @node:  Node to inc refcount, NULL is supported to
- *             simplify writing of callers
- *
- *     Returns node.
- */
-struct device_node *of_node_get(struct device_node *node)
-{
-       if (node)
-               kobject_get(&node->kobj);
-       return node;
-}
-EXPORT_SYMBOL(of_node_get);
-
-static inline struct device_node *kobj_to_device_node(struct kobject *kobj)
-{
-       return container_of(kobj, struct device_node, kobj);
-}
-
-/**
- *     of_node_release - release a dynamically allocated node
- *     @kref:  kref element of the node to be released
- *
- *     In of_node_put() this function is passed to kref_put()
- *     as the destructor.
- */
-static void of_node_release(struct kobject *kobj)
-{
-       struct device_node *node = kobj_to_device_node(kobj);
-       struct property *prop = node->properties;
-
-       /* We should never be releasing nodes that haven't been detached. */
-       if (!of_node_check_flag(node, OF_DETACHED)) {
-               pr_err("ERROR: Bad of_node_put() on %s\n", node->full_name);
-               dump_stack();
-               return;
-       }
-
-       if (!of_node_check_flag(node, OF_DYNAMIC))
-               return;
-
-       while (prop) {
-               struct property *next = prop->next;
-               kfree(prop->name);
-               kfree(prop->value);
-               kfree(prop);
-               prop = next;
-
-               if (!prop) {
-                       prop = node->deadprops;
-                       node->deadprops = NULL;
-               }
-       }
-       kfree(node->full_name);
-       kfree(node->data);
-       kfree(node);
-}
-
-/**
- *     of_node_put - Decrement refcount of a node
- *     @node:  Node to dec refcount, NULL is supported to
- *             simplify writing of callers
- *
- */
-void of_node_put(struct device_node *node)
-{
-       if (node)
-               kobject_put(&node->kobj);
-}
-EXPORT_SYMBOL(of_node_put);
-#else
+#ifndef CONFIG_OF_DYNAMIC
 static void of_node_release(struct kobject *kobj)
 {
        /* Without CONFIG_OF_DYNAMIC, no nodes gets freed */
@@ -200,13 +131,16 @@ static const char *safe_name(struct kobject *kobj, const char *orig_name)
        return name;
 }
 
-static int __of_add_property_sysfs(struct device_node *np, struct property *pp)
+int __of_add_property_sysfs(struct device_node *np, struct property *pp)
 {
        int rc;
 
        /* Important: Don't leak passwords */
        bool secure = strncmp(pp->name, "security-", 9) == 0;
 
+       if (!of_kset || !of_node_is_attached(np))
+               return 0;
+
        sysfs_bin_attr_init(&pp->attr);
        pp->attr.attr.name = safe_name(&np->kobj, pp->name);
        pp->attr.attr.mode = secure ? S_IRUSR : S_IRUGO;
@@ -218,12 +152,15 @@ static int __of_add_property_sysfs(struct device_node *np, struct property *pp)
        return rc;
 }
 
-static int __of_node_add(struct device_node *np)
+int __of_attach_node_sysfs(struct device_node *np)
 {
        const char *name;
        struct property *pp;
        int rc;
 
+       if (!of_kset)
+               return 0;
+
        np->kobj.kset = of_kset;
        if (!np->parent) {
                /* Nodes without parents are new top level trees */
@@ -245,59 +182,20 @@ static int __of_node_add(struct device_node *np)
        return 0;
 }
 
-int of_node_add(struct device_node *np)
-{
-       int rc = 0;
-
-       BUG_ON(!of_node_is_initialized(np));
-
-       /*
-        * Grab the mutex here so that in a race condition between of_init() and
-        * of_node_add(), node addition will still be consistent.
-        */
-       mutex_lock(&of_aliases_mutex);
-       if (of_kset)
-               rc = __of_node_add(np);
-       else
-               /* This scenario may be perfectly valid, but report it anyway */
-               pr_info("of_node_add(%s) before of_init()\n", np->full_name);
-       mutex_unlock(&of_aliases_mutex);
-       return rc;
-}
-
-#if defined(CONFIG_OF_DYNAMIC)
-static void of_node_remove(struct device_node *np)
-{
-       struct property *pp;
-
-       BUG_ON(!of_node_is_initialized(np));
-
-       /* only remove properties if on sysfs */
-       if (of_node_is_attached(np)) {
-               for_each_property_of_node(np, pp)
-                       sysfs_remove_bin_file(&np->kobj, &pp->attr);
-               kobject_del(&np->kobj);
-       }
-
-       /* finally remove the kobj_init ref */
-       of_node_put(np);
-}
-#endif
-
 static int __init of_init(void)
 {
        struct device_node *np;
 
        /* Create the kset, and register existing nodes */
-       mutex_lock(&of_aliases_mutex);
+       mutex_lock(&of_mutex);
        of_kset = kset_create_and_add("devicetree", NULL, firmware_kobj);
        if (!of_kset) {
-               mutex_unlock(&of_aliases_mutex);
+               mutex_unlock(&of_mutex);
                return -ENOMEM;
        }
        for_each_of_allnodes(np)
-               __of_node_add(np);
-       mutex_unlock(&of_aliases_mutex);
+               __of_attach_node_sysfs(np);
+       mutex_unlock(&of_mutex);
 
        /* Symlink in /proc as required by userspace ABI */
        if (of_allnodes)
@@ -369,8 +267,8 @@ EXPORT_SYMBOL(of_find_all_nodes);
  * Find a property with a given name for a given node
  * and return the value.
  */
-static const void *__of_get_property(const struct device_node *np,
-                                    const char *name, int *lenp)
+const void *__of_get_property(const struct device_node *np,
+                             const char *name, int *lenp)
 {
        struct property *pp = __of_find_property(np, name, lenp);
 
@@ -1748,32 +1646,10 @@ int of_count_phandle_with_args(const struct device_node *np, const char *list_na
 }
 EXPORT_SYMBOL(of_count_phandle_with_args);
 
-#if defined(CONFIG_OF_DYNAMIC)
-static int of_property_notify(int action, struct device_node *np,
-                             struct property *prop)
-{
-       struct of_prop_reconfig pr;
-
-       /* only call notifiers if the node is attached */
-       if (!of_node_is_attached(np))
-               return 0;
-
-       pr.dn = np;
-       pr.prop = prop;
-       return of_reconfig_notify(action, &pr);
-}
-#else
-static int of_property_notify(int action, struct device_node *np,
-                             struct property *prop)
-{
-       return 0;
-}
-#endif
-
 /**
  * __of_add_property - Add a property to a node without lock operations
  */
-static int __of_add_property(struct device_node *np, struct property *prop)
+int __of_add_property(struct device_node *np, struct property *prop)
 {
        struct property **next;
 
@@ -1799,22 +1675,49 @@ int of_add_property(struct device_node *np, struct property *prop)
        unsigned long flags;
        int rc;
 
-       rc = of_property_notify(OF_RECONFIG_ADD_PROPERTY, np, prop);
-       if (rc)
-               return rc;
+       mutex_lock(&of_mutex);
 
        raw_spin_lock_irqsave(&devtree_lock, flags);
        rc = __of_add_property(np, prop);
        raw_spin_unlock_irqrestore(&devtree_lock, flags);
-       if (rc)
-               return rc;
 
-       if (of_node_is_attached(np))
+       if (!rc)
                __of_add_property_sysfs(np, prop);
 
+       mutex_unlock(&of_mutex);
+
+       if (!rc)
+               of_property_notify(OF_RECONFIG_ADD_PROPERTY, np, prop, NULL);
+
        return rc;
 }
 
+int __of_remove_property(struct device_node *np, struct property *prop)
+{
+       struct property **next;
+
+       for (next = &np->properties; *next; next = &(*next)->next) {
+               if (*next == prop)
+                       break;
+       }
+       if (*next == NULL)
+               return -ENODEV;
+
+       /* found the node */
+       *next = prop->next;
+       prop->next = np->deadprops;
+       np->deadprops = prop;
+
+       return 0;
+}
+
+void __of_remove_property_sysfs(struct device_node *np, struct property *prop)
+{
+       /* at early boot, bail here and defer setup to of_init() */
+       if (of_kset && of_node_is_attached(np))
+               sysfs_remove_bin_file(&np->kobj, &prop->attr);
+}
+
 /**
  * of_remove_property - Remove a property from a node.
  *
@@ -1825,211 +1728,98 @@ int of_add_property(struct device_node *np, struct property *prop)
  */
 int of_remove_property(struct device_node *np, struct property *prop)
 {
-       struct property **next;
        unsigned long flags;
-       int found = 0;
        int rc;
 
-       rc = of_property_notify(OF_RECONFIG_REMOVE_PROPERTY, np, prop);
-       if (rc)
-               return rc;
+       mutex_lock(&of_mutex);
 
        raw_spin_lock_irqsave(&devtree_lock, flags);
-       next = &np->properties;
-       while (*next) {
-               if (*next == prop) {
-                       /* found the node */
-                       *next = prop->next;
-                       prop->next = np->deadprops;
-                       np->deadprops = prop;
-                       found = 1;
-                       break;
-               }
-               next = &(*next)->next;
-       }
+       rc = __of_remove_property(np, prop);
        raw_spin_unlock_irqrestore(&devtree_lock, flags);
 
-       if (!found)
-               return -ENODEV;
+       if (!rc)
+               __of_remove_property_sysfs(np, prop);
 
-       /* at early boot, bail hear and defer setup to of_init() */
-       if (!of_kset)
-               return 0;
+       mutex_unlock(&of_mutex);
 
-       sysfs_remove_bin_file(&np->kobj, &prop->attr);
+       if (!rc)
+               of_property_notify(OF_RECONFIG_REMOVE_PROPERTY, np, prop, NULL);
 
-       return 0;
+       return rc;
 }
 
-/*
- * of_update_property - Update a property in a node, if the property does
- * not exist, add it.
- *
- * Note that we don't actually remove it, since we have given out
- * who-knows-how-many pointers to the data using get-property.
- * Instead we just move the property to the "dead properties" list,
- * and add the new property to the property list
- */
-int of_update_property(struct device_node *np, struct property *newprop)
+int __of_update_property(struct device_node *np, struct property *newprop,
+               struct property **oldpropp)
 {
        struct property **next, *oldprop;
-       unsigned long flags;
-       int rc;
-
-       rc = of_property_notify(OF_RECONFIG_UPDATE_PROPERTY, np, newprop);
-       if (rc)
-               return rc;
 
-       if (!newprop->name)
-               return -EINVAL;
+       for (next = &np->properties; *next; next = &(*next)->next) {
+               if (of_prop_cmp((*next)->name, newprop->name) == 0)
+                       break;
+       }
+       *oldpropp = oldprop = *next;
 
-       raw_spin_lock_irqsave(&devtree_lock, flags);
-       next = &np->properties;
-       oldprop = __of_find_property(np, newprop->name, NULL);
-       if (!oldprop) {
-               /* add the new node */
-               rc = __of_add_property(np, newprop);
-       } else while (*next) {
+       if (oldprop) {
                /* replace the node */
-               if (*next == oldprop) {
-                       newprop->next = oldprop->next;
-                       *next = newprop;
-                       oldprop->next = np->deadprops;
-                       np->deadprops = oldprop;
-                       break;
-               }
-               next = &(*next)->next;
+               newprop->next = oldprop->next;
+               *next = newprop;
+               oldprop->next = np->deadprops;
+               np->deadprops = oldprop;
+       } else {
+               /* new node */
+               newprop->next = NULL;
+               *next = newprop;
        }
-       raw_spin_unlock_irqrestore(&devtree_lock, flags);
-       if (rc)
-               return rc;
 
+       return 0;
+}
+
+void __of_update_property_sysfs(struct device_node *np, struct property *newprop,
+               struct property *oldprop)
+{
        /* At early boot, bail out and defer setup to of_init() */
        if (!of_kset)
-               return 0;
+               return;
 
-       /* Update the sysfs attribute */
        if (oldprop)
                sysfs_remove_bin_file(&np->kobj, &oldprop->attr);
        __of_add_property_sysfs(np, newprop);
-
-       return 0;
 }
 
-#if defined(CONFIG_OF_DYNAMIC)
 /*
- * Support for dynamic device trees.
+ * of_update_property - Update a property in a node, if the property does
+ * not exist, add it.
  *
- * On some platforms, the device tree can be manipulated at runtime.
- * The routines in this section support adding, removing and changing
- * device tree nodes.
- */
-
-static BLOCKING_NOTIFIER_HEAD(of_reconfig_chain);
-
-int of_reconfig_notifier_register(struct notifier_block *nb)
-{
-       return blocking_notifier_chain_register(&of_reconfig_chain, nb);
-}
-EXPORT_SYMBOL_GPL(of_reconfig_notifier_register);
-
-int of_reconfig_notifier_unregister(struct notifier_block *nb)
-{
-       return blocking_notifier_chain_unregister(&of_reconfig_chain, nb);
-}
-EXPORT_SYMBOL_GPL(of_reconfig_notifier_unregister);
-
-int of_reconfig_notify(unsigned long action, void *p)
-{
-       int rc;
-
-       rc = blocking_notifier_call_chain(&of_reconfig_chain, action, p);
-       return notifier_to_errno(rc);
-}
-
-/**
- * of_attach_node - Plug a device node into the tree and global list.
+ * Note that we don't actually remove it, since we have given out
+ * who-knows-how-many pointers to the data using get-property.
+ * Instead we just move the property to the "dead properties" list,
+ * and add the new property to the property list
  */
-int of_attach_node(struct device_node *np)
+int of_update_property(struct device_node *np, struct property *newprop)
 {
+       struct property *oldprop;
        unsigned long flags;
        int rc;
 
-       rc = of_reconfig_notify(OF_RECONFIG_ATTACH_NODE, np);
-       if (rc)
-               return rc;
-
-       raw_spin_lock_irqsave(&devtree_lock, flags);
-       np->sibling = np->parent->child;
-       np->allnext = np->parent->allnext;
-       np->parent->allnext = np;
-       np->parent->child = np;
-       of_node_clear_flag(np, OF_DETACHED);
-       raw_spin_unlock_irqrestore(&devtree_lock, flags);
-
-       of_node_add(np);
-       return 0;
-}
-
-/**
- * of_detach_node - "Unplug" a node from the device tree.
- *
- * The caller must hold a reference to the node.  The memory associated with
- * the node is not freed until its refcount goes to zero.
- */
-int of_detach_node(struct device_node *np)
-{
-       struct device_node *parent;
-       unsigned long flags;
-       int rc = 0;
+       if (!newprop->name)
+               return -EINVAL;
 
-       rc = of_reconfig_notify(OF_RECONFIG_DETACH_NODE, np);
-       if (rc)
-               return rc;
+       mutex_lock(&of_mutex);
 
        raw_spin_lock_irqsave(&devtree_lock, flags);
+       rc = __of_update_property(np, newprop, &oldprop);
+       raw_spin_unlock_irqrestore(&devtree_lock, flags);
 
-       if (of_node_check_flag(np, OF_DETACHED)) {
-               /* someone already detached it */
-               raw_spin_unlock_irqrestore(&devtree_lock, flags);
-               return rc;
-       }
-
-       parent = np->parent;
-       if (!parent) {
-               raw_spin_unlock_irqrestore(&devtree_lock, flags);
-               return rc;
-       }
+       if (!rc)
+               __of_update_property_sysfs(np, newprop, oldprop);
 
-       if (of_allnodes == np)
-               of_allnodes = np->allnext;
-       else {
-               struct device_node *prev;
-               for (prev = of_allnodes;
-                    prev->allnext != np;
-                    prev = prev->allnext)
-                       ;
-               prev->allnext = np->allnext;
-       }
+       mutex_unlock(&of_mutex);
 
-       if (parent->child == np)
-               parent->child = np->sibling;
-       else {
-               struct device_node *prevsib;
-               for (prevsib = np->parent->child;
-                    prevsib->sibling != np;
-                    prevsib = prevsib->sibling)
-                       ;
-               prevsib->sibling = np->sibling;
-       }
-
-       of_node_set_flag(np, OF_DETACHED);
-       raw_spin_unlock_irqrestore(&devtree_lock, flags);
+       if (!rc)
+               of_property_notify(OF_RECONFIG_UPDATE_PROPERTY, np, newprop, oldprop);
 
-       of_node_remove(np);
        return rc;
 }
-#endif /* defined(CONFIG_OF_DYNAMIC) */
 
 static void of_alias_add(struct alias_prop *ap, struct device_node *np,
                         int id, const char *stem, int stem_len)
@@ -2062,9 +1852,12 @@ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align))
                of_chosen = of_find_node_by_path("/chosen@0");
 
        if (of_chosen) {
+               /* linux,stdout-path and /aliases/stdout are for legacy compatibility */
                const char *name = of_get_property(of_chosen, "stdout-path", NULL);
                if (!name)
                        name = of_get_property(of_chosen, "linux,stdout-path", NULL);
+               if (IS_ENABLED(CONFIG_PPC) && !name)
+                       name = of_get_property(of_aliases, "stdout", NULL);
                if (name)
                        of_stdout = of_find_node_by_path(name);
        }
@@ -2122,7 +1915,7 @@ int of_alias_get_id(struct device_node *np, const char *stem)
        struct alias_prop *app;
        int id = -ENODEV;
 
-       mutex_lock(&of_aliases_mutex);
+       mutex_lock(&of_mutex);
        list_for_each_entry(app, &aliases_lookup, link) {
                if (strcmp(app->stem, stem) != 0)
                        continue;
@@ -2132,7 +1925,7 @@ int of_alias_get_id(struct device_node *np, const char *stem)
                        break;
                }
        }
-       mutex_unlock(&of_aliases_mutex);
+       mutex_unlock(&of_mutex);
 
        return id;
 }
@@ -2180,20 +1973,22 @@ const char *of_prop_next_string(struct property *prop, const char *cur)
 EXPORT_SYMBOL_GPL(of_prop_next_string);
 
 /**
- * of_device_is_stdout_path - check if a device node matches the
- *                            linux,stdout-path property
- *
- * Check if this device node matches the linux,stdout-path property
- * in the chosen node. return true if yes, false otherwise.
+ * of_console_check() - Test and setup console for DT setup
+ * @dn: Pointer to device node
+ * @name: Name to use for preferred console without index. ex. "ttyS"
+ * @index: Index to use for preferred console.
+ *
+ * Check if the given device node matches the stdout-path property in the
+ * /chosen node. If it does then register it as the preferred console and return
+ * TRUE. Otherwise return FALSE.
  */
-int of_device_is_stdout_path(struct device_node *dn)
+bool of_console_check(struct device_node *dn, char *name, int index)
 {
-       if (!of_stdout)
+       if (!dn || dn != of_stdout || console_set_on_cmdline)
                return false;
-
-       return of_stdout == dn;
+       return add_preferred_console(name, index, NULL);
 }
-EXPORT_SYMBOL_GPL(of_device_is_stdout_path);
+EXPORT_SYMBOL_GPL(of_console_check);
 
 /**
  *     of_find_next_cache_node - Find a node's subsidiary cache
index dafb9736ab9b59bae3a8cd52e1daa303ea076907..46d6c75c14040903edf9b14f2ec2030c179b2c1e 100644 (file)
@@ -160,7 +160,7 @@ void of_device_uevent(struct device *dev, struct kobj_uevent_env *env)
        add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen);
 
        seen = 0;
-       mutex_lock(&of_aliases_mutex);
+       mutex_lock(&of_mutex);
        list_for_each_entry(app, &aliases_lookup, link) {
                if (dev->of_node == app->np) {
                        add_uevent_var(env, "OF_ALIAS_%d=%s", seen,
@@ -168,7 +168,7 @@ void of_device_uevent(struct device *dev, struct kobj_uevent_env *env)
                        seen++;
                }
        }
-       mutex_unlock(&of_aliases_mutex);
+       mutex_unlock(&of_mutex);
 }
 
 int of_device_uevent_modalias(struct device *dev, struct kobj_uevent_env *env)
diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
new file mode 100644 (file)
index 0000000..54fecc4
--- /dev/null
@@ -0,0 +1,660 @@
+/*
+ * Support for dynamic device trees.
+ *
+ * On some platforms, the device tree can be manipulated at runtime.
+ * The routines in this section support adding, removing and changing
+ * device tree nodes.
+ */
+
+#include <linux/of.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/proc_fs.h>
+
+#include "of_private.h"
+
+/**
+ * of_node_get() - Take a reference on a device node
+ * @node:      Node whose refcount is incremented; may be NULL, in which case
+ *             nothing is done
+ *
+ * Returns @node unchanged.
+ */
+struct device_node *of_node_get(struct device_node *node)
+{
+       if (!node)
+               return NULL;
+
+       kobject_get(&node->kobj);
+       return node;
+}
+EXPORT_SYMBOL(of_node_get);
+
+/**
+ * of_node_put() - Drop a reference on a device node
+ * @node:      Node whose refcount is decremented; a NULL pointer is accepted
+ *             and ignored so callers need no check of their own
+ */
+void of_node_put(struct device_node *node)
+{
+       if (!node)
+               return;
+
+       kobject_put(&node->kobj);
+}
+EXPORT_SYMBOL(of_node_put);
+
+/*
+ * Undo the sysfs side of a node.
+ *
+ * When of_node_is_attached() reports true, one binary attribute per property
+ * is removed and the node's kobject is deleted. In every case one node
+ * reference is dropped afterwards. Nothing at all is done while of_kset is
+ * not set. The node must have been initialized, otherwise BUG_ON() fires.
+ */
+void __of_detach_node_sysfs(struct device_node *np)
+{
+       struct property *pp;
+
+       BUG_ON(!of_node_is_initialized(np));
+       if (!of_kset)
+               return;
+
+       /* only remove properties if on sysfs */
+       if (of_node_is_attached(np)) {
+               for_each_property_of_node(np, pp)
+                       sysfs_remove_bin_file(&np->kobj, &pp->attr);
+               kobject_del(&np->kobj);
+       }
+
+       /* finally remove the kobj_init ref */
+       of_node_put(np);
+}
+
+/* Blocking notifier chain that of_reconfig_notify() calls on tree changes */
+static BLOCKING_NOTIFIER_HEAD(of_reconfig_chain);
+
+/*
+ * Add @nb to the reconfiguration notifier chain.
+ * Returns whatever blocking_notifier_chain_register() returns.
+ */
+int of_reconfig_notifier_register(struct notifier_block *nb)
+{
+       return blocking_notifier_chain_register(&of_reconfig_chain, nb);
+}
+EXPORT_SYMBOL_GPL(of_reconfig_notifier_register);
+
+/*
+ * Remove @nb from the reconfiguration notifier chain.
+ * Returns whatever blocking_notifier_chain_unregister() returns.
+ */
+int of_reconfig_notifier_unregister(struct notifier_block *nb)
+{
+       return blocking_notifier_chain_unregister(&of_reconfig_chain, nb);
+}
+EXPORT_SYMBOL_GPL(of_reconfig_notifier_unregister);
+
+/*
+ * Run the reconfiguration notifier chain for @action with payload @p and
+ * translate the chain's result into an errno value.
+ */
+int of_reconfig_notify(unsigned long action, void *p)
+{
+       int chain_ret = blocking_notifier_call_chain(&of_reconfig_chain,
+                                                    action, p);
+
+       return notifier_to_errno(chain_ret);
+}
+
+/*
+ * Notify listeners about a property change on @np.
+ *
+ * @prop and @oldprop are packed together with the node into a
+ * struct of_prop_reconfig and handed to of_reconfig_notify(). Nodes that are
+ * not attached produce no notification and yield 0.
+ */
+int of_property_notify(int action, struct device_node *np,
+                      struct property *prop, struct property *oldprop)
+{
+       struct of_prop_reconfig payload = {
+               .dn = np,
+               .prop = prop,
+               .old_prop = oldprop,
+       };
+
+       /* detached nodes are invisible to notifier clients */
+       if (of_node_is_attached(np))
+               return of_reconfig_notify(action, &payload);
+
+       return 0;
+}
+
+/*
+ * __of_attach_node() - Link a node into its parent's child list and into the
+ * allnext chain.
+ * @np:        Node to attach; np->parent must already point at the parent
+ *
+ * Caches the "name", "device_type" and phandle properties in the node, then
+ * inserts the node at the head of the parent's child list and directly after
+ * the parent in the allnext chain, and finally clears OF_DETACHED.
+ * No lock is taken here; the callers in this file hold devtree_lock.
+ */
+void __of_attach_node(struct device_node *np)
+{
+       const __be32 *phandle;
+       int sz;
+
+       np->name = __of_get_property(np, "name", NULL) ? : "<NULL>";
+       np->type = __of_get_property(np, "device_type", NULL) ? : "<NULL>";
+
+       phandle = __of_get_property(np, "phandle", &sz);
+       if (!phandle)
+               phandle = __of_get_property(np, "linux,phandle", &sz);
+       /*
+        * IS_ENABLED() only sees Kconfig symbols through their CONFIG_ prefix;
+        * without it the test is constant false and "ibm,phandle" would never
+        * be consulted.
+        */
+       if (IS_ENABLED(CONFIG_PPC_PSERIES) && !phandle)
+               phandle = __of_get_property(np, "ibm,phandle", &sz);
+       /* a phandle shorter than one cell is treated as absent */
+       np->phandle = (phandle && (sz >= 4)) ? be32_to_cpup(phandle) : 0;
+
+       np->child = NULL;
+       np->sibling = np->parent->child;
+       np->allnext = np->parent->allnext;
+       np->parent->allnext = np;
+       np->parent->child = np;
+       of_node_clear_flag(np, OF_DETACHED);
+}
+
+/**
+ * of_attach_node() - Plug a device node into the tree and global list.
+ * @np:        Node to attach
+ *
+ * The tree is modified under devtree_lock, the sysfs entries are created
+ * under of_mutex only, and the OF_RECONFIG_ATTACH_NODE notification is sent
+ * after of_mutex has been released. The notifier result is not checked.
+ *
+ * Always returns 0.
+ */
+int of_attach_node(struct device_node *np)
+{
+       unsigned long flags;
+
+       mutex_lock(&of_mutex);
+       raw_spin_lock_irqsave(&devtree_lock, flags);
+       __of_attach_node(np);
+       raw_spin_unlock_irqrestore(&devtree_lock, flags);
+
+       __of_attach_node_sysfs(np);
+       mutex_unlock(&of_mutex);
+
+       of_reconfig_notify(OF_RECONFIG_ATTACH_NODE, np);
+
+       return 0;
+}
+
+/*
+ * __of_detach_node() - Unlink a node from the allnext chain and from its
+ * parent's child list, then flag it OF_DETACHED.
+ * @np:        Node to detach
+ *
+ * Warns and returns without changes when the node is already flagged
+ * OF_DETACHED or has no parent. No lock is taken here; the callers in this
+ * file hold devtree_lock.
+ */
+void __of_detach_node(struct device_node *np)
+{
+       struct device_node *parent;
+
+       if (WARN_ON(of_node_check_flag(np, OF_DETACHED)))
+               return;
+
+       parent = np->parent;
+       if (WARN_ON(!parent))
+               return;
+
+       /* unlink from the global allnext chain */
+       if (of_allnodes == np)
+               of_allnodes = np->allnext;
+       else {
+               struct device_node *prev;
+               /* walk to the predecessor; the loop body is intentionally empty */
+               for (prev = of_allnodes;
+                    prev->allnext != np;
+                    prev = prev->allnext)
+                       ;
+               prev->allnext = np->allnext;
+       }
+
+       /* unlink from the parent's sibling list */
+       if (parent->child == np)
+               parent->child = np->sibling;
+       else {
+               struct device_node *prevsib;
+               /* walk to the previous sibling; the loop body is intentionally empty */
+               for (prevsib = np->parent->child;
+                    prevsib->sibling != np;
+                    prevsib = prevsib->sibling)
+                       ;
+               prevsib->sibling = np->sibling;
+       }
+
+       of_node_set_flag(np, OF_DETACHED);
+}
+
+/**
+ * of_detach_node() - "Unplug" a node from the device tree.
+ * @np:        Node to detach
+ *
+ * The caller must hold a reference to the node.  The memory associated with
+ * the node is not freed until its refcount goes to zero.
+ *
+ * Always returns 0.
+ */
+int of_detach_node(struct device_node *np)
+{
+       unsigned long irqflags;
+
+       mutex_lock(&of_mutex);
+
+       /* the tree links themselves are protected by the spinlock */
+       raw_spin_lock_irqsave(&devtree_lock, irqflags);
+       __of_detach_node(np);
+       raw_spin_unlock_irqrestore(&devtree_lock, irqflags);
+
+       /* sysfs teardown happens outside the spinlock, still under of_mutex */
+       __of_detach_node_sysfs(np);
+
+       mutex_unlock(&of_mutex);
+
+       of_reconfig_notify(OF_RECONFIG_DETACH_NODE, np);
+
+       return 0;
+}
+
+/**
+ * of_node_release() - release a dynamically allocated node
+ * @kobj: kobject embedded in the node to be released
+ *
+ * Frees every property on the node's live and dead property lists, then the
+ * node's full_name, data and the node itself. A node that is not flagged
+ * OF_DETACHED is reported with a stack dump and left untouched; a node
+ * without OF_DYNAMIC is left untouched silently.
+ *
+ * NOTE(review): presumably installed as the kobject release callback so that
+ * it runs once of_node_put() drops the last reference - the registration is
+ * not visible in this file, confirm.
+ */
+void of_node_release(struct kobject *kobj)
+{
+       struct device_node *node = kobj_to_device_node(kobj);
+       struct property *prop = node->properties;
+
+       /* We should never be releasing nodes that haven't been detached. */
+       if (!of_node_check_flag(node, OF_DETACHED)) {
+               pr_err("ERROR: Bad of_node_put() on %s\n", node->full_name);
+               dump_stack();
+               return;
+       }
+
+       if (!of_node_check_flag(node, OF_DYNAMIC))
+               return;
+
+       while (prop) {
+               struct property *next = prop->next;
+               kfree(prop->name);
+               kfree(prop->value);
+               kfree(prop);
+               prop = next;
+
+               /* live list exhausted: continue once with the dead list */
+               if (!prop) {
+                       prop = node->deadprops;
+                       node->deadprops = NULL;
+               }
+       }
+       kfree(node->full_name);
+       kfree(node->data);
+       kfree(node);
+}
+
+/**
+ * __of_prop_dup - Copy a property dynamically.
+ * @prop:      Property to copy
+ * @allocflags:        Allocation flags (typically pass GFP_KERNEL)
+ *
+ * Allocates a new property structure together with private copies of the
+ * name and the value of @prop. The copy carries the OF_DYNAMIC flag so that
+ * it can be told apart from properties that were not allocated dynamically.
+ * Returns the newly allocated property or NULL on out of memory error.
+ */
+struct property *__of_prop_dup(const struct property *prop, gfp_t allocflags)
+{
+       struct property *copy = kzalloc(sizeof(*copy), allocflags);
+
+       if (!copy)
+               return NULL;
+
+       /*
+        * NOTE: A zero length value is deliberately not special-cased.
+        * For a boolean property this allocates a value of zero bytes,
+        * which works around of_get_property() being used on boolean
+        * values.
+        */
+       copy->name = kstrdup(prop->name, allocflags);
+       copy->value = kmemdup(prop->value, prop->length, allocflags);
+       copy->length = prop->length;
+
+       if (copy->name && copy->value) {
+               /* mark the property as dynamic */
+               of_property_set_flag(copy, OF_DYNAMIC);
+               return copy;
+       }
+
+       /* one of the two duplications failed: release whatever exists */
+       kfree(copy->name);
+       kfree(copy->value);
+       kfree(copy);
+       return NULL;
+}
+
+/**
+ * __of_node_alloc() - Create an empty device node dynamically.
+ * @full_name: Full name of the new device node
+ * @allocflags:        Allocation flags (typically pass GFP_KERNEL)
+ *
+ * Allocates a zeroed node with a private copy of @full_name. The node is
+ * returned initialized and flagged both OF_DYNAMIC and OF_DETACHED, ready
+ * for further modification.
+ * Returns the newly allocated node or NULL on out of memory error.
+ */
+struct device_node *__of_node_alloc(const char *full_name, gfp_t allocflags)
+{
+       struct device_node *np = kzalloc(sizeof(*np), allocflags);
+
+       if (!np)
+               return NULL;
+
+       np->full_name = kstrdup(full_name, allocflags);
+       if (!np->full_name) {
+               kfree(np);
+               return NULL;
+       }
+
+       of_node_set_flag(np, OF_DYNAMIC);
+       of_node_set_flag(np, OF_DETACHED);
+       of_node_init(np);
+
+       return np;
+}
+
+/*
+ * Unlink @ce from its changeset list, drop the node reference it holds and
+ * free the entry.
+ */
+static void __of_changeset_entry_destroy(struct of_changeset_entry *ce)
+{
+       list_del(&ce->node);
+       of_node_put(ce->np);
+       kfree(ce);
+}
+
+#ifdef DEBUG
+/* Print a one-line pr_debug() description of changeset entry @ce */
+static void __of_changeset_entry_dump(struct of_changeset_entry *ce)
+{
+       const char *label;
+
+       switch (ce->action) {
+       case OF_RECONFIG_ATTACH_NODE:
+               pr_debug("%p: %s %s\n",
+                       ce, "ATTACH_NODE    ", ce->np->full_name);
+               return;
+       case OF_RECONFIG_DETACH_NODE:
+               pr_debug("%p: %s %s\n",
+                       ce, "DETACH_NODE    ", ce->np->full_name);
+               return;
+       case OF_RECONFIG_ADD_PROPERTY:
+               label = "ADD_PROPERTY   ";
+               break;
+       case OF_RECONFIG_REMOVE_PROPERTY:
+               label = "REMOVE_PROPERTY";
+               break;
+       case OF_RECONFIG_UPDATE_PROPERTY:
+               label = "UPDATE_PROPERTY";
+               break;
+       default:
+               /* unknown actions are not dumped */
+               return;
+       }
+
+       /* the three property actions share one output format */
+       pr_debug("%p: %s %s/%s\n",
+               ce, label, ce->np->full_name, ce->prop->name);
+}
+#else
+/* Without DEBUG the dump helper compiles to nothing */
+static inline void __of_changeset_entry_dump(struct of_changeset_entry *ce)
+{
+}
+#endif
+
+/*
+ * Fill @rce with the entry that undoes @ce: attach and detach swap, add and
+ * remove swap, and an update keeps its action but exchanges the new and the
+ * old property. Any other action is copied unchanged.
+ */
+static void __of_changeset_entry_invert(struct of_changeset_entry *ce,
+                                         struct of_changeset_entry *rce)
+{
+       struct property *new_prop = ce->prop;
+       struct property *prev_prop = ce->old_prop;
+
+       /* start from a verbatim copy, then patch what differs */
+       *rce = *ce;
+
+       switch (ce->action) {
+       case OF_RECONFIG_ADD_PROPERTY:
+               rce->action = OF_RECONFIG_REMOVE_PROPERTY;
+               break;
+       case OF_RECONFIG_REMOVE_PROPERTY:
+               rce->action = OF_RECONFIG_ADD_PROPERTY;
+               break;
+       case OF_RECONFIG_UPDATE_PROPERTY:
+               rce->prop = prev_prop;
+               rce->old_prop = new_prop;
+               break;
+       case OF_RECONFIG_ATTACH_NODE:
+               rce->action = OF_RECONFIG_DETACH_NODE;
+               break;
+       case OF_RECONFIG_DETACH_NODE:
+               rce->action = OF_RECONFIG_ATTACH_NODE;
+               break;
+       }
+}
+
+/*
+ * Send the notification that belongs to changeset entry @ce. With @revert
+ * set, the notification for the inverted entry is sent instead. Notifier
+ * failures and unknown actions are only logged.
+ */
+static void __of_changeset_entry_notify(struct of_changeset_entry *ce, bool revert)
+{
+       struct of_changeset_entry inverted;
+       int err;
+
+       if (revert) {
+               __of_changeset_entry_invert(ce, &inverted);
+               ce = &inverted;
+       }
+
+       if (ce->action == OF_RECONFIG_ATTACH_NODE ||
+           ce->action == OF_RECONFIG_DETACH_NODE) {
+               err = of_reconfig_notify(ce->action, ce->np);
+       } else if (ce->action == OF_RECONFIG_ADD_PROPERTY ||
+                  ce->action == OF_RECONFIG_REMOVE_PROPERTY ||
+                  ce->action == OF_RECONFIG_UPDATE_PROPERTY) {
+               err = of_property_notify(ce->action, ce->np, ce->prop, ce->old_prop);
+       } else {
+               pr_err("%s: invalid devicetree changeset action: %i\n", __func__,
+                       (int)ce->action);
+               return;
+       }
+
+       if (err)
+               pr_err("%s: notifier error @%s\n", __func__, ce->np->full_name);
+}
+
+/*
+ * Apply a single changeset entry to the live tree.
+ *
+ * Works in two phases: the tree itself is modified with devtree_lock held,
+ * and only if that succeeded are the matching sysfs updates made after the
+ * lock is dropped. Returns 0 on success, -EINVAL for an unknown action, or
+ * the error returned by the property helper that failed.
+ */
+static int __of_changeset_entry_apply(struct of_changeset_entry *ce)
+{
+       struct property *old_prop, **propp;
+       unsigned long flags;
+       int ret = 0;
+
+       __of_changeset_entry_dump(ce);
+
+       /* phase 1: modify the tree under the spinlock */
+       raw_spin_lock_irqsave(&devtree_lock, flags);
+       switch (ce->action) {
+       case OF_RECONFIG_ATTACH_NODE:
+               __of_attach_node(ce->np);
+               break;
+       case OF_RECONFIG_DETACH_NODE:
+               __of_detach_node(ce->np);
+               break;
+       case OF_RECONFIG_ADD_PROPERTY:
+               /* If the property is in deadprops then it must be removed */
+               for (propp = &ce->np->deadprops; *propp; propp = &(*propp)->next) {
+                       if (*propp == ce->prop) {
+                               *propp = ce->prop->next;
+                               ce->prop->next = NULL;
+                               break;
+                       }
+               }
+
+               ret = __of_add_property(ce->np, ce->prop);
+               if (ret) {
+                       pr_err("%s: add_property failed @%s/%s\n",
+                               __func__, ce->np->full_name,
+                               ce->prop->name);
+                       break;
+               }
+               break;
+       case OF_RECONFIG_REMOVE_PROPERTY:
+               ret = __of_remove_property(ce->np, ce->prop);
+               if (ret) {
+                       pr_err("%s: remove_property failed @%s/%s\n",
+                               __func__, ce->np->full_name,
+                               ce->prop->name);
+                       break;
+               }
+               break;
+
+       case OF_RECONFIG_UPDATE_PROPERTY:
+               /* If the property is in deadprops then it must be removed */
+               for (propp = &ce->np->deadprops; *propp; propp = &(*propp)->next) {
+                       if (*propp == ce->prop) {
+                               *propp = ce->prop->next;
+                               ce->prop->next = NULL;
+                               break;
+                       }
+               }
+
+               /* old_prop only receives the helper's output; it is not read below */
+               ret = __of_update_property(ce->np, ce->prop, &old_prop);
+               if (ret) {
+                       pr_err("%s: update_property failed @%s/%s\n",
+                               __func__, ce->np->full_name,
+                               ce->prop->name);
+                       break;
+               }
+               break;
+       default:
+               ret = -EINVAL;
+       }
+       raw_spin_unlock_irqrestore(&devtree_lock, flags);
+
+       if (ret)
+               return ret;
+
+       /* phase 2: mirror the change in sysfs, outside the spinlock */
+       switch (ce->action) {
+       case OF_RECONFIG_ATTACH_NODE:
+               __of_attach_node_sysfs(ce->np);
+               break;
+       case OF_RECONFIG_DETACH_NODE:
+               __of_detach_node_sysfs(ce->np);
+               break;
+       case OF_RECONFIG_ADD_PROPERTY:
+               /* ignore duplicate names */
+               __of_add_property_sysfs(ce->np, ce->prop);
+               break;
+       case OF_RECONFIG_REMOVE_PROPERTY:
+               __of_remove_property_sysfs(ce->np, ce->prop);
+               break;
+       case OF_RECONFIG_UPDATE_PROPERTY:
+               __of_update_property_sysfs(ce->np, ce->prop, ce->old_prop);
+               break;
+       }
+
+       return 0;
+}
+
+/*
+ * Undo changeset entry @ce by applying its inverse to the live tree.
+ * Returns the result of applying the inverted entry.
+ */
+static inline int __of_changeset_entry_revert(struct of_changeset_entry *ce)
+{
+       struct of_changeset_entry undo;
+
+       __of_changeset_entry_invert(ce, &undo);
+
+       return __of_changeset_entry_apply(&undo);
+}
+
+/**
+ * of_changeset_init - Initialize a changeset for use
+ *
+ * @ocs:       changeset pointer
+ *
+ * Zeroes the whole structure and sets up its empty list of entries.
+ */
+void of_changeset_init(struct of_changeset *ocs)
+{
+       /* clear first: the list head initialised below lives inside *ocs */
+       memset(ocs, 0, sizeof(*ocs));
+
+       INIT_LIST_HEAD(&ocs->entries);
+}
+
+/**
+ * of_changeset_destroy - Destroy a changeset
+ *
+ * @ocs:       changeset pointer
+ *
+ * Releases every entry of the changeset, last entry first. Note that
+ * once an applied changeset is destroyed, its changes to the tree
+ * cannot be reverted.
+ */
+void of_changeset_destroy(struct of_changeset *ocs)
+{
+       struct of_changeset_entry *entry, *tmp;
+
+       /* the _safe variant is required because each entry is freed in the loop */
+       list_for_each_entry_safe_reverse(entry, tmp, &ocs->entries, node) {
+               __of_changeset_entry_destroy(entry);
+       }
+}
+
+/**
+ * of_changeset_apply - Applies a changeset
+ *
+ * @ocs:       changeset pointer
+ *
+ * Applies a changeset to the live tree.
+ * Any side-effects of live tree state changes are applied here on
+ * success, like creation/destruction of devices and side-effects
+ * like creation of sysfs properties and directories.
+ * Returns 0 on success, a negative error value in case of an error.
+ * On error the partially applied effects are reverted.
+ *
+ * NOTE(review): of_mutex is released around the notifier calls and taken
+ * again before returning, so the caller is presumably expected to hold
+ * of_mutex on entry - confirm against the callers.
+ */
+int of_changeset_apply(struct of_changeset *ocs)
+{
+       struct of_changeset_entry *ce;
+       int ret;
+
+       /* perform the rest of the work */
+       pr_debug("of_changeset: applying...\n");
+       list_for_each_entry(ce, &ocs->entries, node) {
+               ret = __of_changeset_entry_apply(ce);
+               if (ret) {
+                       pr_err("%s: Error applying changeset (%d)\n", __func__, ret);
+                       /* roll back the entries applied before the failing one */
+                       list_for_each_entry_continue_reverse(ce, &ocs->entries, node)
+                               __of_changeset_entry_revert(ce);
+                       return ret;
+               }
+       }
+       pr_debug("of_changeset: applied, emitting notifiers.\n");
+
+       /* drop the global lock while emitting notifiers */
+       mutex_unlock(&of_mutex);
+       list_for_each_entry(ce, &ocs->entries, node)
+               __of_changeset_entry_notify(ce, 0);
+       mutex_lock(&of_mutex);
+       pr_debug("of_changeset: notifiers sent.\n");
+
+       return 0;
+}
+
+/**
+ * of_changeset_revert - Reverts an applied changeset
+ *
+ * @ocs:       changeset pointer
+ *
+ * Reverts a changeset returning the state of the tree to what it
+ * was before the application.
+ * Any side-effects like creation/destruction of devices and
+ * removal of sysfs properties and directories are applied.
+ * Returns 0 on success, a negative error value in case of an error.
+ * On error the entries that were already reverted are applied again.
+ *
+ * NOTE(review): of_mutex is released around the notifier calls and taken
+ * again before returning, so the caller is presumably expected to hold
+ * of_mutex on entry - confirm against the callers.
+ */
+int of_changeset_revert(struct of_changeset *ocs)
+{
+       struct of_changeset_entry *ce;
+       int ret;
+
+       pr_debug("of_changeset: reverting...\n");
+       /* entries are undone in the opposite order of their application */
+       list_for_each_entry_reverse(ce, &ocs->entries, node) {
+               ret = __of_changeset_entry_revert(ce);
+               if (ret) {
+                       pr_err("%s: Error reverting changeset (%d)\n", __func__, ret);
+                       /* re-apply the entries reverted before the failing one */
+                       list_for_each_entry_continue(ce, &ocs->entries, node)
+                               __of_changeset_entry_apply(ce);
+                       return ret;
+               }
+       }
+       pr_debug("of_changeset: reverted, emitting notifiers.\n");
+
+       /* drop the global lock while emitting notifiers */
+       mutex_unlock(&of_mutex);
+       list_for_each_entry_reverse(ce, &ocs->entries, node)
+               __of_changeset_entry_notify(ce, 1);
+       mutex_lock(&of_mutex);
+       pr_debug("of_changeset: notifiers sent.\n");
+
+       return 0;
+}
+
+/**
+ * of_changeset_action - Record an action in a changeset
+ *
+ * @ocs:       changeset pointer
+ * @action:    action to record
+ * @np:                Pointer to device node
+ * @prop:      Pointer to property
+ *
+ * Appends a new entry to the changeset; the tree itself is not modified.
+ * The entry holds a reference on @np. The actions handled by the rest of
+ * the changeset code are:
+ * + OF_RECONFIG_ATTACH_NODE
+ * + OF_RECONFIG_DETACH_NODE,
+ * + OF_RECONFIG_ADD_PROPERTY
+ * + OF_RECONFIG_REMOVE_PROPERTY,
+ * + OF_RECONFIG_UPDATE_PROPERTY
+ * For an update with a non-NULL @prop, the property of the same name that
+ * is currently on @np is looked up and remembered as the old property.
+ * Returns 0 on success, a negative error value in case of an error.
+ */
+int of_changeset_action(struct of_changeset *ocs, unsigned long action,
+               struct device_node *np, struct property *prop)
+{
+       struct of_changeset_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+
+       if (!entry) {
+               pr_err("%s: Failed to allocate\n", __func__);
+               return -ENOMEM;
+       }
+
+       entry->action = action;
+       /* the entry keeps its own reference to the node */
+       entry->np = of_node_get(np);
+       entry->prop = prop;
+
+       if (prop && action == OF_RECONFIG_UPDATE_PROPERTY)
+               entry->old_prop = of_find_property(np, prop->name, NULL);
+
+       /* queue it behind the entries recorded so far */
+       list_add_tail(&entry->node, &ocs->entries);
+
+       return 0;
+}
index 9aa012e6ea0a6ed988150624bacc0eb2b8066c06..f46a24ffa3fe7be040d488bc49bfba912848fde1 100644 (file)
@@ -923,24 +923,24 @@ int __init early_init_dt_scan_chosen(unsigned long node, const char *uname,
 }
 
 #ifdef CONFIG_HAVE_MEMBLOCK
+#define MAX_PHYS_ADDR  ((phys_addr_t)~0)
+
 void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size)
 {
        const u64 phys_offset = __pa(PAGE_OFFSET);
        base &= PAGE_MASK;
        size &= PAGE_MASK;
 
-       if (sizeof(phys_addr_t) < sizeof(u64)) {
-               if (base > ULONG_MAX) {
-                       pr_warning("Ignoring memory block 0x%llx - 0x%llx\n",
-                                       base, base + size);
-                       return;
-               }
+       if (base > MAX_PHYS_ADDR) {
+               pr_warning("Ignoring memory block 0x%llx - 0x%llx\n",
+                               base, base + size);
+               return;
+       }
 
-               if (base + size > ULONG_MAX) {
-                       pr_warning("Ignoring memory range 0x%lx - 0x%llx\n",
-                                       ULONG_MAX, base + size);
-                       size = ULONG_MAX - base;
-               }
+       if (base + size > MAX_PHYS_ADDR) {
+               pr_warning("Ignoring memory range 0x%lx - 0x%llx\n",
+                               ULONG_MAX, base + size);
+               size = MAX_PHYS_ADDR - base;
        }
 
        if (base + size < phys_offset) {
index ff350c8fa7acc4398c088ddc5d346906df3a5efd..858e0a5d9a115fc1a53b3d324207420d2e29ff40 100644 (file)
@@ -31,6 +31,63 @@ struct alias_prop {
        char stem[0];
 };
 
-extern struct mutex of_aliases_mutex;
+extern struct mutex of_mutex;
 extern struct list_head aliases_lookup;
+extern struct kset *of_kset;
+
+
+/* Map a kobject embedded in a struct device_node back to that node */
+static inline struct device_node *kobj_to_device_node(struct kobject *kobj)
+{
+       return container_of(kobj, struct device_node, kobj);
+}
+
+/*
+ * With CONFIG_OF_DYNAMIC the notifier and release helpers are real
+ * functions; without it of_property_notify() collapses to a stub that
+ * reports success.
+ */
+#if defined(CONFIG_OF_DYNAMIC)
+extern int of_property_notify(int action, struct device_node *np,
+                             struct property *prop, struct property *old_prop);
+extern void of_node_release(struct kobject *kobj);
+#else /* CONFIG_OF_DYNAMIC */
+static inline int of_property_notify(int action, struct device_node *np,
+                                    struct property *prop, struct property *old_prop)
+{
+       return 0;
+}
+#endif /* CONFIG_OF_DYNAMIC */
+
+/**
+ * General utilities for working with live trees.
+ *
+ * All functions with two leading underscores operate
+ * without taking node references, so you either have to
+ * own the devtree lock or work on detached trees only.
+ */
+struct property *__of_prop_dup(const struct property *prop, gfp_t allocflags);
+struct device_node *__of_node_alloc(const char *full_name, gfp_t allocflags);
+
+extern const void *__of_get_property(const struct device_node *np,
+                                    const char *name, int *lenp);
+extern int __of_add_property(struct device_node *np, struct property *prop);
+extern int __of_add_property_sysfs(struct device_node *np,
+               struct property *prop);
+extern int __of_remove_property(struct device_node *np, struct property *prop);
+extern void __of_remove_property_sysfs(struct device_node *np,
+               struct property *prop);
+extern int __of_update_property(struct device_node *np,
+               struct property *newprop, struct property **oldprop);
+extern void __of_update_property_sysfs(struct device_node *np,
+               struct property *newprop, struct property *oldprop);
+
+extern void __of_attach_node(struct device_node *np);
+extern int __of_attach_node_sysfs(struct device_node *np);
+extern void __of_detach_node(struct device_node *np);
+extern void __of_detach_node_sysfs(struct device_node *np);
+
+/* iterators for transactions, used for overlays */
+/* forward iterator */
+#define for_each_transaction_entry(_oft, _te) \
+       list_for_each_entry(_te, &(_oft)->te_list, node)
+
+/* reverse iterator */
+#define for_each_transaction_entry_reverse(_oft, _te) \
+       list_for_each_entry_reverse(_te, &(_oft)->te_list, node)
+
 #endif /* _LINUX_OF_PRIVATE_H */
index 632aae8613756b6d1c433485e6ad0d075f11af66..59fb12e84e6bb8da22f62c6de25817cf3d743682 100644 (file)
@@ -206,8 +206,16 @@ void __init fdt_init_reserved_mem(void)
        for (i = 0; i < reserved_mem_count; i++) {
                struct reserved_mem *rmem = &reserved_mem[i];
                unsigned long node = rmem->fdt_node;
+               int len;
+               const __be32 *prop;
                int err = 0;
 
+               prop = of_get_flat_dt_prop(node, "phandle", &len);
+               if (!prop)
+                       prop = of_get_flat_dt_prop(node, "linux,phandle", &len);
+               if (prop)
+                       rmem->phandle = of_read_number(prop, len/4);
+
                if (rmem->size == 0)
                        err = __reserved_mem_alloc_size(node, rmem->name,
                                                 &rmem->base, &rmem->size);
@@ -215,3 +223,65 @@ void __init fdt_init_reserved_mem(void)
                        __reserved_mem_init_node(rmem);
        }
 }
+
+/*
+ * Look up the reserved_mem[] entry whose phandle equals that of @node.
+ * Returns NULL when the node has no phandle or no entry matches.
+ */
+static inline struct reserved_mem *__find_rmem(struct device_node *node)
+{
+       unsigned int idx = 0;
+
+       if (!node->phandle)
+               return NULL;
+
+       while (idx < reserved_mem_count) {
+               if (reserved_mem[idx].phandle == node->phandle)
+                       return &reserved_mem[idx];
+               idx++;
+       }
+
+       return NULL;
+}
+
+/**
+ * of_reserved_mem_device_init() - assign reserved memory region to given device
+ * @dev:       device whose "memory-region" property is resolved
+ *
+ * Resolves the first phandle of the "memory-region" device tree property,
+ * looks up the matching reserved memory region and, if that region provides
+ * a device_init operation, calls it for @dev. Silently does nothing when any
+ * of these steps finds nothing.
+ */
+void of_reserved_mem_device_init(struct device *dev)
+{
+       struct device_node *region;
+       struct reserved_mem *rmem;
+
+       region = of_parse_phandle(dev->of_node, "memory-region", 0);
+       if (!region)
+               return;
+
+       /* the node is only needed for the lookup itself */
+       rmem = __find_rmem(region);
+       of_node_put(region);
+
+       if (rmem && rmem->ops && rmem->ops->device_init) {
+               rmem->ops->device_init(rmem, dev);
+               dev_info(dev, "assigned reserved memory node %s\n", rmem->name);
+       }
+}
+
+/**
+ * of_reserved_mem_device_release() - release reserved memory device structures
+ * @dev:       device whose "memory-region" property is resolved
+ *
+ * Resolves the first phandle of the "memory-region" device tree property,
+ * looks up the matching reserved memory region and, if that region provides
+ * a device_release operation, calls it for @dev.
+ */
+void of_reserved_mem_device_release(struct device *dev)
+{
+       struct device_node *region;
+       struct reserved_mem *rmem;
+
+       region = of_parse_phandle(dev->of_node, "memory-region", 0);
+       if (!region)
+               return;
+
+       /* the node is only needed for the lookup itself */
+       rmem = __find_rmem(region);
+       of_node_put(region);
+
+       if (rmem && rmem->ops && rmem->ops->device_release)
+               rmem->ops->device_release(rmem, dev);
+}
index 500436f9be7f8257e17965e53eeb1dcf276221d3..0197725e033a6a8d667c6a9e1e593c60010b3608 100644 (file)
@@ -422,6 +422,7 @@ static int of_platform_bus_create(struct device_node *bus,
                        break;
                }
        }
+       of_node_set_flag(bus, OF_POPULATED_BUS);
        return rc;
 }
 
@@ -508,19 +509,13 @@ EXPORT_SYMBOL_GPL(of_platform_populate);
 
 static int of_platform_device_destroy(struct device *dev, void *data)
 {
-       bool *children_left = data;
-
        /* Do not touch devices not populated from the device tree */
-       if (!dev->of_node || !of_node_check_flag(dev->of_node, OF_POPULATED)) {
-               *children_left = true;
+       if (!dev->of_node || !of_node_check_flag(dev->of_node, OF_POPULATED))
                return 0;
-       }
 
-       /* Recurse, but don't touch this device if it has any children left */
-       if (of_platform_depopulate(dev) != 0) {
-               *children_left = true;
-               return 0;
-       }
+       /* Recurse for any nodes that were treated as busses */
+       if (of_node_check_flag(dev->of_node, OF_POPULATED_BUS))
+               device_for_each_child(dev, NULL, of_platform_device_destroy);
 
        if (dev->bus == &platform_bus_type)
                platform_device_unregister(to_platform_device(dev));
@@ -528,19 +523,15 @@ static int of_platform_device_destroy(struct device *dev, void *data)
        else if (dev->bus == &amba_bustype)
                amba_device_unregister(to_amba_device(dev));
 #endif
-       else {
-               *children_left = true;
-               return 0;
-       }
 
        of_node_clear_flag(dev->of_node, OF_POPULATED);
-
+       of_node_clear_flag(dev->of_node, OF_POPULATED_BUS);
        return 0;
 }
 
 /**
  * of_platform_depopulate() - Remove devices populated from device tree
- * @parent: device which childred will be removed
+ * @parent: device which children will be removed
  *
  * Complementary to of_platform_populate(), this function removes children
  * of the given device (and, recurrently, their children) that have been
@@ -550,14 +541,9 @@ static int of_platform_device_destroy(struct device *dev, void *data)
  * Returns 0 when all children devices have been removed or
  * -EBUSY when some children remained.
  */
-int of_platform_depopulate(struct device *parent)
+void of_platform_depopulate(struct device *parent)
 {
-       bool children_left = false;
-
-       device_for_each_child(parent, &children_left,
-                             of_platform_device_destroy);
-
-       return children_left ? -EBUSY : 0;
+       device_for_each_child(parent, NULL, of_platform_device_destroy);
 }
 EXPORT_SYMBOL_GPL(of_platform_depopulate);
 
index 077314eebb95c785b1f1cb8f46790f6004464ee3..d410026678334885bae9e51893fde5f9873a7831 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/of_fdt.h>
 #include <linux/of_irq.h>
 #include <linux/of_platform.h>
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/device.h>
 
+#include "of_private.h"
+
 static struct selftest_results {
        int passed;
        int failed;
 } selftest_results;
 
+#define NO_OF_NODES 2
+static struct device_node *nodes[NO_OF_NODES];
+static int last_node_index;
+
 #define selftest(result, fmt, ...) { \
        if (!(result)) { \
                selftest_results.failed++; \
@@ -266,6 +273,81 @@ static void __init of_selftest_property_match_string(void)
        selftest(rc == -EILSEQ, "unterminated string; rc=%i", rc);
 }
 
+#define propcmp(p1, p2) (((p1)->length == (p2)->length) && \
+                       (p1)->value && (p2)->value && \
+                       !memcmp((p1)->value, (p2)->value, (p1)->length) && \
+                       !strcmp((p1)->name, (p2)->name))
+/*
+ * Duplicate an empty and a non-empty property with __of_prop_dup() and check
+ * that each copy compares equal to its source. Compiled to an empty function
+ * without CONFIG_OF_DYNAMIC.
+ */
+static void __init of_selftest_property_copy(void)
+{
+#ifdef CONFIG_OF_DYNAMIC
+       struct property p1 = { .name = "p1", .length = 0, .value = "" };
+       struct property p2 = { .name = "p2", .length = 5, .value = "abcd" };
+       struct property *new;
+
+       new = __of_prop_dup(&p1, GFP_KERNEL);
+       selftest(new && propcmp(&p1, new), "empty property didn't copy correctly\n");
+       /* new may be NULL (the check above allows for it); don't deref then */
+       if (new) {
+               kfree(new->value);
+               kfree(new->name);
+               kfree(new);
+       }
+
+       new = __of_prop_dup(&p2, GFP_KERNEL);
+       selftest(new && propcmp(&p2, new), "non-empty property didn't copy correctly\n");
+       if (new) {
+               kfree(new->value);
+               kfree(new->name);
+               kfree(new);
+       }
+#endif
+}
+
+/*
+ * Build a changeset that attaches nodes n1, n2 and n2/n21, detaches
+ * "node-remove" and adds/updates/removes properties on the parent of
+ * "node-remove", then apply and revert it. Compiled to an empty function
+ * without CONFIG_OF_DYNAMIC.
+ */
+static void __init of_selftest_changeset(void)
+{
+#ifdef CONFIG_OF_DYNAMIC
+       struct property *ppadd, padd = { .name = "prop-add", .length = 0, .value = "" };
+       struct property *ppupdate, pupdate = { .name = "prop-update", .length = 5, .value = "abcd" };
+       struct property *ppremove;
+       struct device_node *n1, *n2, *n21, *nremove, *parent;
+       struct of_changeset chgset;
+
+       n1 = __of_node_alloc("/testcase-data/changeset/n1", GFP_KERNEL);
+       selftest(n1, "testcase setup failure\n");
+       n2 = __of_node_alloc("/testcase-data/changeset/n2", GFP_KERNEL);
+       selftest(n2, "testcase setup failure\n");
+       n21 = __of_node_alloc("/testcase-data/changeset/n2/n21", GFP_KERNEL);
+       selftest(n21, "testcase setup failure %p\n", n21);
+       nremove = of_find_node_by_path("/testcase-data/changeset/node-remove");
+       selftest(nremove, "testcase setup failure\n");
+       ppadd = __of_prop_dup(&padd, GFP_KERNEL);
+       selftest(ppadd, "testcase setup failure\n");
+       ppupdate = __of_prop_dup(&pupdate, GFP_KERNEL);
+       selftest(ppupdate, "testcase setup failure\n");
+
+       /*
+        * Every pointer obtained above is dereferenced or handed to the
+        * changeset below, so stop here if any part of the setup failed
+        * (each failure has already been counted by selftest()).
+        * NOTE(review): objects that were set up successfully are not
+        * released on this path - confirm the proper way to free them.
+        */
+       if (!n1 || !n2 || !n21 || !nremove || !ppadd || !ppupdate)
+               return;
+
+       parent = nremove->parent;
+       n1->parent = parent;
+       n2->parent = parent;
+       n21->parent = n2;
+       n2->child = n21;
+       ppremove = of_find_property(parent, "prop-remove", NULL);
+       selftest(ppremove, "failed to find removal prop");
+
+       of_changeset_init(&chgset);
+       selftest(!of_changeset_attach_node(&chgset, n1), "fail attach n1\n");
+       selftest(!of_changeset_attach_node(&chgset, n2), "fail attach n2\n");
+       selftest(!of_changeset_detach_node(&chgset, nremove), "fail remove node\n");
+       selftest(!of_changeset_attach_node(&chgset, n21), "fail attach n21\n");
+       selftest(!of_changeset_add_property(&chgset, parent, ppadd), "fail add prop\n");
+       selftest(!of_changeset_update_property(&chgset, parent, ppupdate), "fail update prop\n");
+       selftest(!of_changeset_remove_property(&chgset, parent, ppremove), "fail remove prop\n");
+       mutex_lock(&of_mutex);
+       selftest(!of_changeset_apply(&chgset), "apply failed\n");
+       mutex_unlock(&of_mutex);
+
+       mutex_lock(&of_mutex);
+       selftest(!of_changeset_revert(&chgset), "revert failed\n");
+       mutex_unlock(&of_mutex);
+
+       of_changeset_destroy(&chgset);
+#endif
+}
+
 static void __init of_selftest_parse_interrupts(void)
 {
        struct device_node *np;
@@ -517,9 +599,156 @@ static void __init of_selftest_platform_populate(void)
        }
 }
 
+/**
+ *     update_node_properties - adds the properties
+ *     of np into dup node (present in live tree) and
+ *     updates parent of children of np to dup.
+ *
+ *     @np:    node whose properties are added to dup
+ *     @dup:   node present in live tree to be updated
+ */
+static void update_node_properties(struct device_node *np,
+                                       struct device_node *dup)
+{
+       struct property *prop, *next_prop;
+       struct device_node *child;
+
+       /*
+        * of_add_property() links @prop into @dup's property list and
+        * thereby rewrites prop->next, so the successor in @np's list has
+        * to be fetched before the call; otherwise the walk ends after the
+        * first property.
+        */
+       for (prop = np->properties; prop; prop = next_prop) {
+               next_prop = prop->next;
+               of_add_property(dup, prop);
+       }
+
+       for_each_child_of_node(np, child)
+               child->parent = dup;
+}
+
+/**
+ *     attach_node_and_children - attaches nodes
+ *     and its children to live tree
+ *
+ *     @np:    Node to attach to live tree
+ *
+ *     The children of @np (the root itself is skipped) are recorded in
+ *     nodes[] so that selftest_data_remove() can find them again. Nodes
+ *     whose path already exists in the live tree only donate their
+ *     properties; all others are attached.
+ *
+ *     Returns 0 on success, -ENODATA when @np is NULL and -ENOSPC when
+ *     @np has more children than nodes[] can hold.
+ */
+static int attach_node_and_children(struct device_node *np)
+{
+       struct device_node *next, *root = np, *dup;
+
+       if (!np) {
+               pr_warn("%s: No tree to attach; not running tests\n",
+                       __func__);
+               return -ENODATA;
+       }
+
+
+       /* skip root node */
+       np = np->child;
+       /* storing a copy in temporary node */
+       dup = np;
+
+       while (dup) {
+               /* nodes[] has a fixed size; never write past its end */
+               if (last_node_index >= NO_OF_NODES) {
+                       pr_warn("%s: more than %d top-level nodes; not running tests\n",
+                               __func__, NO_OF_NODES);
+                       return -ENOSPC;
+               }
+               nodes[last_node_index++] = dup;
+               dup = dup->sibling;
+       }
+       dup = NULL;
+
+       while (np) {
+               /* fetch the successor first, attaching may relink np */
+               next = np->allnext;
+               dup = of_find_node_by_path(np->full_name);
+               if (dup)
+                       update_node_properties(np, dup);
+               else {
+                       np->child = NULL;
+                       if (np->parent == root)
+                               np->parent = of_allnodes;
+                       of_attach_node(np);
+               }
+               np = next;
+       }
+
+       return 0;
+}
+
+/**
+ *     selftest_data_add - Reads, copies data from
+ *     linked tree and attaches it to the live tree
+ *
+ *     Returns 0 on success, -ENODATA when there is no testcase blob, no
+ *     live tree or no unflattened tree, -ENOMEM when the blob cannot be
+ *     copied, or the error returned by attach_node_and_children().
+ */
+static int __init selftest_data_add(void)
+{
+       void *selftest_data;
+       /* stays NULL if unflattening does not produce a tree */
+       struct device_node *selftest_data_node = NULL;
+       extern uint8_t __dtb_testcases_begin[];
+       extern uint8_t __dtb_testcases_end[];
+       const int size = __dtb_testcases_end - __dtb_testcases_begin;
+
+       if (!size || !of_allnodes) {
+               pr_warn("%s: No testcase data to attach; not running tests\n",
+                       __func__);
+               return -ENODATA;
+       }
+
+       /* creating copy */
+       selftest_data = kmemdup(__dtb_testcases_begin, size, GFP_KERNEL);
+
+       if (!selftest_data) {
+               pr_warn("%s: Failed to allocate memory for selftest_data; "
+                       "not running tests\n", __func__);
+               return -ENOMEM;
+       }
+       of_fdt_unflatten_tree(selftest_data, &selftest_data_node);
+       if (!selftest_data_node) {
+               pr_warn("%s: No tree to attach; not running tests\n",
+                       __func__);
+               return -ENODATA;
+       }
+
+       /* attach the sub-tree to live tree */
+       return attach_node_and_children(selftest_data_node);
+}
+
+/**
+ *     detach_node_and_children - detaches node
+ *     and its children from live tree
+ *
+ *     @np:    Node to detach from live tree, may be NULL
+ *
+ *     Children and following siblings of @np are detached first
+ *     (recursively), then @np itself.
+ */
+static void detach_node_and_children(struct device_node *np)
+{
+       /* callers pass np->child, which is NULL for a childless node */
+       if (!np)
+               return;
+
+       while (np->child)
+               detach_node_and_children(np->child);
+
+       while (np->sibling)
+               detach_node_and_children(np->sibling);
+
+       of_detach_node(np);
+}
+
+/**
+ *     selftest_data_remove - removes the selftest data
+ *     nodes from the live tree
+ *
+ *     Walks the nodes recorded in nodes[] from last to first. A node
+ *     whose path is "/aliases" only loses its "testcase-alias" property;
+ *     every other recorded node is detached together with its children.
+ */
+static void selftest_data_remove(void)
+{
+       struct device_node *np;
+       struct property *prop;
+
+       /*
+        * last_node_index is the number of recorded nodes, i.e. one past
+        * the last valid slot, so decrement before indexing.
+        */
+       while (last_node_index-- > 0) {
+               if (!nodes[last_node_index])
+                       continue;
+
+               np = of_find_node_by_path(nodes[last_node_index]->full_name);
+               if (!np)
+                       continue;
+
+               if (strcmp(np->full_name, "/aliases") != 0) {
+                       /* the helper dereferences its argument */
+                       if (np->child)
+                               detach_node_and_children(np->child);
+                       of_detach_node(np);
+               } else {
+                       /* look the property up instead of unlinking mid-walk */
+                       prop = of_find_property(np, "testcase-alias", NULL);
+                       if (prop)
+                               of_remove_property(np, prop);
+               }
+       }
+}
+
 static int __init of_selftest(void)
 {
        struct device_node *np;
+       int res;
+
+       /* adding data for selftest */
+       res = selftest_data_add();
+       if (res)
+               return res;
 
        np = of_find_node_by_path("/testcase-data/phandle-tests/consumer-a");
        if (!np) {
@@ -533,12 +762,18 @@ static int __init of_selftest(void)
        of_selftest_dynamic();
        of_selftest_parse_phandle_with_args();
        of_selftest_property_match_string();
+       of_selftest_property_copy();
+       of_selftest_changeset();
        of_selftest_parse_interrupts();
        of_selftest_parse_interrupts_extended();
        of_selftest_match_node();
        of_selftest_platform_populate();
        pr_info("end of selftest - %i passed, %i failed\n",
                selftest_results.passed, selftest_results.failed);
+
+       /* removing selftest data from live tree */
+       selftest_data_remove();
+
        return 0;
 }
 late_initcall(of_selftest);
diff --git a/drivers/of/testcase-data/testcases.dts b/drivers/of/testcase-data/testcases.dts
new file mode 100644 (file)
index 0000000..219ef93
--- /dev/null
@@ -0,0 +1,15 @@
+/dts-v1/;
+/ {
+       testcase-data {
+               changeset {
+                       prop-update = "hello";
+                       prop-remove = "world";
+                       node-remove {
+                       };
+               };
+       };
+};
+#include "tests-phandle.dtsi"
+#include "tests-interrupts.dtsi"
+#include "tests-match.dtsi"
+#include "tests-platform.dtsi"
diff --git a/drivers/of/testcase-data/testcases.dtsi b/drivers/of/testcase-data/testcases.dtsi
deleted file mode 100644 (file)
index 6d8d980..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "tests-phandle.dtsi"
-#include "tests-interrupts.dtsi"
-#include "tests-match.dtsi"
-#include "tests-platform.dtsi"
index 93aa29f6d39c5b3a06e699f13f72a8c960b7da34..f2945fa73d4ffeeb5010358b5cc63fce32b73a5a 100644 (file)
@@ -375,11 +375,11 @@ static void __exit cleanup_slots(void)
 
 static int __init rpaphp_init(void)
 {
-       struct device_node *dn = NULL;
+       struct device_node *dn;
 
        info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
 
-       while ((dn = of_find_node_by_name(dn, "pci")))
+       for_each_node_by_name(dn, "pci")
                rpaphp_add_slot(dn);
 
        return 0;
index 6bbc36fbd6ecc4d642de5705a2c95191d39161f6..e4603985dce372da6107664d0f513db412f28615 100644 (file)
@@ -1,6 +1,6 @@
 config SCSI_CXGB3_ISCSI
        tristate "Chelsio T3 iSCSI support"
-       depends on PCI && INET
+       depends on PCI && INET && (IPV6 || IPV6=n)
        select NETDEVICES
        select ETHERNET
        select NET_VENDOR_CHELSIO
index 16b2c7d26617db2ed9c3f2eb119fcb0bff446c2a..8c4e423037b6ebd1b019c2a76ef3453a173b7c97 100644 (file)
@@ -1,6 +1,6 @@
 config SCSI_CXGB4_ISCSI
        tristate "Chelsio T4 iSCSI support"
-       depends on PCI && INET
+       depends on PCI && INET && (IPV6 || IPV6=n)
        select NETDEVICES
        select ETHERNET
        select NET_VENDOR_CHELSIO
index 43fea2219f8324b0d6277dcab983fe5a8849b09f..ae45bd99baed72662b1c528aef97f3b49e9aeb6a 100644 (file)
@@ -472,7 +472,8 @@ static void __srp_start_tl_fail_timers(struct srp_rport *rport)
        if (delay > 0)
                queue_delayed_work(system_long_wq, &rport->reconnect_work,
                                   1UL * delay * HZ);
-       if (srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) {
+       if ((fast_io_fail_tmo >= 0 || dev_loss_tmo >= 0) &&
+           srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) {
                pr_debug("%s new state: %d\n", dev_name(&shost->shost_gendev),
                         rport->state);
                scsi_target_block(&shost->shost_gendev);
index 0419b69e270fc7158613cbd78efbddb2e0d45d01..4f485e88f60c51213b96b8ebeac110b5364364da 100644 (file)
@@ -108,55 +108,23 @@ static void disable_tx_interrupt(struct ehv_bc_data *bc)
  *
  * The byte channel to be used for the console is specified via a "stdout"
  * property in the /chosen node.
- *
- * For compatible with legacy device trees, we also look for a "stdout" alias.
  */
 static int find_console_handle(void)
 {
-       struct device_node *np, *np2;
+       struct device_node *np = of_stdout;
        const char *sprop = NULL;
        const uint32_t *iprop;
 
-       np = of_find_node_by_path("/chosen");
-       if (np)
-               sprop = of_get_property(np, "stdout-path", NULL);
-
-       if (!np || !sprop) {
-               of_node_put(np);
-               np = of_find_node_by_name(NULL, "aliases");
-               if (np)
-                       sprop = of_get_property(np, "stdout", NULL);
-       }
-
-       if (!sprop) {
-               of_node_put(np);
-               return 0;
-       }
-
        /* We don't care what the aliased node is actually called.  We only
         * care if it's compatible with "epapr,hv-byte-channel", because that
-        * indicates that it's a byte channel node.  We use a temporary
-        * variable, 'np2', because we can't release 'np' until we're done with
-        * 'sprop'.
+        * indicates that it's a byte channel node.
         */
-       np2 = of_find_node_by_path(sprop);
-       of_node_put(np);
-       np = np2;
-       if (!np) {
-               pr_warning("ehv-bc: stdout node '%s' does not exist\n", sprop);
-               return 0;
-       }
-
-       /* Is it a byte channel? */
-       if (!of_device_is_compatible(np, "epapr,hv-byte-channel")) {
-               of_node_put(np);
+       if (!np || !of_device_is_compatible(np, "epapr,hv-byte-channel"))
                return 0;
-       }
 
        stdout_irq = irq_of_parse_and_map(np, 0);
        if (stdout_irq == NO_IRQ) {
-               pr_err("ehv-bc: no 'interrupts' property in %s node\n", sprop);
-               of_node_put(np);
+               pr_err("ehv-bc: no 'interrupts' property in %s node\n", np->full_name);
                return 0;
        }
 
@@ -167,12 +135,9 @@ static int find_console_handle(void)
        if (!iprop) {
                pr_err("ehv-bc: no 'hv-handle' property in %s node\n",
                       np->name);
-               of_node_put(np);
                return 0;
        }
        stdout_bc = be32_to_cpu(*iprop);
-
-       of_node_put(np);
        return 1;
 }
 
index a585079b4b38822a03770b6d3c6ab8fd0bbea036..a2cc5f834c633836065b215a07f597a4d458fba7 100644 (file)
@@ -342,22 +342,13 @@ static void udbg_init_opal_common(void)
 
 void __init hvc_opal_init_early(void)
 {
-       struct device_node *stdout_node = NULL;
+       struct device_node *stdout_node = of_node_get(of_stdout);
        const __be32 *termno;
-       const char *name = NULL;
        const struct hv_ops *ops;
        u32 index;
 
-       /* find the boot console from /chosen/stdout */
-       if (of_chosen)
-               name = of_get_property(of_chosen, "linux,stdout-path", NULL);
-       if (name) {
-               stdout_node = of_find_node_by_path(name);
-               if (!stdout_node) {
-                       pr_err("hvc_opal: Failed to locate default console!\n");
-                       return;
-               }
-       } else {
+       /* If the console wasn't in /chosen, try /ibm,opal */
+       if (!stdout_node) {
                struct device_node *opal, *np;
 
                /* Current OPAL takeover doesn't provide the stdout
index b594abfbf21e76d58acc1df534a30f603670fb5a..5618b5fc7500e149dfd79ac2515309749f905834 100644 (file)
@@ -404,42 +404,35 @@ module_exit(hvc_vio_exit);
 
 void __init hvc_vio_init_early(void)
 {
-       struct device_node *stdout_node;
        const __be32 *termno;
        const char *name;
        const struct hv_ops *ops;
 
        /* find the boot console from /chosen/stdout */
-       if (!of_chosen)
+       if (!of_stdout)
                return;
-       name = of_get_property(of_chosen, "linux,stdout-path", NULL);
-       if (name == NULL)
-               return;
-       stdout_node = of_find_node_by_path(name);
-       if (!stdout_node)
-               return;
-       name = of_get_property(stdout_node, "name", NULL);
+       name = of_get_property(of_stdout, "name", NULL);
        if (!name) {
                printk(KERN_WARNING "stdout node missing 'name' property!\n");
-               goto out;
+               return;
        }
 
        /* Check if it's a virtual terminal */
        if (strncmp(name, "vty", 3) != 0)
-               goto out;
-       termno = of_get_property(stdout_node, "reg", NULL);
+               return;
+       termno = of_get_property(of_stdout, "reg", NULL);
        if (termno == NULL)
-               goto out;
+               return;
        hvterm_priv0.termno = of_read_number(termno, 1);
        spin_lock_init(&hvterm_priv0.buf_lock);
        hvterm_privs[0] = &hvterm_priv0;
 
        /* Check the protocol */
-       if (of_device_is_compatible(stdout_node, "hvterm1")) {
+       if (of_device_is_compatible(of_stdout, "hvterm1")) {
                hvterm_priv0.proto = HV_PROTOCOL_RAW;
                ops = &hvterm_raw_ops;
        }
-       else if (of_device_is_compatible(stdout_node, "hvterm-protocol")) {
+       else if (of_device_is_compatible(of_stdout, "hvterm-protocol")) {
                hvterm_priv0.proto = HV_PROTOCOL_HVSI;
                ops = &hvterm_hvsi_ops;
                hvsilib_init(&hvterm_priv0.hvsi, hvc_get_chars, hvc_put_chars,
@@ -447,7 +440,7 @@ void __init hvc_vio_init_early(void)
                /* HVSI, perform the handshake now */
                hvsilib_establish(&hvterm_priv0.hvsi);
        } else
-               goto out;
+               return;
        udbg_putc = udbg_hvc_putc;
        udbg_getc = udbg_hvc_getc;
        udbg_getc_poll = udbg_hvc_getc_poll;
@@ -456,14 +449,12 @@ void __init hvc_vio_init_early(void)
         * backend for HVSI, only do udbg
         */
        if (hvterm_priv0.proto == HV_PROTOCOL_HVSI)
-               goto out;
+               return;
 #endif
        /* Check whether the user has requested a different console. */
        if (!strstr(cmd_line, "console="))
                add_preferred_console("hvc", 0, NULL);
        hvc_instantiate(0, 0, ops);
-out:
-       of_node_put(stdout_node);
 }
 
 /* call this from early_init() for a working debug console on
index f7ad5b903055852fad68d2a69a2f826e5f795172..abbfedb84901731793aeddef5a98910c72520b42 100644 (file)
@@ -1653,8 +1653,7 @@ static int __init pmz_probe(void)
        /*
         * Find all escc chips in the system
         */
-       node_p = of_find_node_by_name(NULL, "escc");
-       while (node_p) {
+       for_each_node_by_name(node_p, "escc") {
                /*
                 * First get channel A/B node pointers
                 * 
@@ -1672,7 +1671,7 @@ static int __init pmz_probe(void)
                        of_node_put(node_b);
                        printk(KERN_ERR "pmac_zilog: missing node %c for escc %s\n",
                                (!node_a) ? 'a' : 'b', node_p->full_name);
-                       goto next;
+                       continue;
                }
 
                /*
@@ -1699,11 +1698,9 @@ static int __init pmz_probe(void)
                        of_node_put(node_b);
                        memset(&pmz_ports[count], 0, sizeof(struct uart_pmac_port));
                        memset(&pmz_ports[count+1], 0, sizeof(struct uart_pmac_port));
-                       goto next;
+                       continue;
                }
                count += 2;
-next:
-               node_p = of_find_node_by_name(node_p, "escc");
        }
        pmz_ports_count = count;
 
index 8bb19da01639bb51cc2bc88f40cada8f686cc5a2..29a7be47389a9339fe2c049f7d3ee2cfd6160397 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/console.h>
+#include <linux/of.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/device.h>
@@ -2611,6 +2612,8 @@ int uart_add_one_port(struct uart_driver *drv, struct uart_port *uport)
                spin_lock_init(&uport->lock);
                lockdep_set_class(&uport->lock, &port_lock_key);
        }
+       if (uport->cons && uport->dev)
+               of_console_check(uport->dev->of_node, uport->cons->name, uport->line);
 
        uart_configure_port(drv, state, uport);
 
index af7b204b921555ec980d64b09b26543d8932c890..d8c57636b9ce8750d128169e85b1c9ce3094219a 100644 (file)
@@ -8,11 +8,17 @@ config VFIO_IOMMU_SPAPR_TCE
        depends on VFIO && SPAPR_TCE_IOMMU
        default n
 
+config VFIO_SPAPR_EEH
+       tristate
+       depends on EEH && VFIO_IOMMU_SPAPR_TCE
+       default n
+
 menuconfig VFIO
        tristate "VFIO Non-Privileged userspace driver framework"
        depends on IOMMU_API
        select VFIO_IOMMU_TYPE1 if X86
        select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
+       select VFIO_SPAPR_EEH if (PPC_POWERNV || PPC_PSERIES)
        select ANON_INODES
        help
          VFIO provides a framework for secure userspace device drivers.
index 50e30bc75e855f6564bd10a35d88a4040b6ee3e7..0b035b12600a7ba6aeaa8d16367fdfbe4f78c9da 100644 (file)
@@ -1,5 +1,5 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
 obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
-obj-$(CONFIG_EEH) += vfio_spapr_eeh.o
+obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
 obj-$(CONFIG_VFIO_PCI) += pci/
index e2ee80f36e3ea2d4a845e1e7b22ce47b070dc9c8..f7825332a3251b2001645b581eec09524562fbec 100644 (file)
@@ -37,6 +37,10 @@ module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(nointxmask,
                  "Disable support for PCI 2.3 style INTx masking.  If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
 
+static DEFINE_MUTEX(driver_lock);
+
+static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
+
 static int vfio_pci_enable(struct vfio_pci_device *vdev)
 {
        struct pci_dev *pdev = vdev->pdev;
@@ -44,6 +48,9 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
        u16 cmd;
        u8 msix_pos;
 
+       /* Don't allow our initial saved state to include busmaster */
+       pci_clear_master(pdev);
+
        ret = pci_enable_device(pdev);
        if (ret)
                return ret;
@@ -99,7 +106,8 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
        struct pci_dev *pdev = vdev->pdev;
        int bar;
 
-       pci_disable_device(pdev);
+       /* Stop the device from further DMA */
+       pci_clear_master(pdev);
 
        vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
                                VFIO_IRQ_SET_ACTION_TRIGGER,
@@ -117,6 +125,8 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
                vdev->barmap[bar] = NULL;
        }
 
+       vdev->needs_reset = true;
+
        /*
         * If we have saved state, restore it.  If we can reset the device,
         * even better.  Resetting with current state seems better than
@@ -128,7 +138,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
                        __func__, dev_name(&pdev->dev));
 
                if (!vdev->reset_works)
-                       return;
+                       goto out;
 
                pci_save_state(pdev);
        }
@@ -148,46 +158,55 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
                if (ret)
                        pr_warn("%s: Failed to reset device %s (%d)\n",
                                __func__, dev_name(&pdev->dev), ret);
+               else
+                       vdev->needs_reset = false;
        }
 
        pci_restore_state(pdev);
+out:
+       pci_disable_device(pdev);
+
+       vfio_pci_try_bus_reset(vdev);
 }
 
 static void vfio_pci_release(void *device_data)
 {
        struct vfio_pci_device *vdev = device_data;
 
-       if (atomic_dec_and_test(&vdev->refcnt)) {
+       mutex_lock(&driver_lock);
+
+       if (!(--vdev->refcnt)) {
                vfio_spapr_pci_eeh_release(vdev->pdev);
                vfio_pci_disable(vdev);
        }
 
+       mutex_unlock(&driver_lock);
+
        module_put(THIS_MODULE);
 }
 
 static int vfio_pci_open(void *device_data)
 {
        struct vfio_pci_device *vdev = device_data;
-       int ret;
+       int ret = 0;
 
        if (!try_module_get(THIS_MODULE))
                return -ENODEV;
 
-       if (atomic_inc_return(&vdev->refcnt) == 1) {
+       mutex_lock(&driver_lock);
+
+       if (!vdev->refcnt) {
                ret = vfio_pci_enable(vdev);
                if (ret)
                        goto error;
 
-               ret = vfio_spapr_pci_eeh_open(vdev->pdev);
-               if (ret) {
-                       vfio_pci_disable(vdev);
-                       goto error;
-               }
+               vfio_spapr_pci_eeh_open(vdev->pdev);
        }
-
-       return 0;
+       vdev->refcnt++;
 error:
-       module_put(THIS_MODULE);
+       mutex_unlock(&driver_lock);
+       if (ret)
+               module_put(THIS_MODULE);
        return ret;
 }
 
@@ -843,7 +862,6 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        vdev->irq_type = VFIO_PCI_NUM_IRQS;
        mutex_init(&vdev->igate);
        spin_lock_init(&vdev->irqlock);
-       atomic_set(&vdev->refcnt, 0);
 
        ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
        if (ret) {
@@ -858,12 +876,15 @@ static void vfio_pci_remove(struct pci_dev *pdev)
 {
        struct vfio_pci_device *vdev;
 
+       mutex_lock(&driver_lock);
+
        vdev = vfio_del_group_dev(&pdev->dev);
-       if (!vdev)
-               return;
+       if (vdev) {
+               iommu_group_put(pdev->dev.iommu_group);
+               kfree(vdev);
+       }
 
-       iommu_group_put(pdev->dev.iommu_group);
-       kfree(vdev);
+       mutex_unlock(&driver_lock);
 }
 
 static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
@@ -906,6 +927,110 @@ static struct pci_driver vfio_pci_driver = {
        .err_handler    = &vfio_err_handlers,
 };
 
+/*
+ * Per-device callback used to decide whether a bus/slot reset is both wanted
+ * and allowed.  Devices are flagged needs_reset on release; if any affected
+ * device carries that flag, *data (a bool) is set.  The reset is only allowed
+ * when every affected device is bound to vfio-pci and unused, so -EBUSY is
+ * returned for a device bound elsewhere, without a vfio device, or with a
+ * non-zero refcnt, and 0 otherwise.  driver_lock prevents a device from being
+ * opened during the scan or unbound from vfio-pci.
+ *
+ * TODO: vfio-core considers groups to be viable even if some devices
+ * are attached to known drivers, like pci-stub or pcieport.  We can't
+ * freeze devices from being unbound to those drivers like we can
+ * here though, so it would be racy to test for them.  We also can't
+ * use device_lock() to prevent changes as that would interfere with
+ * PCI-core taking device_lock during bus reset.  For now, we require
+ * devices to be bound to vfio-pci to get a bus/slot reset on release.
+ */
+static int vfio_pci_test_bus_reset(struct pci_dev *pdev, void *data)
+{
+       struct pci_driver *bound_drv = ACCESS_ONCE(pdev->driver);
+       bool *reset_wanted = data;
+       struct vfio_pci_device *vdev;
+       struct vfio_device *device;
+       int ret = -EBUSY;
+
+       if (bound_drv != &vfio_pci_driver)
+               return -EBUSY;
+
+       device = vfio_device_get_from_dev(&pdev->dev);
+       if (!device)
+               return -EBUSY;
+
+       vdev = vfio_device_data(device);
+       if (vdev) {
+               if (vdev->needs_reset)
+                       *reset_wanted = true;
+
+               /* an open device blocks the reset */
+               if (!vdev->refcnt)
+                       ret = 0;
+       }
+
+       vfio_device_put(device);
+
+       return ret;
+}
+
+/*
+ * Per-device callback run after a successful bus/slot reset: drop the
+ * needs_reset flag of every affected device bound to vfio-pci.  Always
+ * returns 0.
+ */
+static int vfio_pci_clear_needs_reset(struct pci_dev *pdev, void *data)
+{
+       struct pci_driver *bound_drv = ACCESS_ONCE(pdev->driver);
+       struct vfio_pci_device *vdev;
+       struct vfio_device *device;
+
+       if (bound_drv != &vfio_pci_driver)
+               return 0;
+
+       device = vfio_device_get_from_dev(&pdev->dev);
+       if (!device)
+               return 0;
+
+       vdev = vfio_device_data(device);
+       if (vdev)
+               vdev->needs_reset = false;
+
+       vfio_device_put(device);
+
+       return 0;
+}
+
+
+/*
+ * Try a bus/slot reset on behalf of @vdev.  The reset is only attempted when
+ * at least one device affected by it is flagged needs_reset and all affected
+ * devices are unused (!refcnt).  Callers must hold driver_lock so that no
+ * device can be unbound from vfio-pci or opened by a user while the test and
+ * the reset are in progress.
+ */
+static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
+{
+       struct pci_dev *pdev = vdev->pdev;
+       bool reset_wanted = false;
+       bool use_slot;
+       int err;
+
+       /* the slot is probed first; the bus is only probed as a fallback */
+       use_slot = !pci_probe_reset_slot(pdev->slot);
+       if (!use_slot && pci_probe_reset_bus(pdev->bus))
+               return;
+
+       if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_test_bus_reset,
+                                         &reset_wanted, use_slot))
+               return;
+
+       if (!reset_wanted)
+               return;
+
+       err = use_slot ? pci_try_reset_slot(pdev->slot) :
+                        pci_try_reset_bus(pdev->bus);
+       if (err)
+               return;
+
+       vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_clear_needs_reset,
+                                     NULL, use_slot);
+}
+
+
 static void __exit vfio_pci_cleanup(void)
 {
        pci_unregister_driver(&vfio_pci_driver);
index 9c6d5d0f3b02db8cfb1acdd418b4a5abb44e12e5..671c17a6e6d029dfdffe5d7243150ed757e44cf4 100644 (file)
@@ -54,8 +54,9 @@ struct vfio_pci_device {
        bool                    extended_caps;
        bool                    bardirty;
        bool                    has_vga;
+       bool                    needs_reset;
        struct pci_saved_state  *pci_saved_state;
-       atomic_t                refcnt;
+       int                     refcnt;
        struct eventfd_ctx      *err_trigger;
 };
 
index f834b4ce1431b133c1fa5aa6c7f6187c420a0ad8..86dfceb9201f2503e2050dc71c8906fcd8abf281 100644 (file)
@@ -9,20 +9,27 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/module.h>
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
 #include <asm/eeh.h>
 
+#define DRIVER_VERSION "0.1"
+#define DRIVER_AUTHOR  "Gavin Shan, IBM Corporation"
+#define DRIVER_DESC    "VFIO IOMMU SPAPR EEH"
+
 /* We might build address mapping here for "fast" path later */
-int vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
+void vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
 {
-       return eeh_dev_open(pdev);
+       eeh_dev_open(pdev);
 }
+EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_open);
 
 void vfio_spapr_pci_eeh_release(struct pci_dev *pdev)
 {
        eeh_dev_release(pdev);
 }
+EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_release);
 
 long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
                                unsigned int cmd, unsigned long arg)
@@ -85,3 +92,9 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
 
        return ret;
 }
+EXPORT_SYMBOL_GPL(vfio_spapr_iommu_eeh_ioctl);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
index d2633ee099d975836b1fc457cd58e323bfcc8784..b39e5000ff589e6dfdc556c0b80390005fc5718b 100644 (file)
@@ -308,6 +308,7 @@ struct bio_integrity_payload {
 
        unsigned short          bip_slab;       /* slab the bip came from */
        unsigned short          bip_vcnt;       /* # of integrity bio_vecs */
+       unsigned short          bip_max_vcnt;   /* integrity bio_vec slots */
        unsigned                bip_owns_buf:1; /* should free bip_buf */
 
        struct work_struct      bip_work;       /* I/O completion */
index 8699bcf5f0999db98a8f2a2917c284d950a12d75..518b46555b80968c3d29df956f677763fe292c51 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/bsg.h>
 #include <linux/smp.h>
 #include <linux/rcupdate.h>
+#include <linux/percpu-refcount.h>
 
 #include <asm/scatterlist.h>
 
@@ -470,6 +471,7 @@ struct request_queue {
        struct mutex            sysfs_lock;
 
        int                     bypass_depth;
+       int                     mq_freeze_depth;
 
 #if defined(CONFIG_BLK_DEV_BSG)
        bsg_job_fn              *bsg_job_fn;
@@ -483,7 +485,7 @@ struct request_queue {
 #endif
        struct rcu_head         rcu_head;
        wait_queue_head_t       mq_freeze_wq;
-       struct percpu_counter   mq_usage_counter;
+       struct percpu_ref       mq_usage_counter;
        struct list_head        all_q_node;
 
        struct blk_mq_tag_set   *tag_set;
index 3dbe9bd57a094b9b63175de44d6f9fd1022c322b..debb70d4054757e44e291064fd73d362e738b68b 100644 (file)
@@ -52,7 +52,7 @@
 #endif
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.4.3"
+#define REL_VERSION "8.4.5"
 #define API_VERSION 1
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 101
@@ -245,7 +245,7 @@ enum drbd_disk_state {
        D_DISKLESS,
        D_ATTACHING,      /* In the process of reading the meta-data */
        D_FAILED,         /* Becomes D_DISKLESS as soon as we told it the peer */
-                       /* when >= D_FAILED it is legal to access mdev->bc */
+                         /* when >= D_FAILED it is legal to access mdev->ldev */
        D_NEGOTIATING,    /* Late attaching state, we need to talk to the peer */
        D_INCONSISTENT,
        D_OUTDATED,
index 4193f5f2636c011686cb105c2bba428fc88be2c5..7b131ed8f9c6696cfb1ec8b470c0d77c95dff07e 100644 (file)
@@ -171,6 +171,10 @@ GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf,
        __flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT,      tentative)
        __flg_field_def(29,     DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF)
        /* 9: __u32_field_def(30,       DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */
+       /* 9: __str_field_def(31,     DRBD_GENLA_F_MANDATORY, name, SHARED_SECRET_MAX) */
+       /* 9: __u32_field(32,         DRBD_F_REQUIRED | DRBD_F_INVARIANT,     peer_node_id) */
+       __flg_field_def(33, 0 /* OPTIONAL */,   csums_after_crash_only, DRBD_CSUMS_AFTER_CRASH_ONLY_DEF)
+       __u32_field_def(34, 0 /* OPTIONAL */, sock_check_timeo, DRBD_SOCKET_CHECK_TIMEO_DEF)
 )
 
 GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms,
index 17e50bb00521362f0840eae9e79da52e2b34d139..8ac8c5d9a3ad08482bee1bcd0e5d4c57603436a7 100644 (file)
 #define DRBD_ALLOW_TWO_PRIMARIES_DEF   0
 #define DRBD_ALWAYS_ASBP_DEF   0
 #define DRBD_USE_RLE_DEF       1
+#define DRBD_CSUMS_AFTER_CRASH_ONLY_DEF 0
 
 #define DRBD_AL_STRIPES_MIN     1
 #define DRBD_AL_STRIPES_MAX     1024
 #define DRBD_AL_STRIPE_SIZE_MAX   16777216
 #define DRBD_AL_STRIPE_SIZE_DEF   32
 #define DRBD_AL_STRIPE_SIZE_SCALE 'k' /* kilobytes */
+
+#define DRBD_SOCKET_CHECK_TIMEO_MIN 0
+#define DRBD_SOCKET_CHECK_TIMEO_MAX DRBD_PING_TIMEO_MAX
+#define DRBD_SOCKET_CHECK_TIMEO_DEF 0
+#define DRBD_SOCKET_CHECK_TIMEO_SCALE '1'
 #endif
index c8450366c13019fe0ec0343487cd5ef6a8e2855f..379c02648ab3b36029ba459534416ebe224cc282 100644 (file)
@@ -116,6 +116,7 @@ enum {
        /* special QP and management commands */
        MLX4_CMD_CONF_SPECIAL_QP = 0x23,
        MLX4_CMD_MAD_IFC         = 0x24,
+       MLX4_CMD_MAD_DEMUX       = 0x203,
 
        /* multicast commands */
        MLX4_CMD_READ_MCG        = 0x25,
@@ -185,6 +186,12 @@ enum {
        MLX4_SET_PORT_VXLAN     = 0xB
 };
 
+enum {
+       MLX4_CMD_MAD_DEMUX_CONFIG       = 0,
+       MLX4_CMD_MAD_DEMUX_QUERY_STATE  = 1,
+       MLX4_CMD_MAD_DEMUX_QUERY_RESTR  = 2, /* Query mad demux restrictions */
+};
+
 enum {
        MLX4_CMD_WRAPPED,
        MLX4_CMD_NATIVE
index e15b1544ea836716b7e6f9588ac1ecf9d486fc55..071f6b234604cf770ebfab55e1ec61392e364a83 100644 (file)
@@ -183,6 +183,7 @@ enum {
        MLX4_DEV_CAP_FLAG2_UPDATE_QP            = 1LL <<  8,
        MLX4_DEV_CAP_FLAG2_DMFS_IPOIB           = 1LL <<  9,
        MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS       = 1LL <<  10,
+       MLX4_DEV_CAP_FLAG2_MAD_DEMUX            = 1LL <<  11,
 };
 
 enum {
@@ -273,6 +274,7 @@ enum {
        MLX4_PERM_REMOTE_WRITE  = 1 << 13,
        MLX4_PERM_ATOMIC        = 1 << 14,
        MLX4_PERM_BIND_MW       = 1 << 15,
+       MLX4_PERM_MASK          = 0xFC00
 };
 
 enum {
@@ -1254,6 +1256,21 @@ int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port);
 int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port);
 int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port,
                                 int enable);
+int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
+                      struct mlx4_mpt_entry ***mpt_entry);
+int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
+                        struct mlx4_mpt_entry **mpt_entry);
+int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry,
+                        u32 pdn);
+int mlx4_mr_hw_change_access(struct mlx4_dev *dev,
+                            struct mlx4_mpt_entry *mpt_entry,
+                            u32 access);
+void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev,
+                       struct mlx4_mpt_entry **mpt_entry);
+void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr);
+int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr,
+                           u64 iova, u64 size, int npages,
+                           int page_shift, struct mlx4_mpt_entry *mpt_entry);
 
 /* Returns true if running in low memory profile (kdump kernel) */
 static inline bool mlx4_low_memory_profile(void)
index babaea93bca646d302ae19435e468e9ba68b0042..29ce014ab42139cef63da0c6bc78b3d1ca9dd060 100644 (file)
@@ -213,6 +213,8 @@ struct dw_mci_dma_ops {
 #define DW_MCI_QUIRK_HIGHSPEED                 BIT(2)
 /* Unreliable card detection */
 #define DW_MCI_QUIRK_BROKEN_CARD_DETECTION     BIT(3)
+/* No write protect */
+#define DW_MCI_QUIRK_NO_WRITE_PROTECT          BIT(4)
 
 /* Slot level quirks */
 /* This slot has no write protect */
index 08abe9941884ebc40667ee246881b2b2bfd66f70..09ebe57d5ce9b4a810d82b7900dea772b99abf02 100644 (file)
@@ -104,9 +104,6 @@ struct sdhci_host {
 
        const struct sdhci_ops *ops;    /* Low level hw interface */
 
-       struct regulator *vmmc;         /* Power regulator (vmmc) */
-       struct regulator *vqmmc;        /* Signaling regulator (vccq) */
-
        /* Internal data */
        struct mmc_host *mmc;   /* MMC structure */
        u64 dma_mask;           /* custom DMA mask */
index 196b34c1ef4e1b3bdd54cb851110715c584b43df..6c4363b8ddc3ddba1a75d9f2396f453ae92ab5e9 100644 (file)
@@ -74,8 +74,6 @@ struct of_phandle_args {
        uint32_t args[MAX_PHANDLE_ARGS];
 };
 
-extern int of_node_add(struct device_node *node);
-
 /* initialize a node */
 extern struct kobj_type of_node_ktype;
 static inline void of_node_init(struct device_node *node)
@@ -113,6 +111,7 @@ static inline void of_node_put(struct device_node *node) { }
 extern struct device_node *of_allnodes;
 extern struct device_node *of_chosen;
 extern struct device_node *of_aliases;
+extern struct device_node *of_stdout;
 extern raw_spinlock_t devtree_lock;
 
 static inline bool of_have_populated_dt(void)
@@ -204,6 +203,7 @@ static inline unsigned long of_read_ulong(const __be32 *cell, int size)
 #define OF_DYNAMIC     1 /* node and properties were allocated via kmalloc */
 #define OF_DETACHED    2 /* node has been detached from the device tree */
 #define OF_POPULATED   3 /* device already created for the node */
+#define OF_POPULATED_BUS       4 /* of_platform_populate recursed to children of this node */
 
 #define OF_IS_DYNAMIC(x) test_bit(OF_DYNAMIC, &x->_flags)
 #define OF_MARK_DYNAMIC(x) set_bit(OF_DYNAMIC, &x->_flags)
@@ -322,6 +322,7 @@ extern int of_update_property(struct device_node *np, struct property *newprop);
 struct of_prop_reconfig {
        struct device_node      *dn;
        struct property         *prop;
+       struct property         *old_prop;
 };
 
 extern int of_reconfig_notifier_register(struct notifier_block *);
@@ -352,7 +353,7 @@ const __be32 *of_prop_next_u32(struct property *prop, const __be32 *cur,
  */
 const char *of_prop_next_string(struct property *prop, const char *cur);
 
-int of_device_is_stdout_path(struct device_node *dn);
+bool of_console_check(struct device_node *dn, char *name, int index);
 
 #else /* CONFIG_OF */
 
@@ -564,9 +565,9 @@ static inline int of_machine_is_compatible(const char *compat)
        return 0;
 }
 
-static inline int of_device_is_stdout_path(struct device_node *dn)
+static inline bool of_console_check(const struct device_node *dn, const char *name, int index)
 {
-       return 0;
+       return false;
 }
 
 static inline const __be32 *of_prop_next_u32(struct property *prop,
@@ -786,4 +787,80 @@ typedef void (*of_init_fn_1)(struct device_node *);
 #define OF_DECLARE_2(table, name, compat, fn) \
                _OF_DECLARE(table, name, compat, fn, of_init_fn_2)
 
+/**
+ * struct of_changeset_entry   - Holds a changeset entry
+ *
+ * @node:      list_head for the log list
+ * @action:    notifier action
+ * @np:                pointer to the device node affected
+ * @prop:      pointer to the property affected
+ * @old_prop:  hold a pointer to the original property
+ *
+ * Every modification of the device tree during a changeset
+ * is held in a list of of_changeset_entry structures.
+ * That way we can recover from a partial application, or we can
+ * revert the changeset
+ */
+struct of_changeset_entry {
+       struct list_head node;
+       unsigned long action;
+       struct device_node *np;
+       struct property *prop;
+       struct property *old_prop;
+};
+
+/**
+ * struct of_changeset - changeset tracker structure
+ *
+ * @entries:   list_head for the changeset entries
+ *
+ * changesets are a convenient way to apply bulk changes to the
+ * live tree. In case of an error, changes are rolled-back.
+ * changesets live on after initial application, and if not
+ * destroyed after use, they can be reverted in one single call.
+ */
+struct of_changeset {
+       struct list_head entries;
+};
+
+#ifdef CONFIG_OF_DYNAMIC
+extern void of_changeset_init(struct of_changeset *ocs);
+extern void of_changeset_destroy(struct of_changeset *ocs);
+extern int of_changeset_apply(struct of_changeset *ocs);
+extern int of_changeset_revert(struct of_changeset *ocs);
+extern int of_changeset_action(struct of_changeset *ocs,
+               unsigned long action, struct device_node *np,
+               struct property *prop);
+
+static inline int of_changeset_attach_node(struct of_changeset *ocs,
+               struct device_node *np)
+{
+       return of_changeset_action(ocs, OF_RECONFIG_ATTACH_NODE, np, NULL);
+}
+
+static inline int of_changeset_detach_node(struct of_changeset *ocs,
+               struct device_node *np)
+{
+       return of_changeset_action(ocs, OF_RECONFIG_DETACH_NODE, np, NULL);
+}
+
+static inline int of_changeset_add_property(struct of_changeset *ocs,
+               struct device_node *np, struct property *prop)
+{
+       return of_changeset_action(ocs, OF_RECONFIG_ADD_PROPERTY, np, prop);
+}
+
+static inline int of_changeset_remove_property(struct of_changeset *ocs,
+               struct device_node *np, struct property *prop)
+{
+       return of_changeset_action(ocs, OF_RECONFIG_REMOVE_PROPERTY, np, prop);
+}
+
+static inline int of_changeset_update_property(struct of_changeset *ocs,
+               struct device_node *np, struct property *prop)
+{
+       return of_changeset_action(ocs, OF_RECONFIG_UPDATE_PROPERTY, np, prop);
+}
+#endif
+
 #endif /* _LINUX_OF_H */
index d96e1badbee05b31538df9c34b1c298a1cc2ca1e..c2b0627a23175380b3b1e5b3278c1e5663491469 100644 (file)
@@ -72,7 +72,7 @@ extern int of_platform_populate(struct device_node *root,
                                const struct of_device_id *matches,
                                const struct of_dev_auxdata *lookup,
                                struct device *parent);
-extern int of_platform_depopulate(struct device *parent);
+extern void of_platform_depopulate(struct device *parent);
 #else
 static inline int of_platform_populate(struct device_node *root,
                                        const struct of_device_id *matches,
@@ -81,10 +81,7 @@ static inline int of_platform_populate(struct device_node *root,
 {
        return -ENODEV;
 }
-static inline int of_platform_depopulate(struct device *parent)
-{
-       return -ENODEV;
-}
+static inline void of_platform_depopulate(struct device *parent) { }
 #endif
 
 #endif /* _LINUX_OF_PLATFORM_H */
index 4669ddfdd5af5b10edf5fa92ea9bde133bf0b4bf..5b5efae091350a839e5b27ff5c5b5b34406e9f29 100644 (file)
@@ -8,6 +8,7 @@ struct reserved_mem_ops;
 struct reserved_mem {
        const char                      *name;
        unsigned long                   fdt_node;
+       unsigned long                   phandle;
        const struct reserved_mem_ops   *ops;
        phys_addr_t                     base;
        phys_addr_t                     size;
@@ -27,10 +28,16 @@ typedef int (*reservedmem_of_init_fn)(struct reserved_mem *rmem);
        _OF_DECLARE(reservedmem, name, compat, init, reservedmem_of_init_fn)
 
 #ifdef CONFIG_OF_RESERVED_MEM
+void of_reserved_mem_device_init(struct device *dev);
+void of_reserved_mem_device_release(struct device *dev);
+
 void fdt_init_reserved_mem(void);
 void fdt_reserved_mem_save_node(unsigned long node, const char *uname,
                               phys_addr_t base, phys_addr_t size);
 #else
+static inline void of_reserved_mem_device_init(struct device *dev) { }
+static inline void of_reserved_mem_device_release(struct device *pdev) { }
+
 static inline void fdt_init_reserved_mem(void) { }
 static inline void fdt_reserved_mem_save_node(unsigned long node,
                const char *uname, phys_addr_t base, phys_addr_t size) { }
index 2bf1b30cb5dcf9fe13d667a9c5da5dff2760260c..51e70cf25cbcb3476999372baeb605f0e96261dd 100644 (file)
@@ -28,6 +28,7 @@
  */
 #define OMAP_HSMMC_SUPPORTS_DUAL_VOLT          BIT(0)
 #define OMAP_HSMMC_BROKEN_MULTIBLOCK_READ      BIT(1)
+#define OMAP_HSMMC_SWAKEUP_MISSING             BIT(2)
 
 struct mmc_card;
 
index 0990997a5304bb13f798bd79cd96097eb17681e5..d78125f73ac4f9e3838c2449921bb69301c02261 100644 (file)
@@ -10,6 +10,9 @@
 extern const char linux_banner[];
 extern const char linux_proc_banner[];
 
+extern char *log_buf_addr_get(void);
+extern u32 log_buf_len_get(void);
+
 static inline int printk_get_level(const char *buffer)
 {
        if (buffer[0] == KERN_SOH_ASCII && buffer[1]) {
index 9cda293c867dd0c7752bf03a2ae30df044898e51..36826c0166c5f5af0d3a8e7601944f0e93946e09 100644 (file)
@@ -21,7 +21,7 @@
 #include <linux/rculist.h>
 
 struct rhash_head {
-       struct rhash_head               *next;
+       struct rhash_head __rcu         *next;
 };
 
 #define INIT_HASH_HEAD(ptr) ((ptr)->next = NULL)
@@ -97,7 +97,7 @@ u32 rhashtable_obj_hashfn(const struct rhashtable *ht, void *ptr);
 void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node, gfp_t);
 bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node, gfp_t);
 void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj,
-                            struct rhash_head **pprev, gfp_t flags);
+                            struct rhash_head __rcu **pprev, gfp_t flags);
 
 bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size);
 bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size);
@@ -117,18 +117,12 @@ void rhashtable_destroy(const struct rhashtable *ht);
 #define rht_dereference_rcu(p, ht) \
        rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht))
 
-/* Internal, use rht_obj() instead */
 #define rht_entry(ptr, type, member) container_of(ptr, type, member)
 #define rht_entry_safe(ptr, type, member) \
 ({ \
        typeof(ptr) __ptr = (ptr); \
           __ptr ? rht_entry(__ptr, type, member) : NULL; \
 })
-#define rht_entry_safe_rcu(ptr, type, member) \
-({ \
-       typeof(*ptr) __rcu *__ptr = (typeof(*ptr) __rcu __force *)ptr; \
-       __ptr ? container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member) : NULL; \
-})
 
 #define rht_next_entry_safe(pos, ht, member) \
 ({ \
@@ -205,9 +199,10 @@ void rhashtable_destroy(const struct rhashtable *ht);
  * traversal is guarded by rcu_read_lock().
  */
 #define rht_for_each_entry_rcu(pos, head, member) \
-       for (pos = rht_entry_safe_rcu(head, typeof(*(pos)), member); \
+       for (pos = rht_entry_safe(rcu_dereference_raw(head), \
+                                 typeof(*(pos)), member); \
             pos; \
-            pos = rht_entry_safe_rcu((pos)->member.next, \
-                                     typeof(*(pos)), member))
+            pos = rht_entry_safe(rcu_dereference_raw((pos)->member.next), \
+                                 typeof(*(pos)), member))
 
 #endif /* _LINUX_RHASHTABLE_H */
index 25a0fbd4b998f7572b8d8d705faf761b5c20b5fe..d3204115f15d21dd7ef3d879df2393884795b037 100644 (file)
@@ -98,16 +98,16 @@ extern int vfio_external_user_iommu_id(struct vfio_group *group);
 extern long vfio_external_check_extension(struct vfio_group *group,
                                          unsigned long arg);
 
+struct pci_dev;
 #ifdef CONFIG_EEH
-extern int vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
+extern void vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
 extern void vfio_spapr_pci_eeh_release(struct pci_dev *pdev);
 extern long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
                                       unsigned int cmd,
                                       unsigned long arg);
 #else
-static inline int vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
+static inline void vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
 {
-       return 0;
 }
 
 static inline void vfio_spapr_pci_eeh_release(struct pci_dev *pdev)
index 7a4313887568796546e5382581fa8bca773c1f27..5fbe6568c3cff7b025d4957cc28df4b77f051dc6 100644 (file)
@@ -62,6 +62,7 @@ struct inet_connection_sock_af_ops {
        void        (*addr2sockaddr)(struct sock *sk, struct sockaddr *);
        int         (*bind_conflict)(const struct sock *sk,
                                     const struct inet_bind_bucket *tb, bool relax);
+       void        (*mtu_reduced)(struct sock *sk);
 };
 
 /** inet_connection_sock - INET connection oriented sock
index 38805fa02e48c466d57d80343abf7a1a93126ee6..7f2ab72f321a4bd437800ab551a5aff3a9eab44a 100644 (file)
@@ -987,7 +987,6 @@ struct proto {
                                                struct sk_buff *skb);
 
        void            (*release_cb)(struct sock *sk);
-       void            (*mtu_reduced)(struct sock *sk);
 
        /* Keeping track of sk's, looking them up, and port selection methods. */
        void                    (*hash)(struct sock *sk);
index dafa1cbc149bc54a6e86db9cc69c2c2132f86513..590e01a476acc913464322e8e54a1858d78b3bf7 100644 (file)
@@ -417,7 +417,7 @@ void tcp_update_metrics(struct sock *sk);
 void tcp_init_metrics(struct sock *sk);
 void tcp_metrics_init(void);
 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
-                       bool paws_check);
+                       bool paws_check, bool timestamps);
 bool tcp_remember_stamp(struct sock *sk);
 bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
 void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
@@ -448,6 +448,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
  */
 
 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
+void tcp_v4_mtu_reduced(struct sock *sk);
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_create_openreq_child(struct sock *sk,
                                      struct request_sock *req,
@@ -705,8 +706,10 @@ struct tcp_skb_cb {
 #define TCPCB_SACKED_RETRANS   0x02    /* SKB retransmitted            */
 #define TCPCB_LOST             0x04    /* SKB is lost                  */
 #define TCPCB_TAGBITS          0x07    /* All tag bits                 */
+#define TCPCB_REPAIRED         0x10    /* SKB repaired (no skb_mstamp) */
 #define TCPCB_EVER_RETRANS     0x80    /* Ever retransmitted frame     */
-#define TCPCB_RETRANS          (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS)
+#define TCPCB_RETRANS          (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
+                               TCPCB_REPAIRED)
 
        __u8            ip_dsfield;     /* IPv4 tos or IPv6 dsfield     */
        /* 1 byte hole */
index 3d81b90cc31588b815167c109fec65cf50865538..9bb99e983f583d28fd81d978484022495f7744fc 100644 (file)
@@ -40,6 +40,7 @@
 #include <linux/list.h>
 
 #include <rdma/ib_verbs.h>
+#include <uapi/rdma/ib_user_mad.h>
 
 /* Management base version */
 #define IB_MGMT_BASE_VERSION                   1
@@ -355,9 +356,13 @@ typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent,
  * @hi_tid: Access layer assigned transaction ID for this client.
  *   Unsolicited MADs sent by this client will have the upper 32-bits
  *   of their TID set to this value.
+ * @flags: registration flags
  * @port_num: Port number on which QP is registered
  * @rmpp_version: If set, indicates the RMPP version used by this agent.
  */
+enum {
+       IB_MAD_USER_RMPP = IB_USER_MAD_USER_RMPP,
+};
 struct ib_mad_agent {
        struct ib_device        *device;
        struct ib_qp            *qp;
@@ -367,6 +372,7 @@ struct ib_mad_agent {
        ib_mad_snoop_handler    snoop_handler;
        void                    *context;
        u32                     hi_tid;
+       u32                     flags;
        u8                      port_num;
        u8                      rmpp_version;
 };
@@ -426,6 +432,7 @@ struct ib_mad_recv_wc {
  *   in the range from 0x30 to 0x4f. Otherwise not used.
  * @method_mask: The caller will receive unsolicited MADs for any method
  *   where @method_mask = 1.
+ *
  */
 struct ib_mad_reg_req {
        u8      mgmt_class;
@@ -451,6 +458,7 @@ struct ib_mad_reg_req {
  * @recv_handler: The completion callback routine invoked for a received
  *   MAD.
  * @context: User specified context associated with the registration.
+ * @registration_flags: Registration flags to set for this agent
  */
 struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
                                           u8 port_num,
@@ -459,7 +467,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
                                           u8 rmpp_version,
                                           ib_mad_send_handler send_handler,
                                           ib_mad_recv_handler recv_handler,
-                                          void *context);
+                                          void *context,
+                                          u32 registration_flags);
 
 enum ib_mad_snoop_flags {
        /*IB_MAD_SNOOP_POSTED_SENDS        = 1,*/
@@ -661,4 +670,11 @@ void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num);
  */
 void ib_free_send_mad(struct ib_mad_send_buf *send_buf);
 
+/**
+ * ib_mad_kernel_rmpp_agent - Returns if the agent is performing RMPP.
+ * @agent: the agent in question
+ * @return: true if agent is performing rmpp, false otherwise.
+ */
+int ib_mad_kernel_rmpp_agent(struct ib_mad_agent *agent);
+
 #endif /* IB_MAD_H */
index 7ccef342f72420a1c54a14341793a99ff59bd047..ed44cc07a7b3d8659eecf8a99aefb846b37d7f74 100644 (file)
@@ -1097,7 +1097,8 @@ struct ib_mr_attr {
 enum ib_mr_rereg_flags {
        IB_MR_REREG_TRANS       = 1,
        IB_MR_REREG_PD          = (1<<1),
-       IB_MR_REREG_ACCESS      = (1<<2)
+       IB_MR_REREG_ACCESS      = (1<<2),
+       IB_MR_REREG_SUPPORTED   = ((IB_MR_REREG_ACCESS << 1) - 1)
 };
 
 /**
@@ -1547,6 +1548,13 @@ struct ib_device {
                                                  u64 virt_addr,
                                                  int mr_access_flags,
                                                  struct ib_udata *udata);
+       int                        (*rereg_user_mr)(struct ib_mr *mr,
+                                                   int flags,
+                                                   u64 start, u64 length,
+                                                   u64 virt_addr,
+                                                   int mr_access_flags,
+                                                   struct ib_pd *pd,
+                                                   struct ib_udata *udata);
        int                        (*query_mr)(struct ib_mr *mr,
                                               struct ib_mr_attr *mr_attr);
        int                        (*dereg_mr)(struct ib_mr *mr);
index 9859355a7cf9944a0fd27726b602733b2eea243b..750e5db7c6bff10b04e7d5ee8853d244cb7fc610 100644 (file)
@@ -86,7 +86,9 @@ typedef struct sg_io_hdr
 #define SG_FLAG_MMAP_IO 4       /* request memory mapped IO */
 #define SG_FLAG_NO_DXFER 0x10000 /* no transfer of kernel buffers to/from */
                                /* user space (debug indirect IO) */
-#define SG_FLAG_Q_AT_TAIL 0x10  /* default is Q_AT_HEAD */
+/* defaults: for sg driver: Q_AT_HEAD; for block layer: Q_AT_TAIL */
+#define SG_FLAG_Q_AT_TAIL 0x10
+#define SG_FLAG_Q_AT_HEAD 0x20
 
 /* following 'info' values are "or"-ed together */
 #define SG_INFO_OK_MASK 0x1
index c9c3c044b32f060b749c63381b956fceabc64f89..981acf74b14f1fdbf00fc2c7bd915c82a9b37c3c 100644 (file)
@@ -148,11 +148,13 @@ TRACE_EVENT(bcache_read,
 );
 
 TRACE_EVENT(bcache_write,
-       TP_PROTO(struct bio *bio, bool writeback, bool bypass),
-       TP_ARGS(bio, writeback, bypass),
+       TP_PROTO(struct cache_set *c, u64 inode, struct bio *bio,
+               bool writeback, bool bypass),
+       TP_ARGS(c, inode, bio, writeback, bypass),
 
        TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
+               __array(char,           uuid,   16              )
+               __field(u64,            inode                   )
                __field(sector_t,       sector                  )
                __field(unsigned int,   nr_sector               )
                __array(char,           rwbs,   6               )
@@ -161,7 +163,8 @@ TRACE_EVENT(bcache_write,
        ),
 
        TP_fast_assign(
-               __entry->dev            = bio->bi_bdev->bd_dev;
+               memcpy(__entry->uuid, c->sb.set_uuid, 16);
+               __entry->inode          = inode;
                __entry->sector         = bio->bi_iter.bi_sector;
                __entry->nr_sector      = bio->bi_iter.bi_size >> 9;
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size);
@@ -169,8 +172,8 @@ TRACE_EVENT(bcache_write,
                __entry->bypass = bypass;
        ),
 
-       TP_printk("%d,%d  %s %llu + %u hit %u bypass %u",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
+       TP_printk("%pU inode %llu  %s %llu + %u hit %u bypass %u",
+                 __entry->uuid, __entry->inode,
                  __entry->rwbs, (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->writeback, __entry->bypass)
 );
@@ -258,9 +261,9 @@ DEFINE_EVENT(btree_node, bcache_btree_node_alloc,
        TP_ARGS(b)
 );
 
-DEFINE_EVENT(btree_node, bcache_btree_node_alloc_fail,
-       TP_PROTO(struct btree *b),
-       TP_ARGS(b)
+DEFINE_EVENT(cache_set, bcache_btree_node_alloc_fail,
+       TP_PROTO(struct cache_set *c),
+       TP_ARGS(c)
 );
 
 DEFINE_EVENT(btree_node, bcache_btree_node_free,
diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h
new file mode 100644 (file)
index 0000000..b59b065
--- /dev/null
@@ -0,0 +1,88 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM thp
+
+#if !defined(_TRACE_THP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_THP_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(hugepage_invalidate,
+
+           TP_PROTO(unsigned long addr, unsigned long pte),
+           TP_ARGS(addr, pte),
+           TP_STRUCT__entry(
+                   __field(unsigned long, addr)
+                   __field(unsigned long, pte)
+                   ),
+
+           TP_fast_assign(
+                   __entry->addr = addr;
+                   __entry->pte = pte;
+                   ),
+
+           TP_printk("hugepage invalidate at addr 0x%lx and pte = 0x%lx",
+                     __entry->addr, __entry->pte)
+);
+
+TRACE_EVENT(hugepage_set_pmd,
+
+           TP_PROTO(unsigned long addr, unsigned long pmd),
+           TP_ARGS(addr, pmd),
+           TP_STRUCT__entry(
+                   __field(unsigned long, addr)
+                   __field(unsigned long, pmd)
+                   ),
+
+           TP_fast_assign(
+                   __entry->addr = addr;
+                   __entry->pmd = pmd;
+                   ),
+
+           TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, __entry->pmd)
+);
+
+
+TRACE_EVENT(hugepage_update,
+
+           TP_PROTO(unsigned long addr, unsigned long pte, unsigned long clr, unsigned long set),
+           TP_ARGS(addr, pte, clr, set),
+           TP_STRUCT__entry(
+                   __field(unsigned long, addr)
+                   __field(unsigned long, pte)
+                   __field(unsigned long, clr)
+                   __field(unsigned long, set)
+                   ),
+
+           TP_fast_assign(
+                   __entry->addr = addr;
+                   __entry->pte = pte;
+                   __entry->clr = clr;
+                   __entry->set = set;
+
+                   ),
+
+           TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set)
+);
+TRACE_EVENT(hugepage_splitting,
+
+           TP_PROTO(unsigned long addr, unsigned long pte),
+           TP_ARGS(addr, pte),
+           TP_STRUCT__entry(
+                   __field(unsigned long, addr)
+                   __field(unsigned long, pte)
+                   ),
+
+           TP_fast_assign(
+                   __entry->addr = addr;
+                   __entry->pte = pte;
+                   ),
+
+           TP_printk("hugepage splitting at addr 0x%lx and pte = 0x%lx",
+                     __entry->addr, __entry->pte)
+);
+
+#endif /* _TRACE_THP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
index 7a12e1c0f371d0ce9ede8cc524b40851f2c8b30f..02986cf8b6f12c41f651f896f75b5cabe5f36925 100644 (file)
 #define BSG_SUB_PROTOCOL_SCSI_TRANSPORT        2
 
 /*
- * For flags member below
- * sg.h sg_io_hdr also has bits defined for it's flags member. However
- * none of these bits are implemented/used by bsg. The bits below are
- * allocated to not conflict with sg.h ones anyway.
+ * For flag constants below:
+ * sg.h sg_io_hdr also has bits defined for its flags member. These
+ * two flag values (0x10 and 0x20) have the same meaning in sg.h. For
+ * bsg the BSG_FLAG_Q_AT_HEAD flag is ignored since it is the default.
  */
-#define BSG_FLAG_Q_AT_TAIL 0x10 /* default, == 0 at this bit, is Q_AT_HEAD */
+#define BSG_FLAG_Q_AT_TAIL 0x10 /* default is Q_AT_HEAD */
+#define BSG_FLAG_Q_AT_HEAD 0x20
 
 struct sg_io_v4 {
        __s32 guard;            /* [i] 'Q' to differentiate from v3 */
index 6d8e61c48563a41a56379643585d610cff7f096a..9ad67b2675847b237bab3ab0117a3fa449f68fd4 100644 (file)
@@ -40,6 +40,7 @@
 #define VIRTIO_BLK_F_WCE       9       /* Writeback mode enabled after reset */
 #define VIRTIO_BLK_F_TOPOLOGY  10      /* Topology information is available */
 #define VIRTIO_BLK_F_CONFIG_WCE        11      /* Writeback mode available in config */
+#define VIRTIO_BLK_F_MQ                12      /* support more than one vq */
 
 #ifndef __KERNEL__
 /* Old (deprecated) name for VIRTIO_BLK_F_WCE. */
@@ -77,6 +78,10 @@ struct virtio_blk_config {
 
        /* writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) */
        __u8 wce;
+       __u8 unused;
+
+       /* number of vqs, only available when VIRTIO_BLK_F_MQ is set */
+       __u16 num_queues;
 } __attribute__((packed));
 
 /*
index d6fce1cbdb906985ee89fe8556e6a205bc36ee6e..09f809f323eaaf211d50652cec037a93bbdb86a1 100644 (file)
@@ -191,6 +191,45 @@ struct ib_user_mad_reg_req {
        __u8    rmpp_version;
 };
 
+/**
+ * ib_user_mad_reg_req2 - MAD registration request
+ *
+ * @id                 - Set by the _kernel_; used by userspace to identify the
+ *                       registered agent in future requests.
+ * @qpn                - Queue pair number; must be 0 or 1.
+ * @mgmt_class         - Indicates which management class of MADs should be
+ *                       received by the caller.  This field is only required if
+ *                       the user wishes to receive unsolicited MADs, otherwise
+ *                       it should be 0.
+ * @mgmt_class_version - Indicates which version of MADs for the given
+ *                       management class to receive.
+ * @res                - Ignored.
+ * @flags              - additional registration flags; Must be in the set of
+ *                       flags defined in IB_USER_MAD_REG_FLAGS_CAP
+ * @method_mask        - The caller wishes to receive unsolicited MADs for the
+ *                       methods whose bit(s) is(are) set.
+ * @oui                - Indicates IEEE OUI to use when mgmt_class is a vendor
+ *                       class in the range from 0x30 to 0x4f. Otherwise not
+ *                       used.
+ * @rmpp_version       - If set, indicates the RMPP version to use.
+ */
+enum {
+       IB_USER_MAD_USER_RMPP = (1 << 0),
+};
+#define IB_USER_MAD_REG_FLAGS_CAP (IB_USER_MAD_USER_RMPP)
+struct ib_user_mad_reg_req2 {
+       __u32   id;
+       __u32   qpn;
+       __u8    mgmt_class;
+       __u8    mgmt_class_version;
+       __u16   res;
+       __u32   flags;
+       __u64   method_mask[2];
+       __u32   oui;
+       __u8    rmpp_version;
+       __u8    reserved[3];
+};
+
 #define IB_IOCTL_MAGIC         0x1b
 
 #define IB_USER_MAD_REGISTER_AGENT     _IOWR(IB_IOCTL_MAGIC, 1, \
@@ -200,4 +239,7 @@ struct ib_user_mad_reg_req {
 
 #define IB_USER_MAD_ENABLE_PKEY                _IO(IB_IOCTL_MAGIC, 3)
 
+#define IB_USER_MAD_REGISTER_AGENT2     _IOWR(IB_IOCTL_MAGIC, 4, \
+                                             struct ib_user_mad_reg_req2)
+
 #endif /* IB_USER_MAD_H */
index cbfdd4ca951021854ff70843143ec8e1fabda358..26daf55ff76ead65f47801620d4697b10e17151c 100644 (file)
@@ -276,6 +276,22 @@ struct ib_uverbs_reg_mr_resp {
        __u32 rkey;
 };
 
+struct ib_uverbs_rereg_mr {
+       __u64 response;
+       __u32 mr_handle;
+       __u32 flags;
+       __u64 start;
+       __u64 length;
+       __u64 hca_va;
+       __u32 pd_handle;
+       __u32 access_flags;
+};
+
+struct ib_uverbs_rereg_mr_resp {
+       __u32 lkey;
+       __u32 rkey;
+};
+
 struct ib_uverbs_dereg_mr {
        __u32 mr_handle;
 };
index 99b80abf360afe3a4d94393df9ed52eb943d42ae..3066718eb12087ec22eb22e518a4f1756da33b21 100644 (file)
@@ -34,6 +34,7 @@
 #define RDMA_USER_CM_H
 
 #include <linux/types.h>
+#include <linux/socket.h>
 #include <linux/in6.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_user_sa.h>
index 44f9ed3dae2286a80792918b79c50870317410c4..e84c6423a2e5a2dbe80157b13f8b16d17c3e2d06 100644 (file)
@@ -268,7 +268,7 @@ config CROSS_MEMORY_ATTACH
        help
          Enabling this option adds the system calls process_vm_readv and
          process_vm_writev which allow a process with the correct privileges
-         to directly read from or write to to another process's address space.
+         to directly read from or write to another process' address space.
          See the man page for more details.
 
 config FHANDLE
index 1380d8ace334be7f7d6b49d760ed5c6013e4c7bb..0cf9cdb6e4919f254b32d04f6fdeb4943143a8d3 100644 (file)
@@ -1105,7 +1105,7 @@ static void copy_seccomp(struct task_struct *p)
         * needed because this new task is not yet running and cannot
         * be racing exec.
         */
-       BUG_ON(!spin_is_locked(&current->sighand->siglock));
+       assert_spin_locked(&current->sighand->siglock);
 
        /* Ref-count the new filter user, and assign it. */
        get_seccomp_filter(current);
index de1a6bb6861db5e92f0884e2b733cb5b6988679e..e04c455a0e3860b2aa5784d4c2522a21300525e4 100644 (file)
@@ -272,6 +272,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
 static char *log_buf = __log_buf;
 static u32 log_buf_len = __LOG_BUF_LEN;
 
+/* Return log buffer address */
+char *log_buf_addr_get(void)
+{
+       return log_buf;
+}
+
+/* Return log buffer size */
+u32 log_buf_len_get(void)
+{
+       return log_buf_len;
+}
+
 /* human readable text of the record */
 static char *log_text(const struct printk_log *msg)
 {
index 25b0043f47551319b6b7afdd9439b79c74a53e0a..44eb005c6695010e79a5041b837914a12c69aa45 100644 (file)
@@ -203,7 +203,7 @@ static u32 seccomp_run_filters(int syscall)
 
 static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
 {
-       BUG_ON(!spin_is_locked(&current->sighand->siglock));
+       assert_spin_locked(&current->sighand->siglock);
 
        if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
                return false;
@@ -214,7 +214,7 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
 static inline void seccomp_assign_mode(struct task_struct *task,
                                       unsigned long seccomp_mode)
 {
-       BUG_ON(!spin_is_locked(&task->sighand->siglock));
+       assert_spin_locked(&task->sighand->siglock);
 
        task->seccomp.mode = seccomp_mode;
        /*
@@ -253,7 +253,7 @@ static inline pid_t seccomp_can_sync_threads(void)
        struct task_struct *thread, *caller;
 
        BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
-       BUG_ON(!spin_is_locked(&current->sighand->siglock));
+       assert_spin_locked(&current->sighand->siglock);
 
        /* Validate all threads being eligible for synchronization. */
        caller = current;
@@ -294,7 +294,7 @@ static inline void seccomp_sync_threads(void)
        struct task_struct *thread, *caller;
 
        BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
-       BUG_ON(!spin_is_locked(&current->sighand->siglock));
+       assert_spin_locked(&current->sighand->siglock);
 
        /* Synchronize all threads. */
        caller = current;
@@ -464,7 +464,7 @@ static long seccomp_attach_filter(unsigned int flags,
        unsigned long total_insns;
        struct seccomp_filter *walker;
 
-       BUG_ON(!spin_is_locked(&current->sighand->siglock));
+       assert_spin_locked(&current->sighand->siglock);
 
        /* Validate resulting filter length. */
        total_insns = filter->prog->len;
index f36b02838a4772a7ddaf4064e95a27ab22a6de3f..fb4a9c2cf8d98db2256a6268fb8bbfb37673b2fe 100644 (file)
@@ -338,10 +338,11 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
 
 static inline void update_vsyscall(struct timekeeper *tk)
 {
-       struct timespec xt;
+       struct timespec xt, wm;
 
        xt = timespec64_to_timespec(tk_xtime(tk));
-       update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult,
+       wm = timespec64_to_timespec(tk->wall_to_monotonic);
+       update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult,
                            tk->tkr.cycle_last);
 }
 
index cb45f59685e69530caf6d5b11f14c2e37d01772c..07c28323f88fe99d08900d82e3eacec6ca88fb91 100644 (file)
@@ -143,6 +143,30 @@ config DEBUG_INFO_REDUCED
          DEBUG_INFO build and compile times are reduced too.
          Only works with newer gcc versions.
 
+config DEBUG_INFO_SPLIT
+       bool "Produce split debuginfo in .dwo files"
+       depends on DEBUG_INFO
+       help
+         Generate debug info into separate .dwo files. This significantly
+         reduces the build directory size for builds with DEBUG_INFO,
+         because it stores the information only once on disk in .dwo
+         files instead of multiple times in object files and executables.
+         In addition the debug information is also compressed.
+
+         Requires recent gcc (4.7+) and recent gdb/binutils.
+         Any tool that packages or reads debug information would need
+         to know about the .dwo files and include them.
+         Incompatible with older versions of ccache.
+
+config DEBUG_INFO_DWARF4
+       bool "Generate dwarf4 debuginfo"
+       depends on DEBUG_INFO
+       help
+         Generate dwarf4 debug info. This requires recent versions
+         of gcc and gdb. It makes the debug information larger.
+         But it significantly improves the success of resolving
+         variables in gdb on optimized code.
+
 config ENABLE_WARN_DEPRECATED
        bool "Enable __deprecated logic"
        default y
index 4a83ecd03650157d47ca3a68b8c6273f9dff73c8..852c81e3ba9a55bf17e2e4c3fe037c39040cc0f3 100644 (file)
@@ -169,7 +169,7 @@ out_fail:
        return NULL;
 }
 
-void lc_free_by_index(struct lru_cache *lc, unsigned i)
+static void lc_free_by_index(struct lru_cache *lc, unsigned i)
 {
        void *p = lc->lc_element[i];
        WARN_ON(!p);
@@ -643,9 +643,10 @@ void lc_set(struct lru_cache *lc, unsigned int enr, int index)
  * lc_dump - Dump a complete LRU cache to seq in textual form.
  * @lc: the lru cache to operate on
  * @seq: the &struct seq_file pointer to seq_printf into
- * @utext: user supplied "heading" or other info
+ * @utext: user supplied additional "heading" or other info
  * @detail: function pointer the user may provide to dump further details
- * of the object the lc_element is embedded in.
+ * of the object the lc_element is embedded in. May be NULL.
+ * Note: a leading space ' ' and trailing newline '\n' are implied.
  */
 void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
             void (*detail) (struct seq_file *, struct lc_element *))
@@ -654,16 +655,18 @@ void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext
        struct lc_element *e;
        int i;
 
-       seq_printf(seq, "\tnn: lc_number refcnt %s\n ", utext);
+       seq_printf(seq, "\tnn: lc_number (new nr) refcnt %s\n ", utext);
        for (i = 0; i < nr_elements; i++) {
                e = lc_element_by_index(lc, i);
-               if (e->lc_number == LC_FREE) {
-                       seq_printf(seq, "\t%2d: FREE\n", i);
-               } else {
-                       seq_printf(seq, "\t%2d: %4u %4u    ", i,
-                                  e->lc_number, e->refcnt);
+               if (e->lc_number != e->lc_new_number)
+                       seq_printf(seq, "\t%5d: %6d %8d %6d ",
+                               i, e->lc_number, e->lc_new_number, e->refcnt);
+               else
+                       seq_printf(seq, "\t%5d: %6d %-8s %6d ",
+                               i, e->lc_number, "-\"-", e->refcnt);
+               if (detail)
                        detail(seq, e);
-               }
+               seq_putc(seq, '\n');
        }
 }
 
index e6940cf16628275ad7e1a58a7d1103da31f8f7d7..a2c78810ebc1a64a95903645dec6d14eed6ba1b6 100644 (file)
@@ -38,16 +38,10 @@ int lockdep_rht_mutex_is_held(const struct rhashtable *ht)
 EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
 #endif
 
-/**
- * rht_obj - cast hash head to outer object
- * @ht:                hash table
- * @he:                hashed node
- */
-void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he)
+static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he)
 {
        return (void *) he - ht->p.head_offset;
 }
-EXPORT_SYMBOL_GPL(rht_obj);
 
 static u32 __hashfn(const struct rhashtable *ht, const void *key,
                      u32 len, u32 hsize)
@@ -386,7 +380,7 @@ EXPORT_SYMBOL_GPL(rhashtable_insert);
  * deletion when combined with walking or lookup.
  */
 void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj,
-                            struct rhash_head **pprev, gfp_t flags)
+                            struct rhash_head __rcu **pprev, gfp_t flags)
 {
        struct bucket_table *tbl = rht_dereference(ht->tbl, ht);
 
index 9aae6f47433f8a569145c4ae97e362f617784fd2..9eebfadeeee17f1f3650162dd1e34a88157ef39e 100644 (file)
@@ -275,6 +275,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
                ret = res_counter_memparse_write_strategy(buf, &val);
                if (ret)
                        break;
+               val = ALIGN(val, 1ULL << huge_page_shift(&hstates[idx]));
                ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
                break;
        default:
index 4c5b8ba0f84febeab066010f4c4bd32652e93cd9..e4853b50cf402d9606c49daf34bc849467dab391 100644 (file)
@@ -833,7 +833,6 @@ static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl,
                          loff_t *l)
 {
        struct hlist_node *e = state->node;
-       struct lec_arp_table *tmp;
 
        if (!e)
                e = tbl->first;
@@ -842,9 +841,7 @@ static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl,
                --*l;
        }
 
-       tmp = container_of(e, struct lec_arp_table, next);
-
-       hlist_for_each_entry_from(tmp, next) {
+       for (; e; e = e->next) {
                if (--*l < 0)
                        break;
        }
index d8e5d0c2ebbc2acb9a084581f5fa9f99fbd904da..1ba23f5018e76199a7fb19be5079b512822f3dab 100644 (file)
@@ -50,12 +50,12 @@ static void svc_disconnect(struct atm_vcc *vcc)
 
        pr_debug("%p\n", vcc);
        if (test_bit(ATM_VF_REGIS, &vcc->flags)) {
-               prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
                sigd_enq(vcc, as_close, NULL, NULL, NULL);
-               while (!test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) {
+               for (;;) {
+                       prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+                       if (test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd)
+                               break;
                        schedule();
-                       prepare_to_wait(sk_sleep(sk), &wait,
-                                       TASK_UNINTERRUPTIBLE);
                }
                finish_wait(sk_sleep(sk), &wait);
        }
@@ -126,11 +126,12 @@ static int svc_bind(struct socket *sock, struct sockaddr *sockaddr,
        }
        vcc->local = *addr;
        set_bit(ATM_VF_WAITING, &vcc->flags);
-       prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
        sigd_enq(vcc, as_bind, NULL, NULL, &vcc->local);
-       while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
-               schedule();
+       for (;;) {
                prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+               if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+                       break;
+               schedule();
        }
        finish_wait(sk_sleep(sk), &wait);
        clear_bit(ATM_VF_REGIS, &vcc->flags); /* doesn't count */
@@ -202,15 +203,14 @@ static int svc_connect(struct socket *sock, struct sockaddr *sockaddr,
                }
                vcc->remote = *addr;
                set_bit(ATM_VF_WAITING, &vcc->flags);
-               prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
                sigd_enq(vcc, as_connect, NULL, NULL, &vcc->remote);
                if (flags & O_NONBLOCK) {
-                       finish_wait(sk_sleep(sk), &wait);
                        sock->state = SS_CONNECTING;
                        error = -EINPROGRESS;
                        goto out;
                }
                error = 0;
+               prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
                while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
                        schedule();
                        if (!signal_pending(current)) {
@@ -297,11 +297,12 @@ static int svc_listen(struct socket *sock, int backlog)
                goto out;
        }
        set_bit(ATM_VF_WAITING, &vcc->flags);
-       prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
        sigd_enq(vcc, as_listen, NULL, NULL, &vcc->local);
-       while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
-               schedule();
+       for (;;) {
                prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+               if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+                       break;
+               schedule();
        }
        finish_wait(sk_sleep(sk), &wait);
        if (!sigd) {
@@ -387,15 +388,15 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags)
                }
                /* wait should be short, so we ignore the non-blocking flag */
                set_bit(ATM_VF_WAITING, &new_vcc->flags);
-               prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait,
-                               TASK_UNINTERRUPTIBLE);
                sigd_enq(new_vcc, as_accept, old_vcc, NULL, NULL);
-               while (test_bit(ATM_VF_WAITING, &new_vcc->flags) && sigd) {
+               for (;;) {
+                       prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait,
+                                       TASK_UNINTERRUPTIBLE);
+                       if (!test_bit(ATM_VF_WAITING, &new_vcc->flags) || !sigd)
+                               break;
                        release_sock(sk);
                        schedule();
                        lock_sock(sk);
-                       prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait,
-                                       TASK_UNINTERRUPTIBLE);
                }
                finish_wait(sk_sleep(sk_atm(new_vcc)), &wait);
                if (!sigd) {
@@ -433,12 +434,14 @@ int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos)
        DEFINE_WAIT(wait);
 
        set_bit(ATM_VF_WAITING, &vcc->flags);
-       prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
        sigd_enq2(vcc, as_modify, NULL, NULL, &vcc->local, qos, 0);
-       while (test_bit(ATM_VF_WAITING, &vcc->flags) &&
-              !test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) {
-               schedule();
+       for (;;) {
                prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+               if (!test_bit(ATM_VF_WAITING, &vcc->flags) ||
+                   test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd) {
+                       break;
+               }
+               schedule();
        }
        finish_wait(sk_sleep(sk), &wait);
        if (!sigd)
@@ -529,18 +532,18 @@ static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr,
 
        lock_sock(sk);
        set_bit(ATM_VF_WAITING, &vcc->flags);
-       prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
        sigd_enq(vcc, as_addparty, NULL, NULL,
                 (struct sockaddr_atmsvc *) sockaddr);
        if (flags & O_NONBLOCK) {
-               finish_wait(sk_sleep(sk), &wait);
                error = -EINPROGRESS;
                goto out;
        }
        pr_debug("added wait queue\n");
-       while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
-               schedule();
+       for (;;) {
                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+               if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+                       break;
+               schedule();
        }
        finish_wait(sk_sleep(sk), &wait);
        error = xchg(&sk->sk_err_soft, 0);
@@ -558,11 +561,12 @@ static int svc_dropparty(struct socket *sock, int ep_ref)
 
        lock_sock(sk);
        set_bit(ATM_VF_WAITING, &vcc->flags);
-       prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
        sigd_enq2(vcc, as_dropparty, NULL, NULL, NULL, NULL, ep_ref);
-       while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
-               schedule();
+       for (;;) {
                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+               if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+                       break;
+               schedule();
        }
        finish_wait(sk_sleep(sk), &wait);
        if (!sigd) {
index 181b70ebd9641edb54afb158722e95272e4e753d..541f26a67ba28861c882f819369baf2ad2cd4724 100644 (file)
@@ -1187,13 +1187,6 @@ new_segment:
                                if (!skb)
                                        goto wait_for_memory;
 
-                               /*
-                                * All packets are restored as if they have
-                                * already been sent.
-                                */
-                               if (tp->repair)
-                                       TCP_SKB_CB(skb)->when = tcp_time_stamp;
-
                                /*
                                 * Check whether we can use HW checksum.
                                 */
@@ -1203,6 +1196,13 @@ new_segment:
                                skb_entail(sk, skb);
                                copy = size_goal;
                                max = size_goal;
+
+                               /* All packets are restored as if they have
+                                * already been sent. skb_mstamp isn't set to
+                                * avoid wrong rtt estimation.
+                                */
+                               if (tp->repair)
+                                       TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
                        }
 
                        /* Try to append data to the end of skb. */
index a3d47af01906d4436d01703a499530a4825c4a7b..a906e0200ff26144727a1c6c90718cf1fa492dde 100644 (file)
@@ -2687,7 +2687,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
  */
 static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 {
-       struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        bool recovered = !before(tp->snd_una, tp->high_seq);
 
@@ -2713,12 +2712,9 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 
        if (recovered) {
                /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
-               icsk->icsk_retransmits = 0;
                tcp_try_undo_recovery(sk);
                return;
        }
-       if (flag & FLAG_DATA_ACKED)
-               icsk->icsk_retransmits = 0;
        if (tcp_is_reno(tp)) {
                /* A Reno DUPACK means new data in F-RTO step 2.b above are
                 * delivered. Lower inflight to clock out (re)tranmissions.
@@ -3050,10 +3046,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
        first_ackt.v64 = 0;
 
        while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+               struct skb_shared_info *shinfo = skb_shinfo(skb);
                struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
                u8 sacked = scb->sacked;
                u32 acked_pcount;
 
+               if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
+                   between(shinfo->tskey, prior_snd_una, tp->snd_una - 1))
+                       __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+
                /* Determine how many packets and what bytes were acked, tso and else */
                if (after(scb->end_seq, tp->snd_una)) {
                        if (tcp_skb_pcount(skb) == 1 ||
@@ -3107,11 +3108,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                        tp->retrans_stamp = 0;
                }
 
-               if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_ACK_TSTAMP) &&
-                   between(skb_shinfo(skb)->tskey, prior_snd_una,
-                           tp->snd_una + 1))
-                       __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
-
                if (!fully_acked)
                        break;
 
@@ -3405,8 +3401,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                tcp_rearm_rto(sk);
 
-       if (after(ack, prior_snd_una))
+       if (after(ack, prior_snd_una)) {
                flag |= FLAG_SND_UNA_ADVANCED;
+               icsk->icsk_retransmits = 0;
+       }
 
        prior_fackets = tp->fackets_out;
 
@@ -5979,12 +5977,14 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
-               if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
+               if (tcp_death_row.sysctl_tw_recycle) {
                        bool strict;
 
                        dst = af_ops->route_req(sk, &fl, req, &strict);
+
                        if (dst && strict &&
-                           !tcp_peer_is_proven(req, dst, true)) {
+                           !tcp_peer_is_proven(req, dst, true,
+                                               tmp_opt.saw_tstamp)) {
                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                                goto drop_and_release;
                        }
@@ -5993,7 +5993,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
-                        !tcp_peer_is_proven(req, dst, false)) {
+                        !tcp_peer_is_proven(req, dst, false,
+                                            tmp_opt.saw_tstamp)) {
                        /* Without syncookies last quarter of
                         * backlog is filled with destinations,
                         * proven to be alive.
index dceff5fe8e66c10cab62e093a896a03549cac2ed..cd17f009aede03524f741b84a413478b2d38ef7c 100644 (file)
@@ -271,7 +271,7 @@ EXPORT_SYMBOL(tcp_v4_connect);
  * It can be called through tcp_release_cb() if socket was owned by user
  * at the time tcp_v4_err() was called to handle ICMP message.
  */
-static void tcp_v4_mtu_reduced(struct sock *sk)
+void tcp_v4_mtu_reduced(struct sock *sk)
 {
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);
@@ -302,6 +302,7 @@ static void tcp_v4_mtu_reduced(struct sock *sk)
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
 }
+EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 
 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 {
@@ -1787,6 +1788,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
        .compat_setsockopt = compat_ip_setsockopt,
        .compat_getsockopt = compat_ip_getsockopt,
 #endif
+       .mtu_reduced       = tcp_v4_mtu_reduced,
 };
 EXPORT_SYMBOL(ipv4_specific);
 
@@ -2406,7 +2408,6 @@ struct proto tcp_prot = {
        .sendpage               = tcp_sendpage,
        .backlog_rcv            = tcp_v4_do_rcv,
        .release_cb             = tcp_release_cb,
-       .mtu_reduced            = tcp_v4_mtu_reduced,
        .hash                   = inet_hash,
        .unhash                 = inet_unhash,
        .get_port               = inet_csk_get_port,
index 0d54e59b9ea8c02835674b47e06c8732295ab10c..ed9c9a91851ced8e719aeec58530fb326ec8a2db 100644 (file)
@@ -576,7 +576,8 @@ reset:
        tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
+                       bool paws_check, bool timestamps)
 {
        struct tcp_metrics_block *tm;
        bool ret;
@@ -589,7 +590,8 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool pa
        if (paws_check) {
                if (tm &&
                    (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
-                   (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
+                   ((s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW ||
+                    !timestamps))
                        ret = false;
                else
                        ret = true;
index 8fcfc91964ecb5226d65ef24974eee70fe225bbc..5a7c41fbc6d3399686ff47ab5102ef7cc1a9ea42 100644 (file)
@@ -800,7 +800,7 @@ void tcp_release_cb(struct sock *sk)
                __sock_put(sk);
        }
        if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
-               sk->sk_prot->mtu_reduced(sk);
+               inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
                __sock_put(sk);
        }
 }
@@ -1069,6 +1069,21 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
        tcp_verify_left_out(tp);
 }
 
+static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
+{
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+       if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+           !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
+               struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
+               u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
+
+               shinfo->tx_flags &= ~tsflags;
+               shinfo2->tx_flags |= tsflags;
+               swap(shinfo->tskey, shinfo2->tskey);
+       }
+}
+
 /* Function to create two new TCP segments.  Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list.  This won't be called frequently, I hope.
@@ -1136,6 +1151,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
         */
        TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
        buff->tstamp = skb->tstamp;
+       tcp_fragment_tstamp(skb, buff);
 
        old_factor = tcp_skb_pcount(skb);
 
@@ -1652,6 +1668,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
        buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
        skb_split(skb, buff, len);
+       tcp_fragment_tstamp(skb, buff);
 
        /* Fix up tso_factor for both original and new SKB.  */
        tcp_set_skb_tso_segs(sk, skb, mss_now);
@@ -1917,8 +1934,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
                BUG_ON(!tso_segs);
 
-               if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE)
+               if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+                       /* "when" is used as a start point for the retransmit timer */
+                       TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        goto repair; /* Skip network transmission */
+               }
 
                cwnd_quota = tcp_cwnd_test(tp, skb);
                if (!cwnd_quota) {
index 2e9ba035fb5f51f33e7a1b13416064e57f8cb855..6163f851dc014ebd205211829214ce4d0bbc332b 100644 (file)
@@ -101,19 +101,19 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net,
        for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) {
                if (local == t->parms.iph.saddr &&
                    remote == t->parms.iph.daddr &&
-                   (!dev || !t->parms.link || dev->iflink == t->parms.link) &&
+                   (!dev || !t->parms.link || dev->ifindex == t->parms.link) &&
                    (t->dev->flags & IFF_UP))
                        return t;
        }
        for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) {
                if (remote == t->parms.iph.daddr &&
-                   (!dev || !t->parms.link || dev->iflink == t->parms.link) &&
+                   (!dev || !t->parms.link || dev->ifindex == t->parms.link) &&
                    (t->dev->flags & IFF_UP))
                        return t;
        }
        for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) {
                if (local == t->parms.iph.saddr &&
-                   (!dev || !t->parms.link || dev->iflink == t->parms.link) &&
+                   (!dev || !t->parms.link || dev->ifindex == t->parms.link) &&
                    (t->dev->flags & IFF_UP))
                        return t;
        }
index f2ce95502392c0e685654df5792614118626f316..29964c3d363c8a0e741b01b9df32fce3cdd7a1ba 100644 (file)
@@ -1595,6 +1595,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
        .compat_setsockopt = compat_ipv6_setsockopt,
        .compat_getsockopt = compat_ipv6_getsockopt,
 #endif
+       .mtu_reduced       = tcp_v6_mtu_reduced,
 };
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -1625,6 +1626,7 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
        .compat_setsockopt = compat_ipv6_setsockopt,
        .compat_getsockopt = compat_ipv6_getsockopt,
 #endif
+       .mtu_reduced       = tcp_v4_mtu_reduced,
 };
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -1864,7 +1866,6 @@ struct proto tcpv6_prot = {
        .sendpage               = tcp_sendpage,
        .backlog_rcv            = tcp_v6_do_rcv,
        .release_cb             = tcp_release_cb,
-       .mtu_reduced            = tcp_v6_mtu_reduced,
        .hash                   = tcp_v6_hash,
        .unhash                 = inet_unhash,
        .get_port               = inet_csk_get_port,
index 9ea0c933b9ff8803c071367bfef0df210090a90f..a37998c6273d163eeb2cbbbce4182aea82bee418 100644 (file)
@@ -622,7 +622,7 @@ void irlap_send_rd_frame(struct irlap_cb *self)
        frame = (struct rd_frame *)skb_put(tx_skb, 2);
 
        frame->caddr = self->caddr;
-       frame->caddr = RD_RSP | PF_BIT;
+       frame->control = RD_RSP | PF_BIT;
 
        irlap_queue_xmit(self, tx_skb);
 }
index 2e152e5f218660e94ce2bed0d45925d9676e9bec..c416725d28c49f8b0c1b10bbf35a28594c646bc5 100644 (file)
@@ -2921,6 +2921,7 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
+       __acquires(RCU)
 {
        rcu_read_lock();
        return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN;
@@ -2970,6 +2971,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void netlink_seq_stop(struct seq_file *seq, void *v)
+       __releases(RCU)
 {
        rcu_read_unlock();
 }
index 702fb21bfe15bd6f3b0deb1678ccaf21d27f1b71..6d8f2ec481d9d33927795cfd3f4a59bb7258f128 100644 (file)
@@ -137,8 +137,10 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
        vport->ops = ops;
        INIT_HLIST_NODE(&vport->dp_hash_node);
 
-       if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids))
+       if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) {
+               kfree(vport);
                return ERR_PTR(-EINVAL);
+       }
 
        vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
        if (!vport->percpu_stats) {
index 122f95c958693c8b784e0b2e3e6409ab23d3e561..8a9a4e1c7eab6757f16daacd736529784881f39e 100644 (file)
@@ -215,11 +215,13 @@ else
 arg-check = $(if $(strip $(cmd_$@)),,1)
 endif
 
-# >'< substitution is for echo to work,
-# >$< substitution to preserve $ when reloading .cmd file
-# note: when using inline perl scripts [perl -e '...$$t=1;...']
-# in $(cmd_xxx) double $$ your perl vars
-make-cmd = $(subst \\,\\\\,$(subst \#,\\\#,$(subst $$,$$$$,$(call escsq,$(cmd_$(1))))))
+# Replace >$< with >$$< to preserve $ when reloading the .cmd file
+# (needed for make)
+# Replace >#< with >\#< to avoid starting a comment in the .cmd file
+# (needed for make)
+# Replace >'< with >'\''< to be able to enclose the whole string in '...'
+# (needed for the shell)
+make-cmd = $(call escsq,$(subst \#,\\\#,$(subst $$,$$$$,$(cmd_$(1)))))
 
 # Find any prerequisites that is newer than target or that does not exist.
 # PHONY targets skipped in both cases.
@@ -230,7 +232,7 @@ any-prereq = $(filter-out $(PHONY),$?) $(filter-out $(PHONY) $(wildcard $^),$^)
 if_changed = $(if $(strip $(any-prereq) $(arg-check)),                       \
        @set -e;                                                             \
        $(echo-cmd) $(cmd_$(1));                                             \
-       echo 'cmd_$@ := $(make-cmd)' > $(dot-target).cmd)
+       printf '%s\n' 'cmd_$@ := $(make-cmd)' > $(dot-target).cmd)
 
 # Execute the command and also postprocess generated .d dependencies file.
 if_changed_dep = $(if $(strip $(any-prereq) $(arg-check) ),                  \
index 686cb0d31c7c956943693dd0d517d878cf99b7a0..a651cee84f2a837ba2212bb82aa15433cb78ac90 100644 (file)
@@ -40,8 +40,8 @@ subdir-ymn    := $(addprefix $(obj)/,$(subdir-ymn))
 # build a list of files to remove, usually relative to the current
 # directory
 
-__clean-files  := $(extra-y) $(always)                  \
-                  $(targets) $(clean-files)             \
+__clean-files  := $(extra-y) $(extra-m) $(extra-)       \
+                  $(always) $(targets) $(clean-files)   \
                   $(host-progs)                         \
                   $(hostprogs-y) $(hostprogs-m) $(hostprogs-)
 
index 65643506c71c7c03c9f6ad82944f955a14109815..f734033af219d267fd67428432feb9bacee0baae 100644 (file)
@@ -26,16 +26,6 @@ warning-1 += $(call cc-option, -Wmissing-include-dirs)
 warning-1 += $(call cc-option, -Wunused-but-set-variable)
 warning-1 += $(call cc-disable-warning, missing-field-initializers)
 
-# Clang
-warning-1 += $(call cc-disable-warning, initializer-overrides)
-warning-1 += $(call cc-disable-warning, unused-value)
-warning-1 += $(call cc-disable-warning, format)
-warning-1 += $(call cc-disable-warning, unknown-warning-option)
-warning-1 += $(call cc-disable-warning, sign-compare)
-warning-1 += $(call cc-disable-warning, format-zero-length)
-warning-1 += $(call cc-disable-warning, uninitialized)
-warning-1 += $(call cc-option, -fcatch-undefined-behavior)
-
 warning-2 := -Waggregate-return
 warning-2 += -Wcast-align
 warning-2 += -Wdisabled-optimization
@@ -64,4 +54,15 @@ ifeq ("$(strip $(warning))","")
 endif
 
 KBUILD_CFLAGS += $(warning)
+else
+
+ifeq ($(COMPILER),clang)
+KBUILD_CFLAGS += $(call cc-disable-warning, initializer-overrides)
+KBUILD_CFLAGS += $(call cc-disable-warning, unused-value)
+KBUILD_CFLAGS += $(call cc-disable-warning, format)
+KBUILD_CFLAGS += $(call cc-disable-warning, unknown-warning-option)
+KBUILD_CFLAGS += $(call cc-disable-warning, sign-compare)
+KBUILD_CFLAGS += $(call cc-disable-warning, format-zero-length)
+KBUILD_CFLAGS += $(call cc-disable-warning, uninitialized)
+endif
 endif
index 66893643fd7d14a6f59bb52a17b0393180e7051d..ab5980f917141e9f90f52a9ae49e71901104196f 100644 (file)
 # Will compile qconf as a C++ program, and menu as a C program.
 # They are linked as C++ code to the executable qconf
 
-# hostprogs-y := conf
-# conf-objs  := conf.o libkconfig.so
-# libkconfig-objs := expr.o type.o
-# Will create a shared library named libkconfig.so that consists of
-# expr.o and type.o (they are both compiled as C code and the object files
-# are made as position independent code).
-# conf.c is compiled as a C program, and conf.o is linked together with
-# libkconfig.so as the executable conf.
-# Note: Shared libraries consisting of C++ files are not supported
-
 __hostprogs := $(sort $(hostprogs-y) $(hostprogs-m))
 
 # C code
 # Executables compiled from a single .c file
-host-csingle   := $(foreach m,$(__hostprogs),$(if $($(m)-objs),,$(m)))
+host-csingle   := $(foreach m,$(__hostprogs), \
+                       $(if $($(m)-objs)$($(m)-cxxobjs),,$(m)))
 
 # C executables linked based on several .o files
 host-cmulti    := $(foreach m,$(__hostprogs),\
@@ -44,33 +35,17 @@ host-cmulti := $(foreach m,$(__hostprogs),\
 host-cobjs     := $(sort $(foreach m,$(__hostprogs),$($(m)-objs)))
 
 # C++ code
-# C++ executables compiled from at least on .cc file
+# C++ executables compiled from at least one .cc file
 # and zero or more .c files
 host-cxxmulti  := $(foreach m,$(__hostprogs),$(if $($(m)-cxxobjs),$(m)))
 
 # C++ Object (.o) files compiled from .cc files
 host-cxxobjs   := $(sort $(foreach m,$(host-cxxmulti),$($(m)-cxxobjs)))
 
-# Shared libaries (only .c supported)
-# Shared libraries (.so) - all .so files referenced in "xxx-objs"
-host-cshlib    := $(sort $(filter %.so, $(host-cobjs)))
-# Remove .so files from "xxx-objs"
-host-cobjs     := $(filter-out %.so,$(host-cobjs))
-
-#Object (.o) files used by the shared libaries
-host-cshobjs   := $(sort $(foreach m,$(host-cshlib),$($(m:.so=-objs))))
-
 # output directory for programs/.o files
-# hostprogs-y := tools/build may have been specified. Retrieve directory
-host-objdirs := $(foreach f,$(__hostprogs), $(if $(dir $(f)),$(dir $(f))))
-# directory of .o files from prog-objs notation
-host-objdirs += $(foreach f,$(host-cmulti),                  \
-                    $(foreach m,$($(f)-objs),                \
-                        $(if $(dir $(m)),$(dir $(m)))))
-# directory of .o files from prog-cxxobjs notation
-host-objdirs += $(foreach f,$(host-cxxmulti),                  \
-                    $(foreach m,$($(f)-cxxobjs),                \
-                        $(if $(dir $(m)),$(dir $(m)))))
+# hostprogs-y := tools/build may have been specified.
+# Retrieve also directory of .o files from prog-objs or prog-cxxobjs notation
+host-objdirs := $(dir $(__hostprogs) $(host-cobjs) $(host-cxxobjs))
 
 host-objdirs := $(strip $(sort $(filter-out ./,$(host-objdirs))))
 
@@ -81,8 +56,6 @@ host-cmulti   := $(addprefix $(obj)/,$(host-cmulti))
 host-cobjs     := $(addprefix $(obj)/,$(host-cobjs))
 host-cxxmulti  := $(addprefix $(obj)/,$(host-cxxmulti))
 host-cxxobjs   := $(addprefix $(obj)/,$(host-cxxobjs))
-host-cshlib    := $(addprefix $(obj)/,$(host-cshlib))
-host-cshobjs   := $(addprefix $(obj)/,$(host-cshobjs))
 host-objdirs    := $(addprefix $(obj)/,$(host-objdirs))
 
 obj-dirs += $(host-objdirs)
@@ -123,7 +96,7 @@ quiet_cmd_host-cmulti        = HOSTLD  $@
       cmd_host-cmulti  = $(HOSTCC) $(HOSTLDFLAGS) -o $@ \
                          $(addprefix $(obj)/,$($(@F)-objs)) \
                          $(HOST_LOADLIBES) $(HOSTLOADLIBES_$(@F))
-$(host-cmulti): $(obj)/%: $(host-cobjs) $(host-cshlib) FORCE
+$(host-cmulti): $(obj)/%: $(host-cobjs) FORCE
        $(call if_changed,host-cmulti)
 
 # Create .o file from a single .c file
@@ -140,7 +113,7 @@ quiet_cmd_host-cxxmulti     = HOSTLD  $@
                          $(foreach o,objs cxxobjs,\
                          $(addprefix $(obj)/,$($(@F)-$(o)))) \
                          $(HOST_LOADLIBES) $(HOSTLOADLIBES_$(@F))
-$(host-cxxmulti): $(obj)/%: $(host-cobjs) $(host-cxxobjs) $(host-cshlib) FORCE
+$(host-cxxmulti): $(obj)/%: $(host-cobjs) $(host-cxxobjs) FORCE
        $(call if_changed,host-cxxmulti)
 
 # Create .o file from a single .cc (C++) file
@@ -149,21 +122,5 @@ quiet_cmd_host-cxxobjs     = HOSTCXX $@
 $(host-cxxobjs): $(obj)/%.o: $(src)/%.cc FORCE
        $(call if_changed_dep,host-cxxobjs)
 
-# Compile .c file, create position independent .o file
-# host-cshobjs -> .o
-quiet_cmd_host-cshobjs = HOSTCC  -fPIC $@
-      cmd_host-cshobjs = $(HOSTCC) $(hostc_flags) -fPIC -c -o $@ $<
-$(host-cshobjs): $(obj)/%.o: $(src)/%.c FORCE
-       $(call if_changed_dep,host-cshobjs)
-
-# Link a shared library, based on position independent .o files
-# *.o -> .so shared library (host-cshlib)
-quiet_cmd_host-cshlib  = HOSTLLD -shared $@
-      cmd_host-cshlib  = $(HOSTCC) $(HOSTLDFLAGS) -shared -o $@ \
-                         $(addprefix $(obj)/,$($(@F:.so=-objs))) \
-                         $(HOST_LOADLIBES) $(HOSTLOADLIBES_$(@F))
-$(host-cshlib): $(obj)/%: $(host-cshobjs) FORCE
-       $(call if_changed,host-cshlib)
-
 targets += $(host-csingle)  $(host-cmulti) $(host-cobjs)\
-          $(host-cxxmulti) $(host-cxxobjs) $(host-cshlib) $(host-cshobjs)
+          $(host-cxxmulti) $(host-cxxobjs)
diff --git a/scripts/coccinelle/api/alloc/alloc_cast.cocci b/scripts/coccinelle/api/alloc/alloc_cast.cocci
new file mode 100644 (file)
index 0000000..6c308ee
--- /dev/null
@@ -0,0 +1,72 @@
+/// Remove casting the values returned by memory allocation functions
+/// like kmalloc, kzalloc, kmem_cache_alloc, kmem_cache_zalloc etc.
+///
+//# This makes an effort to find cases of casting of values returned by
+//# kmalloc, kzalloc, kcalloc, kmem_cache_alloc, kmem_cache_zalloc,
+//# kmem_cache_alloc_node, kmalloc_node and kzalloc_node and removes
+//# the casting as it is not required. The result in the patch case may
+//# need some reformatting.
+//
+// Confidence: High
+// Copyright: 2014, Himangi Saraogi  GPLv2.
+// Comments:
+// Options: --no-includes --include-headers
+//
+
+virtual context
+virtual patch
+virtual org
+virtual report
+
+//----------------------------------------------------------
+//  For context mode
+//----------------------------------------------------------
+
+@depends on context@
+type T;
+@@
+
+* (T *)
+  \(kmalloc\|kzalloc\|kcalloc\|kmem_cache_alloc\|kmem_cache_zalloc\|
+   kmem_cache_alloc_node\|kmalloc_node\|kzalloc_node\)(...)
+
+//----------------------------------------------------------
+//  For patch mode
+//----------------------------------------------------------
+
+@depends on patch@
+type T;
+@@
+
+- (T *)
+  (\(kmalloc\|kzalloc\|kcalloc\|kmem_cache_alloc\|kmem_cache_zalloc\|
+   kmem_cache_alloc_node\|kmalloc_node\|kzalloc_node\)(...))
+
+//----------------------------------------------------------
+//  For org and report mode
+//----------------------------------------------------------
+
+@r depends on org || report@
+type T;
+position p;
+@@
+
+ (T@p *)\(kmalloc\|kzalloc\|kcalloc\|kmem_cache_alloc\|kmem_cache_zalloc\|
+   kmem_cache_alloc_node\|kmalloc_node\|kzalloc_node\)(...)
+
+@script:python depends on org@
+p << r.p;
+t << r.T;
+@@
+
+coccilib.org.print_safe_todo(p[0], t)
+
+@script:python depends on report@
+p << r.p;
+t << r.T;
+@@
+
+msg="WARNING: casting value returned by memory allocation function to (%s *) is useless." % (t)
+coccilib.report.print_report(p[0], msg)
+
+
diff --git a/scripts/coccinelle/misc/array_size.cocci b/scripts/coccinelle/misc/array_size.cocci
new file mode 100644 (file)
index 0000000..81e279c
--- /dev/null
@@ -0,0 +1,87 @@
+/// Use ARRAY_SIZE instead of dividing sizeof array with sizeof an element
+///
+//# This makes an effort to find cases where ARRAY_SIZE can be used such as
+//# where there is a division of sizeof the array by the sizeof its first
+//# element or by any indexed element or the element type. It replaces the
+//# division of the two sizeofs by ARRAY_SIZE.
+//
+// Confidence: High
+// Copyright: (C) 2014 Himangi Saraogi.  GPLv2.
+// Comments:
+// Options: --no-includes --include-headers
+
+virtual patch
+virtual context
+virtual org
+virtual report
+
+@i@
+@@
+
+#include <linux/kernel.h>
+
+//----------------------------------------------------------
+//  For context mode
+//----------------------------------------------------------
+
+@depends on i&&context@
+type T;
+T[] E;
+@@
+(
+* (sizeof(E)/sizeof(*E))
+|
+* (sizeof(E)/sizeof(E[...]))
+|
+* (sizeof(E)/sizeof(T))
+)
+
+//----------------------------------------------------------
+//  For patch mode
+//----------------------------------------------------------
+
+@depends on i&&patch@
+type T;
+T[] E;
+@@
+(
+- (sizeof(E)/sizeof(*E))
++ ARRAY_SIZE(E)
+|
+- (sizeof(E)/sizeof(E[...]))
++ ARRAY_SIZE(E)
+|
+- (sizeof(E)/sizeof(T))
++ ARRAY_SIZE(E)
+)
+
+//----------------------------------------------------------
+//  For org and report mode
+//----------------------------------------------------------
+
+@r@
+type T;
+T[] E;
+position p;
+@@
+(
+ (sizeof(E)@p /sizeof(*E))
+|
+ (sizeof(E)@p /sizeof(E[...]))
+|
+ (sizeof(E)@p /sizeof(T))
+)
+
+@script:python depends on i&&org@
+p << r.p;
+@@
+
+coccilib.org.print_todo(p[0], "WARNING should use ARRAY_SIZE")
+
+@script:python depends on i&&report@
+p << r.p;
+@@
+
+msg="WARNING: Use ARRAY_SIZE"
+coccilib.report.print_report(p[0], msg)
+
diff --git a/scripts/coccinelle/misc/badty.cocci b/scripts/coccinelle/misc/badty.cocci
new file mode 100644 (file)
index 0000000..2fc06fc
--- /dev/null
@@ -0,0 +1,76 @@
+/// Correct the size argument to alloc functions
+///
+//# This makes an effort to find cases where the argument to sizeof is wrong
+//# in memory allocation functions by checking the type of the allocated memory
+//# when it is a double pointer and ensuring the sizeof argument takes a pointer
+//# to the memory being allocated. There are false positives in cases where the
+//# sizeof argument is not used in constructing the return value. The result
+//# may need some reformatting.
+//
+// Confidence: Moderate
+// Copyright: (C) 2014 Himangi Saraogi.  GPLv2.
+// Comments:
+// Options:
+
+virtual patch
+virtual context
+virtual org
+virtual report
+
+//----------------------------------------------------------
+//  For context mode
+//----------------------------------------------------------
+
+@depends on context disable sizeof_type_expr@
+type T;
+T **x;
+@@
+
+  x =
+  <+...sizeof(
+* T
+  )...+>
+
+//----------------------------------------------------------
+//  For patch mode
+//----------------------------------------------------------
+
+@depends on patch disable sizeof_type_expr@
+type T;
+T **x;
+@@
+
+  x =
+  <+...sizeof(
+- T
++ *x
+  )...+>
+
+//----------------------------------------------------------
+//  For org and report mode
+//----------------------------------------------------------
+
+@r disable sizeof_type_expr@
+type T;
+T **x;
+position p;
+@@
+
+  x =
+  <+...sizeof(
+  T@p
+  )...+>
+
+@script:python depends on org@
+p << r.p;
+@@
+
+coccilib.org.print_todo(p[0], "WARNING sizeof argument should be pointer type, not structure type")
+
+@script:python depends on report@
+p << r.p;
+@@
+
+msg="WARNING: Use correct pointer type argument for sizeof"
+coccilib.report.print_report(p[0], msg)
+
similarity index 50%
rename from scripts/coccinelle/api/alloc/drop_kmalloc_cast.cocci
rename to scripts/coccinelle/misc/bugon.cocci
index bd5d08b882ee9ccda1d79da6f596e34f0ec12ef1..556456ca761c0035b314b579c9c38d5b38c3b62c 100644 (file)
@@ -1,20 +1,17 @@
+/// Use BUG_ON instead of an if condition followed by BUG.
 ///
-/// Casting (void *) value returned by kmalloc is useless
-/// as mentioned in Documentation/CodingStyle, Chap 14.
-///
-// Confidence: High
-// Copyright: 2009,2010 Nicolas Palix, DIKU.  GPLv2.
-// URL: http://coccinelle.lip6.fr/
-// Options: --no-includes --include-headers
-//
-// Keywords: kmalloc, kzalloc, kcalloc
-// Version min: < 2.6.12 kmalloc
-// Version min: < 2.6.12 kcalloc
-// Version min:   2.6.14 kzalloc
+//# This makes an effort to find cases where BUG() follows an if
+//# condition on an expression and replaces the if condition and BUG()
+//# with a BUG_ON having the conditional expression of the if statement
+//# as argument.
 //
+// Confidence: High
+// Copyright: (C) 2014 Himangi Saraogi.  GPLv2.
+// Comments:
+// Options: --no-includes --include-headers
 
-virtual context
 virtual patch
+virtual context
 virtual org
 virtual report
 
@@ -23,45 +20,43 @@ virtual report
 //----------------------------------------------------------
 
 @depends on context@
-type T;
+expression e;
 @@
 
-* (T *)
-  \(kmalloc\|kzalloc\|kcalloc\)(...)
+*if (e) BUG();
 
 //----------------------------------------------------------
 //  For patch mode
 //----------------------------------------------------------
 
 @depends on patch@
-type T;
+expression e;
 @@
 
-- (T *)
-  \(kmalloc\|kzalloc\|kcalloc\)(...)
+-if (e) BUG();
++BUG_ON(e);
 
 //----------------------------------------------------------
 //  For org and report mode
 //----------------------------------------------------------
 
-@r depends on org || report@
-type T;
+@r@
+expression e;
 position p;
 @@
 
- (T@p *)\(kmalloc\|kzalloc\|kcalloc\)(...)
+ if (e) BUG@p ();
 
 @script:python depends on org@
 p << r.p;
-t << r.T;
 @@
 
-coccilib.org.print_safe_todo(p[0], t)
+coccilib.org.print_todo(p[0], "WARNING use BUG_ON")
 
 @script:python depends on report@
 p << r.p;
-t << r.T;
 @@
 
-msg="WARNING: casting value returned by k[cmz]alloc to (%s *) is useless." % (t)
+msg="WARNING: Use BUG_ON"
 coccilib.report.print_report(p[0], msg)
+
index d79baf7220e778a430c8485bfc8f18b6d9587d19..5551da2b4fe32baaea72e0ec84c1a5b024756535 100644 (file)
@@ -10,7 +10,7 @@
 // Copyright: (C) 2012 Julia Lawall, INRIA/LIP6.  GPLv2.
 // Copyright: (C) 2012 Gilles Muller, INRIA/LiP6.  GPLv2.
 // URL: http://coccinelle.lip6.fr/
-// Comments:
+// Comments: Requires Coccinelle version 1.0.0-rc20 or later
 // Options:
 
 virtual patch
@@ -19,6 +19,7 @@ virtual org
 virtual report
 
 @initialize:ocaml@
+@@
 let negtable = Hashtbl.create 101
 
 @depends on patch@
index 7a43c0c38316c7879815cf42190cd64f51376373..8a431bcb056cecc5414ba30e93c238fce7288df9 100644 (file)
@@ -992,9 +992,9 @@ static int snd_pmac_detect(struct snd_pmac *chip)
                return -ENODEV;
 
        if (!sound) {
-               sound = of_find_node_by_name(NULL, "sound");
-               while (sound && sound->parent != chip->node)
-                       sound = of_find_node_by_name(sound, "sound");
+               for_each_node_by_name(sound, "sound")
+                       if (sound->parent == chip->node)
+                               break;
        }
        if (! sound) {
                of_node_put(chip->node);